diff options
author | agri <agri@yandex-team.ru> | 2022-02-10 16:48:12 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:48:12 +0300 |
commit | 2909866fbc652492b7d7cab3023cb19489dc4fd8 (patch) | |
tree | b222e5ac2e2e98872661c51ccceee5da0d291e13 /library/cpp/sse | |
parent | d3530b2692e400bd4d29bd4f07cafaee139164e7 (diff) | |
download | ydb-2909866fbc652492b7d7cab3023cb19489dc4fd8.tar.gz |
Restoring authorship annotation for <agri@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/sse')
-rw-r--r-- | library/cpp/sse/sse.h | 28 | ||||
-rw-r--r-- | library/cpp/sse/sse2neon.h | 1122 | ||||
-rw-r--r-- | library/cpp/sse/ut/test.cpp | 2290 | ||||
-rw-r--r-- | library/cpp/sse/ut/ya.make | 12 |
4 files changed, 1726 insertions, 1726 deletions
diff --git a/library/cpp/sse/sse.h b/library/cpp/sse/sse.h index 918a942803..19bac17de0 100644 --- a/library/cpp/sse/sse.h +++ b/library/cpp/sse/sse.h @@ -1,18 +1,18 @@ -#pragma once - -/* - The header chooses appropriate SSE support. - On Intel: SSE intrinsics - On ARM64: translation to NEON intrinsics or software emulation +#pragma once + +/* + The header chooses appropriate SSE support. + On Intel: SSE intrinsics + On ARM64: translation to NEON intrinsics or software emulation On PowerPc: translation to Altivec intrinsics or software emulation -*/ +*/ /* Author: Vitaliy Manushkin <agri@yandex-team.ru>, Danila Kutenin <danlark@yandex-team.ru> */ - -#include <util/system/platform.h> + +#include <util/system/platform.h> #if (defined(_i386_) || defined(_x86_64_)) && defined(_sse_) -#include <xmmintrin.h> -#include <emmintrin.h> +#include <xmmintrin.h> +#include <emmintrin.h> #include <pmmintrin.h> #define ARCADIA_SSE #if defined(_ssse3_) @@ -24,10 +24,10 @@ #if defined(_sse4_2_) #include <nmmintrin.h> #endif -#elif defined(_arm64_) -#include "sse2neon.h" +#elif defined(_arm64_) +#include "sse2neon.h" #define ARCADIA_SSE #elif defined(_ppc64_) #include "powerpc.h" #define ARCADIA_SSE -#endif +#endif diff --git a/library/cpp/sse/sse2neon.h b/library/cpp/sse/sse2neon.h index af7f3ed242..695dbd3041 100644 --- a/library/cpp/sse/sse2neon.h +++ b/library/cpp/sse/sse2neon.h @@ -1,60 +1,60 @@ -#pragma once - -/* - The header contains inlining code - which translates SSE intrinsics to NEON intrinsics or software emulation. - You are encouraged for commitments. - Add missing intrinsics, add unittests, purify the implementation, - merge and simplify templates. - Warning: The code is made in deep nights, so it surely contains bugs, - imperfections, flaws and all other kinds of errors and mistakes. -*/ -/* Author: Vitaliy Manushkin <agri@yandex-team.ru> */ - -#include <util/system/platform.h> +#pragma once + +/* + The header contains inlining code + which translates SSE intrinsics to NEON intrinsics or software emulation. + You are encouraged for commitments. + Add missing intrinsics, add unittests, purify the implementation, + merge and simplify templates. + Warning: The code is made in deep nights, so it surely contains bugs, + imperfections, flaws and all other kinds of errors and mistakes. +*/ +/* Author: Vitaliy Manushkin <agri@yandex-team.ru> */ + +#include <util/system/platform.h> #include <util/system/compiler.h> -#include <util/system/types.h> - -#if !defined(_arm64_) -#error "This header is for ARM64 (aarch64) platform only. " \ +#include <util/system/types.h> + +#if !defined(_arm64_) +#error "This header is for ARM64 (aarch64) platform only. " \ "Include sse.h instead of including this header directly." -#endif - -#include <arm_neon.h> - -union __m128i { - uint64x2_t AsUi64x2; - int64x2_t AsSi64x2; - - uint32x4_t AsUi32x4; - int32x4_t AsSi32x4; - - uint16x8_t AsUi16x8; - int16x8_t AsSi16x8; - - uint8x16_t AsUi8x16; - int8x16_t AsSi8x16; - - float32x4_t AsFloat32x4; - float64x2_t AsFloat64x2; -}; - -union __m128 { - float32x4_t AsFloat32x4; - float64x2_t AsFloat64x2; +#endif + +#include <arm_neon.h> + +union __m128i { + uint64x2_t AsUi64x2; + int64x2_t AsSi64x2; uint32x4_t AsUi32x4; int32x4_t AsSi32x4; - uint64x2_t AsUi64x2; - int64x2_t AsSi64x2; + uint16x8_t AsUi16x8; + int16x8_t AsSi16x8; - uint8x16_t AsUi8x16; + uint8x16_t AsUi8x16; + int8x16_t AsSi8x16; + + float32x4_t AsFloat32x4; + float64x2_t AsFloat64x2; +}; + +union __m128 { + float32x4_t AsFloat32x4; + float64x2_t AsFloat64x2; + + uint32x4_t AsUi32x4; + int32x4_t AsSi32x4; + + uint64x2_t AsUi64x2; + int64x2_t AsSi64x2; + + uint8x16_t AsUi8x16; int8x16_t AsSi8x16; __m128i As128i; -}; - +}; + typedef float64x2_t __m128d; enum _mm_hint @@ -72,128 +72,128 @@ Y_FORCE_INLINE void _mm_prefetch(const void *p, enum _mm_hint) { __builtin_prefetch(p); } -template <typename TType> -struct TQType; - -template <> -struct TQType<uint8x16_t> { - static inline uint8x16_t& As(__m128i& value) { - return value.AsUi8x16; - } - static inline const uint8x16_t& As(const __m128i& value) { - return value.AsUi8x16; - } -}; - -template <> -struct TQType<int8x16_t> { - static inline int8x16_t& As(__m128i& value) { - return value.AsSi8x16; - } - static inline const int8x16_t& As(const __m128i& value) { - return value.AsSi8x16; - } -}; - -template <> -struct TQType<uint16x8_t> { - static inline uint16x8_t& As(__m128i& value) { - return value.AsUi16x8; - } - static inline const uint16x8_t& As(const __m128i& value) { - return value.AsUi16x8; - } -}; - -template <> -struct TQType<int16x8_t> { - static inline int16x8_t& As(__m128i& value) { - return value.AsSi16x8; - } - static inline const int16x8_t& As(const __m128i& value) { - return value.AsSi16x8; - } -}; - -template <> -struct TQType<uint32x4_t> { - static inline uint32x4_t& As(__m128i& value) { - return value.AsUi32x4; - } - static inline const uint32x4_t& As(const __m128i& value) { - return value.AsUi32x4; - } -}; - -template <> -struct TQType<int32x4_t> { - static inline int32x4_t& As(__m128i& value) { - return value.AsSi32x4; - } - static inline const int32x4_t& As(const __m128i& value) { - return value.AsSi32x4; - } -}; - -template <> -struct TQType<uint64x2_t> { - static inline uint64x2_t& As(__m128i& value) { - return value.AsUi64x2; - } - static inline const uint64x2_t& As(const __m128i& value) { - return value.AsUi64x2; - } - static inline uint64x2_t& As(__m128& value) { - return value.AsUi64x2; - } - static inline const uint64x2_t& As(const __m128& value) { - return value.AsUi64x2; - } -}; - -template <> -struct TQType<int64x2_t> { - static inline int64x2_t& As(__m128i& value) { - return value.AsSi64x2; - } - static inline const int64x2_t& As(const __m128i& value) { - return value.AsSi64x2; - } -}; - -template <typename TValue> -struct TBaseWrapper { - TValue Value; - +template <typename TType> +struct TQType; + +template <> +struct TQType<uint8x16_t> { + static inline uint8x16_t& As(__m128i& value) { + return value.AsUi8x16; + } + static inline const uint8x16_t& As(const __m128i& value) { + return value.AsUi8x16; + } +}; + +template <> +struct TQType<int8x16_t> { + static inline int8x16_t& As(__m128i& value) { + return value.AsSi8x16; + } + static inline const int8x16_t& As(const __m128i& value) { + return value.AsSi8x16; + } +}; + +template <> +struct TQType<uint16x8_t> { + static inline uint16x8_t& As(__m128i& value) { + return value.AsUi16x8; + } + static inline const uint16x8_t& As(const __m128i& value) { + return value.AsUi16x8; + } +}; + +template <> +struct TQType<int16x8_t> { + static inline int16x8_t& As(__m128i& value) { + return value.AsSi16x8; + } + static inline const int16x8_t& As(const __m128i& value) { + return value.AsSi16x8; + } +}; + +template <> +struct TQType<uint32x4_t> { + static inline uint32x4_t& As(__m128i& value) { + return value.AsUi32x4; + } + static inline const uint32x4_t& As(const __m128i& value) { + return value.AsUi32x4; + } +}; + +template <> +struct TQType<int32x4_t> { + static inline int32x4_t& As(__m128i& value) { + return value.AsSi32x4; + } + static inline const int32x4_t& As(const __m128i& value) { + return value.AsSi32x4; + } +}; + +template <> +struct TQType<uint64x2_t> { + static inline uint64x2_t& As(__m128i& value) { + return value.AsUi64x2; + } + static inline const uint64x2_t& As(const __m128i& value) { + return value.AsUi64x2; + } + static inline uint64x2_t& As(__m128& value) { + return value.AsUi64x2; + } + static inline const uint64x2_t& As(const __m128& value) { + return value.AsUi64x2; + } +}; + +template <> +struct TQType<int64x2_t> { + static inline int64x2_t& As(__m128i& value) { + return value.AsSi64x2; + } + static inline const int64x2_t& As(const __m128i& value) { + return value.AsSi64x2; + } +}; + +template <typename TValue> +struct TBaseWrapper { + TValue Value; + Y_FORCE_INLINE - operator TValue&() { - return Value; - } - + operator TValue&() { + return Value; + } + Y_FORCE_INLINE - operator const TValue&() const { - return Value; - } -}; - -template <typename TOp, typename TFunc, TFunc* func, - typename TDup, TDup* dupfunc> -struct TWrapperSingleDup: public TBaseWrapper<__m128i> { + operator const TValue&() const { + return Value; + } +}; + +template <typename TOp, typename TFunc, TFunc* func, + typename TDup, TDup* dupfunc> +struct TWrapperSingleDup: public TBaseWrapper<__m128i> { Y_FORCE_INLINE - TWrapperSingleDup(const __m128i& op, const int shift) { - TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(shift)); - } -}; - -template <typename TOp, typename TFunc, TFunc* func, - typename TDup, TDup* dupfunc> -struct TWrapperSingleNegDup: public TBaseWrapper<__m128i> { + TWrapperSingleDup(const __m128i& op, const int shift) { + TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(shift)); + } +}; + +template <typename TOp, typename TFunc, TFunc* func, + typename TDup, TDup* dupfunc> +struct TWrapperSingleNegDup: public TBaseWrapper<__m128i> { Y_FORCE_INLINE - TWrapperSingleNegDup(const __m128i& op, const int shift) { - TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(-shift)); - } -}; - + TWrapperSingleNegDup(const __m128i& op, const int shift) { + TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(-shift)); + } +}; + inline __m128i _mm_srl_epi16(__m128i a, __m128i count) { __m128i res; res.AsUi16x8 = vshlq_u16(a.AsUi16x8, vdupq_n_s16(-count.AsUi16x8[0])); @@ -225,16 +225,16 @@ inline __m128i _mm_srai_epi32(__m128i a, int count) { return res; } -using _mm_srli_epi16 = - TWrapperSingleNegDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16, - decltype(vdupq_n_s16), vdupq_n_s16>; -using _mm_srli_epi32 = - TWrapperSingleNegDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32, - decltype(vdupq_n_s32), vdupq_n_s32>; -using _mm_srli_epi64 = - TWrapperSingleNegDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64, - decltype(vdupq_n_s64), vdupq_n_s64>; - +using _mm_srli_epi16 = + TWrapperSingleNegDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16, + decltype(vdupq_n_s16), vdupq_n_s16>; +using _mm_srli_epi32 = + TWrapperSingleNegDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32, + decltype(vdupq_n_s32), vdupq_n_s32>; +using _mm_srli_epi64 = + TWrapperSingleNegDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64, + decltype(vdupq_n_s64), vdupq_n_s64>; + inline __m128i _mm_sll_epi16(__m128i a, __m128i count) { __m128i res; @@ -255,57 +255,57 @@ inline __m128i _mm_sll_epi64(__m128i a, __m128i count) { return res; } -using _mm_slli_epi16 = - TWrapperSingleDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16, - decltype(vdupq_n_s16), vdupq_n_s16>; -using _mm_slli_epi32 = - TWrapperSingleDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32, - decltype(vdupq_n_s32), vdupq_n_s32>; -using _mm_slli_epi64 = - TWrapperSingleDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64, - decltype(vdupq_n_s64), vdupq_n_s64>; - -template <typename TOp, typename TFunc, TFunc* func, typename... TParams> -struct TWrapperDual : TBaseWrapper<__m128i> { +using _mm_slli_epi16 = + TWrapperSingleDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16, + decltype(vdupq_n_s16), vdupq_n_s16>; +using _mm_slli_epi32 = + TWrapperSingleDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32, + decltype(vdupq_n_s32), vdupq_n_s32>; +using _mm_slli_epi64 = + TWrapperSingleDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64, + decltype(vdupq_n_s64), vdupq_n_s64>; + +template <typename TOp, typename TFunc, TFunc* func, typename... TParams> +struct TWrapperDual : TBaseWrapper<__m128i> { Y_FORCE_INLINE - TWrapperDual(const __m128i& op1, const __m128i& op2, TParams... params) { - TQType<TOp>::As(Value) = (TOp) - func(TQType<TOp>::As(op1), - TQType<TOp>::As(op2), - params...); - } -}; - -template <typename TOp, typename TFunc, TFunc* func, typename... TParams> -struct TWrapperDualSwap : TBaseWrapper<__m128i> { + TWrapperDual(const __m128i& op1, const __m128i& op2, TParams... params) { + TQType<TOp>::As(Value) = (TOp) + func(TQType<TOp>::As(op1), + TQType<TOp>::As(op2), + params...); + } +}; + +template <typename TOp, typename TFunc, TFunc* func, typename... TParams> +struct TWrapperDualSwap : TBaseWrapper<__m128i> { Y_FORCE_INLINE - TWrapperDualSwap(const __m128i& op1, const __m128i& op2, TParams... params) { - TQType<TOp>::As(Value) = - func(TQType<TOp>::As(op2), - TQType<TOp>::As(op1), - params...); - } -}; - + TWrapperDualSwap(const __m128i& op1, const __m128i& op2, TParams... params) { + TQType<TOp>::As(Value) = + func(TQType<TOp>::As(op2), + TQType<TOp>::As(op1), + params...); + } +}; + template <typename TOp, typename TFunc, TFunc* func, typename TArgument = __m128> struct TWrapperDualF : TBaseWrapper<TArgument> { Y_FORCE_INLINE TWrapperDualF(const TArgument& op1, const TArgument& op2) { TQType<TOp>::As(TBaseWrapper<TArgument>::Value) = (TOp) func(TQType<TOp>::As(op1), TQType<TOp>::As(op2)); - } -}; - -using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>; -using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>; -using _mm_andnot_si128 = - TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>; + } +}; + +using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>; +using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>; +using _mm_andnot_si128 = + TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>; using _mm_xor_si128 = TWrapperDual<uint64x2_t, decltype(veorq_u64), veorq_u64>; - + using _mm_add_epi8 = TWrapperDual<uint8x16_t, decltype(vaddq_u8), vaddq_u8>; -using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>; -using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>; -using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>; - +using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>; +using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>; +using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>; + inline __m128i _mm_madd_epi16(__m128i a, __m128i b) { int32x4_t aLow; int32x4_t aHigh; @@ -343,118 +343,118 @@ inline __m128i _mm_madd_epi16(__m128i a, __m128i b) { } using _mm_sub_epi8 = TWrapperDual<uint8x16_t, decltype(vsubq_u8), vsubq_u8>; -using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>; -using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>; -using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>; - -using _mm_unpacklo_epi8 = - TWrapperDual<uint8x16_t, decltype(vzip1q_u8), vzip1q_u8>; -using _mm_unpackhi_epi8 = - TWrapperDual<uint8x16_t, decltype(vzip2q_u8), vzip2q_u8>; -using _mm_unpacklo_epi16 = - TWrapperDual<uint16x8_t, decltype(vzip1q_u16), vzip1q_u16>; -using _mm_unpackhi_epi16 = - TWrapperDual<uint16x8_t, decltype(vzip2q_u16), vzip2q_u16>; -using _mm_unpacklo_epi32 = - TWrapperDual<uint32x4_t, decltype(vzip1q_u32), vzip1q_u32>; -using _mm_unpackhi_epi32 = - TWrapperDual<uint32x4_t, decltype(vzip2q_u32), vzip2q_u32>; -using _mm_unpacklo_epi64 = - TWrapperDual<uint64x2_t, decltype(vzip1q_u64), vzip1q_u64>; -using _mm_unpackhi_epi64 = - TWrapperDual<uint64x2_t, decltype(vzip2q_u64), vzip2q_u64>; - -using _mm_cmpeq_epi8 = - TWrapperDual<uint8x16_t, decltype(vceqq_u8), vceqq_u8>; -using _mm_cmpeq_epi16 = - TWrapperDual<uint16x8_t, decltype(vceqq_u16), vceqq_u16>; -using _mm_cmpeq_epi32 = - TWrapperDual<uint32x4_t, decltype(vceqq_u32), vceqq_u32>; - -using _mm_cmpgt_epi8 = - TWrapperDual<int8x16_t, decltype(vcgtq_s8), vcgtq_s8>; -using _mm_cmpgt_epi16 = - TWrapperDual<int16x8_t, decltype(vcgtq_s16), vcgtq_s16>; -using _mm_cmpgt_epi32 = - TWrapperDual<int32x4_t, decltype(vcgtq_s32), vcgtq_s32>; - -using _mm_cmplt_epi8 = - TWrapperDual<int8x16_t, decltype(vcltq_s8), vcltq_s8>; -using _mm_cmplt_epi16 = - TWrapperDual<int16x8_t, decltype(vcltq_s16), vcltq_s16>; -using _mm_cmplt_epi32 = - TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>; - +using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>; +using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>; +using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>; + +using _mm_unpacklo_epi8 = + TWrapperDual<uint8x16_t, decltype(vzip1q_u8), vzip1q_u8>; +using _mm_unpackhi_epi8 = + TWrapperDual<uint8x16_t, decltype(vzip2q_u8), vzip2q_u8>; +using _mm_unpacklo_epi16 = + TWrapperDual<uint16x8_t, decltype(vzip1q_u16), vzip1q_u16>; +using _mm_unpackhi_epi16 = + TWrapperDual<uint16x8_t, decltype(vzip2q_u16), vzip2q_u16>; +using _mm_unpacklo_epi32 = + TWrapperDual<uint32x4_t, decltype(vzip1q_u32), vzip1q_u32>; +using _mm_unpackhi_epi32 = + TWrapperDual<uint32x4_t, decltype(vzip2q_u32), vzip2q_u32>; +using _mm_unpacklo_epi64 = + TWrapperDual<uint64x2_t, decltype(vzip1q_u64), vzip1q_u64>; +using _mm_unpackhi_epi64 = + TWrapperDual<uint64x2_t, decltype(vzip2q_u64), vzip2q_u64>; + +using _mm_cmpeq_epi8 = + TWrapperDual<uint8x16_t, decltype(vceqq_u8), vceqq_u8>; +using _mm_cmpeq_epi16 = + TWrapperDual<uint16x8_t, decltype(vceqq_u16), vceqq_u16>; +using _mm_cmpeq_epi32 = + TWrapperDual<uint32x4_t, decltype(vceqq_u32), vceqq_u32>; + +using _mm_cmpgt_epi8 = + TWrapperDual<int8x16_t, decltype(vcgtq_s8), vcgtq_s8>; +using _mm_cmpgt_epi16 = + TWrapperDual<int16x8_t, decltype(vcgtq_s16), vcgtq_s16>; +using _mm_cmpgt_epi32 = + TWrapperDual<int32x4_t, decltype(vcgtq_s32), vcgtq_s32>; + +using _mm_cmplt_epi8 = + TWrapperDual<int8x16_t, decltype(vcltq_s8), vcltq_s8>; +using _mm_cmplt_epi16 = + TWrapperDual<int16x8_t, decltype(vcltq_s16), vcltq_s16>; +using _mm_cmplt_epi32 = + TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>; + Y_FORCE_INLINE __m128i _mm_load_si128(const __m128i* ptr) { - __m128i result; + __m128i result; result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr); - return result; -} - + return result; +} + Y_FORCE_INLINE __m128i _mm_loadu_si128(const __m128i* ptr) { - __m128i result; + __m128i result; result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr); - return result; -} - + return result; +} + Y_FORCE_INLINE __m128i _mm_lddqu_si128(const __m128i* ptr) { return _mm_loadu_si128(ptr); } Y_FORCE_INLINE void _mm_storeu_si128(__m128i* ptr, const __m128i& op) { vst1q_u64((uint64_t*)ptr, op.AsUi64x2); -} - +} + Y_FORCE_INLINE void -_mm_store_si128(__m128i* ptr, const __m128i& op) { +_mm_store_si128(__m128i* ptr, const __m128i& op) { vst1q_u64((uint64_t*)ptr, op.AsUi64x2); -} - -template <typename TOp, typename TFunc, TFunc* func, typename... TParams> -struct TWrapperSimple : TBaseWrapper<__m128i> { +} + +template <typename TOp, typename TFunc, TFunc* func, typename... TParams> +struct TWrapperSimple : TBaseWrapper<__m128i> { Y_FORCE_INLINE - TWrapperSimple(TParams... params) { - TQType<TOp>::As(Value) = func(params...); - } -}; - -template <typename TOp, typename TFunc, TFunc* func, typename... TParams> -struct TWrapperSimpleF : TBaseWrapper<__m128> { + TWrapperSimple(TParams... params) { + TQType<TOp>::As(Value) = func(params...); + } +}; + +template <typename TOp, typename TFunc, TFunc* func, typename... TParams> +struct TWrapperSimpleF : TBaseWrapper<__m128> { Y_FORCE_INLINE - TWrapperSimpleF(TParams... params) { - TQType<TOp>::As(Value) = func(params...); - } -}; - -using _mm_set1_epi8 = - TWrapperSimple<int8x16_t, decltype(vdupq_n_s8), vdupq_n_s8, const char>; -using _mm_set1_epi16 = - TWrapperSimple<int16x8_t, decltype(vdupq_n_s16), vdupq_n_s16, const ui16>; -using _mm_set1_epi32 = - TWrapperSimple<int32x4_t, decltype(vdupq_n_s32), vdupq_n_s32, const ui32>; - -struct _mm_setzero_si128 : TBaseWrapper<__m128i> { + TWrapperSimpleF(TParams... params) { + TQType<TOp>::As(Value) = func(params...); + } +}; + +using _mm_set1_epi8 = + TWrapperSimple<int8x16_t, decltype(vdupq_n_s8), vdupq_n_s8, const char>; +using _mm_set1_epi16 = + TWrapperSimple<int16x8_t, decltype(vdupq_n_s16), vdupq_n_s16, const ui16>; +using _mm_set1_epi32 = + TWrapperSimple<int32x4_t, decltype(vdupq_n_s32), vdupq_n_s32, const ui32>; + +struct _mm_setzero_si128 : TBaseWrapper<__m128i> { Y_FORCE_INLINE - _mm_setzero_si128() { - TQType<uint64x2_t>::As(Value) = vdupq_n_u64(0); - } -}; - -struct _mm_loadl_epi64 : TBaseWrapper<__m128i> { + _mm_setzero_si128() { + TQType<uint64x2_t>::As(Value) = vdupq_n_u64(0); + } +}; + +struct _mm_loadl_epi64 : TBaseWrapper<__m128i> { Y_FORCE_INLINE - _mm_loadl_epi64(const __m128i* p) { + _mm_loadl_epi64(const __m128i* p) { uint64x1_t im = vld1_u64((const uint64_t*)p); - TQType<uint64x2_t>::As(Value) = vcombine_u64(im, vdup_n_u64(0)); - } -}; - -struct _mm_storel_epi64 : TBaseWrapper<__m128i> { + TQType<uint64x2_t>::As(Value) = vcombine_u64(im, vdup_n_u64(0)); + } +}; + +struct _mm_storel_epi64 : TBaseWrapper<__m128i> { Y_FORCE_INLINE - _mm_storel_epi64(__m128i* a, __m128i op) { + _mm_storel_epi64(__m128i* a, __m128i op) { vst1_u64((uint64_t*)a, vget_low_u64(op.AsUi64x2)); - } -}; - + } +}; + struct ShuffleStruct4 { ui8 x[4]; }; @@ -470,45 +470,45 @@ _MM_SHUFFLE(ui8 x4, ui8 x3, ui8 x2, ui8 x1) { } Y_FORCE_INLINE __m128i -_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) { - __m128i result; +_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) { + __m128i result; const ui8 xi[4] = { ui8(op2.x[0] * 4), ui8(op2.x[1] * 4), ui8(op2.x[2] * 4), ui8(op2.x[3] * 4) }; const uint8x16_t transform = { - ui8(xi[0]), ui8(xi[0] + 1), ui8(xi[0] + 2), ui8(xi[0] + 3), - ui8(xi[1]), ui8(xi[1] + 1), ui8(xi[1] + 2), ui8(xi[1] + 3), - ui8(xi[2]), ui8(xi[2] + 1), ui8(xi[2] + 2), ui8(xi[2] + 3), + ui8(xi[0]), ui8(xi[0] + 1), ui8(xi[0] + 2), ui8(xi[0] + 3), + ui8(xi[1]), ui8(xi[1] + 1), ui8(xi[1] + 2), ui8(xi[1] + 3), + ui8(xi[2]), ui8(xi[2] + 1), ui8(xi[2] + 2), ui8(xi[2] + 3), ui8(xi[3]), ui8(xi[3] + 1), ui8(xi[3] + 2), ui8(xi[3] + 3) }; - result.AsUi8x16 = vqtbl1q_u8(op1.AsUi8x16, transform); - return result; -} - + result.AsUi8x16 = vqtbl1q_u8(op1.AsUi8x16, transform); + return result; +} + Y_FORCE_INLINE int -_mm_movemask_epi8(const __m128i& op) { - uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - uint8x16_t opmasked = vandq_u8(op.AsUi8x16, mask); - int8x16_t byteshifter = { - 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7}; - uint8x16_t opshifted = vshlq_u8(opmasked, byteshifter); - int16x8_t wordshifter = {-7, -5, -3, -1, 1, 3, 5, 7}; - uint16x8_t wordshifted = - vshlq_u16(vreinterpretq_u16_u8(opshifted), wordshifter); - return vaddvq_u16(wordshifted); -} - -template <int imm> -struct THelper_mm_srli_si128 : TBaseWrapper<__m128i> { +_mm_movemask_epi8(const __m128i& op) { + uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + uint8x16_t opmasked = vandq_u8(op.AsUi8x16, mask); + int8x16_t byteshifter = { + 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7}; + uint8x16_t opshifted = vshlq_u8(opmasked, byteshifter); + int16x8_t wordshifter = {-7, -5, -3, -1, 1, 3, 5, 7}; + uint16x8_t wordshifted = + vshlq_u16(vreinterpretq_u16_u8(opshifted), wordshifter); + return vaddvq_u16(wordshifted); +} + +template <int imm> +struct THelper_mm_srli_si128 : TBaseWrapper<__m128i> { Y_FORCE_INLINE THelper_mm_srli_si128(const __m128i a) { const auto zero = vdupq_n_u8(0); - TQType<uint8x16_t>::As(Value) = vextq_u8(a.AsUi8x16, zero, imm); - } -}; - + TQType<uint8x16_t>::As(Value) = vextq_u8(a.AsUi8x16, zero, imm); + } +}; + template <> struct THelper_mm_srli_si128<16> : TBaseWrapper<__m128i> { Y_FORCE_INLINE @@ -518,8 +518,8 @@ struct THelper_mm_srli_si128<16> : TBaseWrapper<__m128i> { } }; -#define _mm_srli_si128(a, imm) THelper_mm_srli_si128<imm>(a) - +#define _mm_srli_si128(a, imm) THelper_mm_srli_si128<imm>(a) + template<int imm> inline uint8x16_t vextq_u8_function(uint8x16_t a, uint8x16_t b) { return vextq_u8(a, b, imm); @@ -531,33 +531,33 @@ inline uint8x16_t vextq_u8_function<16>(uint8x16_t /* a */, uint8x16_t b) { } -template <int imm> -struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> { +template <int imm> +struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> { Y_FORCE_INLINE THelper_mm_slli_si128(const __m128i a) { - auto zero = vdupq_n_u8(0); + auto zero = vdupq_n_u8(0); TQType<uint8x16_t>::As(Value) = vextq_u8_function<16 - imm>(zero, a.AsUi8x16); - } -}; - -#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a) - + } +}; + +#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a) + Y_FORCE_INLINE int _mm_cvtsi128_si32(const __m128i& op) { - return vgetq_lane_s32(op.AsSi32x4, 0); -} - -struct _mm_set_epi16 : TBaseWrapper<__m128i> { + return vgetq_lane_s32(op.AsSi32x4, 0); +} + +struct _mm_set_epi16 : TBaseWrapper<__m128i> { Y_FORCE_INLINE - _mm_set_epi16(const short w7, const short w6, - const short w5, const short w4, - const short w3, const short w2, - const short w1, const short w0) { - int16x4_t d0 = {w0, w1, w2, w3}; - int16x4_t d1 = {w4, w5, w6, w7}; - TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1); - } -}; - + _mm_set_epi16(const short w7, const short w6, + const short w5, const short w4, + const short w3, const short w2, + const short w1, const short w0) { + int16x4_t d0 = {w0, w1, w2, w3}; + int16x4_t d1 = {w4, w5, w6, w7}; + TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1); + } +}; + struct _mm_setr_epi16 : TBaseWrapper<__m128i> { Y_FORCE_INLINE _mm_setr_epi16(const short w7, const short w6, @@ -570,16 +570,16 @@ struct _mm_setr_epi16 : TBaseWrapper<__m128i> { } }; -struct _mm_set_epi32 : TBaseWrapper<__m128i> { +struct _mm_set_epi32 : TBaseWrapper<__m128i> { Y_FORCE_INLINE - _mm_set_epi32(const int x3, const int x2, - const int x1, const int x0) { - int32x2_t d0 = {x0, x1}; - int32x2_t d1 = {x2, x3}; - TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1); - } -}; - + _mm_set_epi32(const int x3, const int x2, + const int x1, const int x0) { + int32x2_t d0 = {x0, x1}; + int32x2_t d1 = {x2, x3}; + TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1); + } +}; + struct _mm_setr_epi32 : TBaseWrapper<__m128i> { Y_FORCE_INLINE _mm_setr_epi32(const int x3, const int x2, @@ -590,14 +590,14 @@ struct _mm_setr_epi32 : TBaseWrapper<__m128i> { } }; -struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> { +struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> { Y_FORCE_INLINE - _mm_cvtsi32_si128(int op) { - auto zero = vdupq_n_s32(0); - TQType<int32x4_t>::As(Value) = vsetq_lane_s32(op, zero, 0); - } -}; - + _mm_cvtsi32_si128(int op) { + auto zero = vdupq_n_s32(0); + TQType<int32x4_t>::As(Value) = vsetq_lane_s32(op, zero, 0); + } +}; + struct _mm_cvtsi64_si128 : TBaseWrapper<__m128i> { Y_FORCE_INLINE _mm_cvtsi64_si128(i64 op) { @@ -606,41 +606,41 @@ struct _mm_cvtsi64_si128 : TBaseWrapper<__m128i> { } }; -template <typename TOpOut, typename TOpIn, - typename TFunc, TFunc* func, - typename TCombine, TCombine* combine> -struct TCombineWrapper : TBaseWrapper<__m128i> { +template <typename TOpOut, typename TOpIn, + typename TFunc, TFunc* func, + typename TCombine, TCombine* combine> +struct TCombineWrapper : TBaseWrapper<__m128i> { Y_FORCE_INLINE - TCombineWrapper(const __m128i op1, const __m128i op2) { - TQType<TOpOut>::As(Value) = - combine(func(TQType<TOpIn>::As(op1)), - func(TQType<TOpIn>::As(op2))); - } -}; - -using _mm_packs_epi16 = - TCombineWrapper<int8x16_t, int16x8_t, - decltype(vqmovn_s16), vqmovn_s16, - decltype(vcombine_s8), vcombine_s8>; -using _mm_packs_epi32 = - TCombineWrapper<int16x8_t, int32x4_t, - decltype(vqmovn_s32), vqmovn_s32, - decltype(vcombine_s16), vcombine_s16>; -using _mm_packus_epi16 = - TCombineWrapper<uint8x16_t, int16x8_t, - decltype(vqmovun_s16), vqmovun_s16, - decltype(vcombine_u8), vcombine_u8>; - -template <typename TOpOut, typename TOpIn, - typename TFunc, TFunc* func, typename... TParams> -struct TScalarOutWrapper : TBaseWrapper<TOpOut> { + TCombineWrapper(const __m128i op1, const __m128i op2) { + TQType<TOpOut>::As(Value) = + combine(func(TQType<TOpIn>::As(op1)), + func(TQType<TOpIn>::As(op2))); + } +}; + +using _mm_packs_epi16 = + TCombineWrapper<int8x16_t, int16x8_t, + decltype(vqmovn_s16), vqmovn_s16, + decltype(vcombine_s8), vcombine_s8>; +using _mm_packs_epi32 = + TCombineWrapper<int16x8_t, int32x4_t, + decltype(vqmovn_s32), vqmovn_s32, + decltype(vcombine_s16), vcombine_s16>; +using _mm_packus_epi16 = + TCombineWrapper<uint8x16_t, int16x8_t, + decltype(vqmovun_s16), vqmovun_s16, + decltype(vcombine_u8), vcombine_u8>; + +template <typename TOpOut, typename TOpIn, + typename TFunc, TFunc* func, typename... TParams> +struct TScalarOutWrapper : TBaseWrapper<TOpOut> { Y_FORCE_INLINE - TScalarOutWrapper(const __m128i op, TParams... params) { - TBaseWrapper<TOpOut>::Value = - func(TQType<TOpIn>::As(op), params...); - } -}; - + TScalarOutWrapper(const __m128i op, TParams... params) { + TBaseWrapper<TOpOut>::Value = + func(TQType<TOpIn>::As(op), params...); + } +}; + template<int imm> int extract_epi8_arm(__m128i arg) { return vgetq_lane_u8(arg.AsUi8x16, imm); @@ -649,13 +649,13 @@ int extract_epi8_arm(__m128i arg) { template<int imm> int extract_epi16_arm(__m128i arg) { return vgetq_lane_u16(arg.AsUi16x8, imm); -} - +} + template<int imm> int extract_epi32_arm(__m128i arg) { return vgetq_lane_s32(arg.AsSi32x4, imm); } - + template<int imm> long long extract_epi64_arm(__m128i arg) { return vgetq_lane_s64(arg.AsSi64x2, imm); @@ -669,49 +669,49 @@ long long extract_epi64_arm(__m128i arg) { static Y_FORCE_INLINE __m128i _mm_mul_epu32(__m128i op1, __m128i op2) { - __m128i result; - uint32x4_t r1 = vuzp1q_u32(op1.AsUi32x4, op2.AsUi32x4); - uint32x4_t r2 = vuzp1q_u32(op2.AsUi32x4, op1.AsUi32x4); - result.AsUi64x2 = vmull_u32(vget_low_u32(r1), vget_low_u32(r2)); - return result; -} - -template <> -struct TQType<float32x4_t> { - static inline float32x4_t& As(__m128& value) { - return value.AsFloat32x4; - } - - static inline const float32x4_t& As(const __m128& value) { - return value.AsFloat32x4; - } - - static inline float32x4_t& As(__m128i& value) { - return value.AsFloat32x4; - } - - static inline const float32x4_t& As(const __m128i& value) { - return value.AsFloat32x4; - } -}; - -template <> -struct TQType<float64x2_t> { - static inline float64x2_t& As(__m128& value) { - return value.AsFloat64x2; - } - - static inline const float64x2_t& As(const __m128& value) { - return value.AsFloat64x2; - } - - static inline float64x2_t& As(__m128i& value) { - return value.AsFloat64x2; - } - - static inline const float64x2_t& As(const __m128i& value) { - return value.AsFloat64x2; - } + __m128i result; + uint32x4_t r1 = vuzp1q_u32(op1.AsUi32x4, op2.AsUi32x4); + uint32x4_t r2 = vuzp1q_u32(op2.AsUi32x4, op1.AsUi32x4); + result.AsUi64x2 = vmull_u32(vget_low_u32(r1), vget_low_u32(r2)); + return result; +} + +template <> +struct TQType<float32x4_t> { + static inline float32x4_t& As(__m128& value) { + return value.AsFloat32x4; + } + + static inline const float32x4_t& As(const __m128& value) { + return value.AsFloat32x4; + } + + static inline float32x4_t& As(__m128i& value) { + return value.AsFloat32x4; + } + + static inline const float32x4_t& As(const __m128i& value) { + return value.AsFloat32x4; + } +}; + +template <> +struct TQType<float64x2_t> { + static inline float64x2_t& As(__m128& value) { + return value.AsFloat64x2; + } + + static inline const float64x2_t& As(const __m128& value) { + return value.AsFloat64x2; + } + + static inline float64x2_t& As(__m128i& value) { + return value.AsFloat64x2; + } + + static inline const float64x2_t& As(const __m128i& value) { + return value.AsFloat64x2; + } static inline float64x2_t& As(__m128d& value) { return value; @@ -720,30 +720,30 @@ struct TQType<float64x2_t> { static inline const float64x2_t& As(const __m128d& value) { return value; } -}; - -using _mm_set1_ps = TWrapperSimpleF<float32x4_t, - decltype(vdupq_n_f32), vdupq_n_f32, const float>; -using _mm_set_ps1 = TWrapperSimpleF<float32x4_t, - decltype(vdupq_n_f32), vdupq_n_f32, const float>; - -struct _mm_setzero_ps : TBaseWrapper<__m128> { +}; + +using _mm_set1_ps = TWrapperSimpleF<float32x4_t, + decltype(vdupq_n_f32), vdupq_n_f32, const float>; +using _mm_set_ps1 = TWrapperSimpleF<float32x4_t, + decltype(vdupq_n_f32), vdupq_n_f32, const float>; + +struct _mm_setzero_ps : TBaseWrapper<__m128> { Y_FORCE_INLINE - _mm_setzero_ps() { - TQType<float32x4_t>::As(Value) = vdupq_n_f32(0.); - } -}; - + _mm_setzero_ps() { + TQType<float32x4_t>::As(Value) = vdupq_n_f32(0.); + } +}; + Y_FORCE_INLINE __m128d _mm_setzero_pd() { return vdupq_n_f64(0.); } Y_FORCE_INLINE __m128 _mm_loadu_ps(const float* ptr) { - __m128 result; - result.AsFloat32x4 = vld1q_f32(ptr); - return result; -} - + __m128 result; + result.AsFloat32x4 = vld1q_f32(ptr); + return result; +} + Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) { __m128 result; result.AsFloat32x4 = vld1q_f32(ptr); @@ -751,23 +751,23 @@ Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) { } Y_FORCE_INLINE void _mm_storeu_ps(float* ptr, const __m128& op) { - vst1q_f32(ptr, op.AsFloat32x4); -} - + vst1q_f32(ptr, op.AsFloat32x4); +} + Y_FORCE_INLINE void _mm_store_ps(float* ptr, const __m128& op) { vst1q_f32(ptr, op.AsFloat32x4); } -struct _mm_set_ps : TBaseWrapper<__m128> { +struct _mm_set_ps : TBaseWrapper<__m128> { Y_FORCE_INLINE - _mm_set_ps(const float x3, const float x2, - const float x1, const float x0) { - float32x2_t d0 = {x0, x1}; - float32x2_t d1 = {x2, x3}; - TQType<float32x4_t>::As(Value) = vcombine_f32(d0, d1); - } -}; - + _mm_set_ps(const float x3, const float x2, + const float x1, const float x0) { + float32x2_t d0 = {x0, x1}; + float32x2_t d1 = {x2, x3}; + TQType<float32x4_t>::As(Value) = vcombine_f32(d0, d1); + } +}; + Y_FORCE_INLINE __m128d _mm_set_pd(double d1, double d0) { const float64x1_t p0 = {d0}; const float64x1_t p1 = {d1}; @@ -788,81 +788,81 @@ Y_FORCE_INLINE void _mm_store_pd(double* res, __m128d a) { vst1q_f64(res, a); } -using _mm_add_ps = TWrapperDualF<float32x4_t, decltype(vaddq_f32), vaddq_f32>; -using _mm_sub_ps = TWrapperDualF<float32x4_t, decltype(vsubq_f32), vsubq_f32>; -using _mm_mul_ps = TWrapperDualF<float32x4_t, decltype(vmulq_f32), vmulq_f32>; -using _mm_div_ps = TWrapperDualF<float32x4_t, decltype(vdivq_f32), vdivq_f32>; -using _mm_cmpeq_ps = TWrapperDualF<float32x4_t, decltype(vceqq_f32), vceqq_f32>; -using _mm_cmpgt_ps = TWrapperDualF<float32x4_t, decltype(vcgtq_f32), vcgtq_f32>; -using _mm_max_ps = TWrapperDualF<float32x4_t, decltype(vmaxq_f32), vmaxq_f32>; -using _mm_min_ps = TWrapperDualF<float32x4_t, decltype(vminq_f32), vminq_f32>; - +using _mm_add_ps = TWrapperDualF<float32x4_t, decltype(vaddq_f32), vaddq_f32>; +using _mm_sub_ps = TWrapperDualF<float32x4_t, decltype(vsubq_f32), vsubq_f32>; +using _mm_mul_ps = TWrapperDualF<float32x4_t, decltype(vmulq_f32), vmulq_f32>; +using _mm_div_ps = TWrapperDualF<float32x4_t, decltype(vdivq_f32), vdivq_f32>; +using _mm_cmpeq_ps = TWrapperDualF<float32x4_t, decltype(vceqq_f32), vceqq_f32>; +using _mm_cmpgt_ps = TWrapperDualF<float32x4_t, decltype(vcgtq_f32), vcgtq_f32>; +using _mm_max_ps = TWrapperDualF<float32x4_t, decltype(vmaxq_f32), vmaxq_f32>; +using _mm_min_ps = TWrapperDualF<float32x4_t, decltype(vminq_f32), vminq_f32>; + using _mm_add_pd = TWrapperDualF<float64x2_t, decltype(vaddq_f64), vaddq_f64, __m128d>; using _mm_sub_pd = TWrapperDualF<float64x2_t, decltype(vsubq_f64), vsubq_f64, __m128d>; using _mm_mul_pd = TWrapperDualF<float64x2_t, decltype(vmulq_f64), vmulq_f64, __m128d>; using _mm_div_pd = TWrapperDualF<float64x2_t, decltype(vdivq_f64), vdivq_f64, __m128d>; -struct _mm_and_ps : TBaseWrapper<__m128> { +struct _mm_and_ps : TBaseWrapper<__m128> { Y_FORCE_INLINE - _mm_and_ps(const __m128& op1, const __m128& op2) { - TQType<uint64x2_t>::As(Value) = - vandq_u64(TQType<uint64x2_t>::As(op1), - TQType<uint64x2_t>::As(op2)); - } -}; - + _mm_and_ps(const __m128& op1, const __m128& op2) { + TQType<uint64x2_t>::As(Value) = + vandq_u64(TQType<uint64x2_t>::As(op1), + TQType<uint64x2_t>::As(op2)); + } +}; + Y_FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) { return vandq_u64(a, b); } Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m128& op3) { - float64x2_t im0 = - (float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4); - float64x2_t im1 = - (float64x2_t)vtrn2q_f32(op0.AsFloat32x4, op1.AsFloat32x4); - float64x2_t im2 = - (float64x2_t)vtrn1q_f32(op2.AsFloat32x4, op3.AsFloat32x4); - float64x2_t im3 = - (float64x2_t)vtrn2q_f32(op2.AsFloat32x4, op3.AsFloat32x4); - - TQType<float64x2_t>::As(op0) = vtrn1q_f64(im0, im2); - TQType<float64x2_t>::As(op1) = vtrn1q_f64(im1, im3); - TQType<float64x2_t>::As(op2) = vtrn2q_f64(im0, im2); - TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3); -}; - + float64x2_t im0 = + (float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4); + float64x2_t im1 = + (float64x2_t)vtrn2q_f32(op0.AsFloat32x4, op1.AsFloat32x4); + float64x2_t im2 = + (float64x2_t)vtrn1q_f32(op2.AsFloat32x4, op3.AsFloat32x4); + float64x2_t im3 = + (float64x2_t)vtrn2q_f32(op2.AsFloat32x4, op3.AsFloat32x4); + + TQType<float64x2_t>::As(op0) = vtrn1q_f64(im0, im2); + TQType<float64x2_t>::As(op1) = vtrn1q_f64(im1, im3); + TQType<float64x2_t>::As(op2) = vtrn2q_f64(im0, im2); + TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3); +}; + Y_FORCE_INLINE __m128 _mm_castsi128_ps(__m128i op) { - return reinterpret_cast<__m128&>(op); -} - + return reinterpret_cast<__m128&>(op); +} + Y_FORCE_INLINE __m128i _mm_castps_si128(__m128 op) { - return reinterpret_cast<__m128i&>(op); -} - -template <typename TOpOut, typename TOpIn, - typename TFunc, TFunc* func, typename... TParams> -struct TCvtS2FWrapperSingle : TBaseWrapper<__m128> { + return reinterpret_cast<__m128i&>(op); +} + +template <typename TOpOut, typename TOpIn, + typename TFunc, TFunc* func, typename... TParams> +struct TCvtS2FWrapperSingle : TBaseWrapper<__m128> { Y_FORCE_INLINE - TCvtS2FWrapperSingle(const __m128i& op, TParams... params) { - TQType<TOpOut>::As(Value) = - func(TQType<TOpIn>::As(op), params...); - } -}; - -using _mm_cvtepi32_ps = - TCvtS2FWrapperSingle<float32x4_t, int32x4_t, - decltype(vcvtq_f32_s32), vcvtq_f32_s32>; - -template <typename TOpOut, typename TOpIn, - typename TFunc, TFunc* func, typename... TParams> -struct TCvtF2SWrapperSingle : TBaseWrapper<__m128i> { + TCvtS2FWrapperSingle(const __m128i& op, TParams... params) { + TQType<TOpOut>::As(Value) = + func(TQType<TOpIn>::As(op), params...); + } +}; + +using _mm_cvtepi32_ps = + TCvtS2FWrapperSingle<float32x4_t, int32x4_t, + decltype(vcvtq_f32_s32), vcvtq_f32_s32>; + +template <typename TOpOut, typename TOpIn, + typename TFunc, TFunc* func, typename... TParams> +struct TCvtF2SWrapperSingle : TBaseWrapper<__m128i> { Y_FORCE_INLINE - TCvtF2SWrapperSingle(const __m128& op, TParams... params) { - TQType<TOpOut>::As(Value) = - func(TQType<TOpIn>::As(op), params...); - } -}; - + TCvtF2SWrapperSingle(const __m128& op, TParams... params) { + TQType<TOpOut>::As(Value) = + func(TQType<TOpIn>::As(op), params...); + } +}; + inline __m128i _mm_cvtps_epi32(__m128 a) { /// vcvtq_s32_f32 rounds to zero, but we need to round to the nearest. static const float32x4_t half = vdupq_n_f32(0.5f); @@ -874,26 +874,26 @@ inline __m128i _mm_cvtps_epi32(__m128 a) { return res; } -using _mm_cvttps_epi32 = - TCvtF2SWrapperSingle<int32x4_t, float32x4_t, - decltype(vcvtq_s32_f32), vcvtq_s32_f32>; - +using _mm_cvttps_epi32 = + TCvtF2SWrapperSingle<int32x4_t, float32x4_t, + decltype(vcvtq_s32_f32), vcvtq_s32_f32>; + Y_FORCE_INLINE int -_mm_movemask_ps(const __m128& op) { - uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; - uint32x4_t bits = vandq_u32(op.AsUi32x4, mask); - int32x4_t shifts = {-31, -30, -29, -28}; - bits = vshlq_u32(bits, shifts); - return vaddvq_u32(bits); -} +_mm_movemask_ps(const __m128& op) { + uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; + uint32x4_t bits = vandq_u32(op.AsUi32x4, mask); + int32x4_t shifts = {-31, -30, -29, -28}; + bits = vshlq_u32(bits, shifts); + return vaddvq_u32(bits); +} Y_FORCE_INLINE i64 _mm_cvtsi128_si64(__m128i a) { return vgetq_lane_s64(a.AsSi64x2, 0); } - -static inline void _mm_pause() { + +static inline void _mm_pause() { __asm__ ("YIELD"); -} +} static inline __m128 _mm_rsqrt_ps(__m128 a) { __m128 res; diff --git a/library/cpp/sse/ut/test.cpp b/library/cpp/sse/ut/test.cpp index 42a82a8cfa..33c999d284 100644 --- a/library/cpp/sse/ut/test.cpp +++ b/library/cpp/sse/ut/test.cpp @@ -1,10 +1,10 @@ -/* - Unittests for all SSE instrinsics translated to NEON instrinsics or - software implementation. - Should be tested both on Intel and ARM64. - */ -/* Author: Vitaliy Manushkin <agri@yandex-team.ru */ - +/* + Unittests for all SSE instrinsics translated to NEON instrinsics or + software implementation. + Should be tested both on Intel and ARM64. + */ +/* Author: Vitaliy Manushkin <agri@yandex-team.ru */ + #include <library/cpp/testing/unittest/registar.h> #include <util/generic/typetraits.h> @@ -13,35 +13,35 @@ #include <util/stream/output.h> #include <algorithm> -#include <array> -#include <limits> +#include <array> +#include <limits> #include <memory> #include <type_traits> #include <utility> - -template <typename TResult, typename TFunc, TFunc* func> -struct T_mm_CallWrapper { - TResult Value; - - template <typename... TParams> - T_mm_CallWrapper(TParams&&... params) { - Value = func(std::forward<TParams>(params)...); - } - - operator TResult&() { - return Value; - } - - operator const TResult&() const { - return Value; - } -}; - -#if defined(_arm64_) + +template <typename TResult, typename TFunc, TFunc* func> +struct T_mm_CallWrapper { + TResult Value; + + template <typename... TParams> + T_mm_CallWrapper(TParams&&... params) { + Value = func(std::forward<TParams>(params)...); + } + + operator TResult&() { + return Value; + } + + operator const TResult&() const { + return Value; + } +}; + +#if defined(_arm64_) #include "library/cpp/sse/sse2neon.h" #elif defined(_i386_) || defined(_x86_64_) -#include <xmmintrin.h> -#include <emmintrin.h> +#include <xmmintrin.h> +#include <emmintrin.h> #include <smmintrin.h> #elif defined(_ppc64_) #include "library/cpp/sse/powerpc.h" @@ -54,10 +54,10 @@ struct T_mm_CallWrapper { #define WrapF(T_mm_func) T_mm_func #define WrapD(T_mm_func) T_mm_func #elif defined(_ppc64_) || defined(_i386_) || defined(_x86_64_) -#define Wrap(_mm_func) \ - T_mm_CallWrapper<__m128i, decltype(_mm_func), _mm_func> -#define WrapF(_mm_func) \ - T_mm_CallWrapper<__m128, decltype(_mm_func), _mm_func> +#define Wrap(_mm_func) \ + T_mm_CallWrapper<__m128i, decltype(_mm_func), _mm_func> +#define WrapF(_mm_func) \ + T_mm_CallWrapper<__m128, decltype(_mm_func), _mm_func> #define WrapD(_mm_func) \ T_mm_CallWrapper<__m128d, decltype(_mm_func), _mm_func> using int8x16_t = std::array<i8, 16>; @@ -70,69 +70,69 @@ using uint32x4_t = std::array<ui32, 4>; using uint64x2_t = std::array<ui64, 2>; using float32x4_t = std::array<float, 4>; using float64x2_t = std::array<double, 2>; - + template <typename TVectorType> -struct TQType { +struct TQType { static TVectorType As(__m128i param) { TVectorType value; - _mm_storeu_si128((__m128i*)&value, param); - return value; - } + _mm_storeu_si128((__m128i*)&value, param); + return value; + } static TVectorType As(__m128 param) { TVectorType value; - _mm_storeu_ps((float*)&value, param); - return value; - } + _mm_storeu_ps((float*)&value, param); + return value; + } static TVectorType As(__m128d param) { TVectorType value; _mm_storeu_pd((double*)&value, param); return value; } -}; -#endif - +}; +#endif + template <typename TVectorType> -struct TFuncLoad; +struct TFuncLoad; template <typename TVectorType> -struct TFuncStore; - -template <> -struct TFuncLoad<__m128i> { - __m128i Value; - - template <typename TPointer> - TFuncLoad(TPointer* ptr) { - Value = _mm_loadu_si128((__m128i*)ptr); - } - - operator __m128i&() { - return Value; - } - - operator const __m128i&() const { - return Value; - } -}; - -template <> -struct TFuncLoad<__m128> { - __m128 Value; - - template <typename TPointer> - TFuncLoad(TPointer* ptr) { - Value = _mm_loadu_ps((float*)ptr); - } - - operator __m128&() { - return Value; - } - - operator const __m128&() const { - return Value; - } -}; - -template <> +struct TFuncStore; + +template <> +struct TFuncLoad<__m128i> { + __m128i Value; + + template <typename TPointer> + TFuncLoad(TPointer* ptr) { + Value = _mm_loadu_si128((__m128i*)ptr); + } + + operator __m128i&() { + return Value; + } + + operator const __m128i&() const { + return Value; + } +}; + +template <> +struct TFuncLoad<__m128> { + __m128 Value; + + template <typename TPointer> + TFuncLoad(TPointer* ptr) { + Value = _mm_loadu_ps((float*)ptr); + } + + operator __m128&() { + return Value; + } + + operator const __m128&() const { + return Value; + } +}; + +template <> struct TFuncLoad<__m128d> { __m128d Value; @@ -151,153 +151,153 @@ struct TFuncLoad<__m128d> { }; template <> -struct TFuncStore<__m128i> { - template <typename TPointer> - TFuncStore(TPointer* ptr, __m128i Value) { - _mm_storeu_si128((__m128i*)ptr, Value); - } -}; - -template <> -struct TFuncStore<__m128> { - template <typename TPointer> - TFuncStore(TPointer* ptr, __m128 Value) { - _mm_storeu_ps((float*)ptr, Value); - } -}; - -class TSSEEmulTest: public TTestBase { -private: - UNIT_TEST_SUITE(TSSEEmulTest); - UNIT_TEST(Test_mm_load_si128); - UNIT_TEST(Test_mm_loadu_si128); +struct TFuncStore<__m128i> { + template <typename TPointer> + TFuncStore(TPointer* ptr, __m128i Value) { + _mm_storeu_si128((__m128i*)ptr, Value); + } +}; + +template <> +struct TFuncStore<__m128> { + template <typename TPointer> + TFuncStore(TPointer* ptr, __m128 Value) { + _mm_storeu_ps((float*)ptr, Value); + } +}; + +class TSSEEmulTest: public TTestBase { +private: + UNIT_TEST_SUITE(TSSEEmulTest); + UNIT_TEST(Test_mm_load_si128); + UNIT_TEST(Test_mm_loadu_si128); UNIT_TEST(Test_mm_storeu_si128); UNIT_TEST(Test_mm_loadu_si128_2); UNIT_TEST(Test_mm_loadu_ps); UNIT_TEST(Test_mm_storeu_ps); - + UNIT_TEST(Test_mm_slli_epi16); UNIT_TEST(Test_mm_slli_epi32); UNIT_TEST(Test_mm_slli_epi64); UNIT_TEST(Test_mm_slli_si128); - UNIT_TEST(Test_mm_srli_epi16); - UNIT_TEST(Test_mm_srli_epi32); - UNIT_TEST(Test_mm_srli_epi64); + UNIT_TEST(Test_mm_srli_epi16); + UNIT_TEST(Test_mm_srli_epi32); + UNIT_TEST(Test_mm_srli_epi64); UNIT_TEST(Test_mm_srli_si128); - + UNIT_TEST(Test_mm_srai_epi16); UNIT_TEST(Test_mm_srai_epi32); UNIT_TEST(Test_mm_sll_epi16); UNIT_TEST(Test_mm_sll_epi32); UNIT_TEST(Test_mm_sll_epi64); - + UNIT_TEST(Test_mm_srl_epi16); UNIT_TEST(Test_mm_srl_epi32); UNIT_TEST(Test_mm_srl_epi64); - UNIT_TEST(Test_mm_add_epi16); - UNIT_TEST(Test_mm_add_epi32); - UNIT_TEST(Test_mm_add_epi64); - UNIT_TEST(Test_mm_add_ps); + UNIT_TEST(Test_mm_add_epi16); + UNIT_TEST(Test_mm_add_epi32); + UNIT_TEST(Test_mm_add_epi64); + UNIT_TEST(Test_mm_add_ps); UNIT_TEST(Test_mm_add_pd); - + UNIT_TEST(Test_mm_madd_epi16); - UNIT_TEST(Test_mm_sub_epi16); - UNIT_TEST(Test_mm_sub_epi32); - UNIT_TEST(Test_mm_sub_epi64); - UNIT_TEST(Test_mm_sub_ps); + UNIT_TEST(Test_mm_sub_epi16); + UNIT_TEST(Test_mm_sub_epi32); + UNIT_TEST(Test_mm_sub_epi64); + UNIT_TEST(Test_mm_sub_ps); UNIT_TEST(Test_mm_sub_pd); - - UNIT_TEST(Test_mm_mul_ps); + + UNIT_TEST(Test_mm_mul_ps); UNIT_TEST(Test_mm_mul_pd); - UNIT_TEST(Test_mm_div_ps); + UNIT_TEST(Test_mm_div_ps); UNIT_TEST(Test_mm_div_pd); - UNIT_TEST(Test_mm_max_ps); - UNIT_TEST(Test_mm_min_ps); - UNIT_TEST(Test_mm_and_ps); - - UNIT_TEST(Test_mm_unpacklo_epi8); - UNIT_TEST(Test_mm_unpackhi_epi8); - UNIT_TEST(Test_mm_unpacklo_epi16); - UNIT_TEST(Test_mm_unpackhi_epi16); - UNIT_TEST(Test_mm_unpacklo_epi32); - UNIT_TEST(Test_mm_unpackhi_epi32); - UNIT_TEST(Test_mm_unpacklo_epi64); - UNIT_TEST(Test_mm_unpackhi_epi64); - - UNIT_TEST(Test_mm_or_si128); - UNIT_TEST(Test_mm_and_si128); - UNIT_TEST(Test_mm_andnot_si128); - - UNIT_TEST(Test_mm_cmpeq_epi8); - UNIT_TEST(Test_mm_cmpeq_epi16); - UNIT_TEST(Test_mm_cmpeq_epi32); - UNIT_TEST(Test_mm_cmpeq_ps); - - UNIT_TEST(Test_mm_cmpgt_epi8); - UNIT_TEST(Test_mm_cmpgt_epi16); - UNIT_TEST(Test_mm_cmpgt_epi32); - UNIT_TEST(Test_mm_cmpgt_ps); - - UNIT_TEST(Test_mm_cmplt_epi8); - UNIT_TEST(Test_mm_cmplt_epi16); - UNIT_TEST(Test_mm_cmplt_epi32); - - UNIT_TEST(Test_mm_set1_epi8); - UNIT_TEST(Test_mm_set1_epi16); - UNIT_TEST(Test_mm_set1_epi32); - UNIT_TEST(Test_mm_set1_ps); + UNIT_TEST(Test_mm_max_ps); + UNIT_TEST(Test_mm_min_ps); + UNIT_TEST(Test_mm_and_ps); + + UNIT_TEST(Test_mm_unpacklo_epi8); + UNIT_TEST(Test_mm_unpackhi_epi8); + UNIT_TEST(Test_mm_unpacklo_epi16); + UNIT_TEST(Test_mm_unpackhi_epi16); + UNIT_TEST(Test_mm_unpacklo_epi32); + UNIT_TEST(Test_mm_unpackhi_epi32); + UNIT_TEST(Test_mm_unpacklo_epi64); + UNIT_TEST(Test_mm_unpackhi_epi64); + + UNIT_TEST(Test_mm_or_si128); + UNIT_TEST(Test_mm_and_si128); + UNIT_TEST(Test_mm_andnot_si128); + + UNIT_TEST(Test_mm_cmpeq_epi8); + UNIT_TEST(Test_mm_cmpeq_epi16); + UNIT_TEST(Test_mm_cmpeq_epi32); + UNIT_TEST(Test_mm_cmpeq_ps); + + UNIT_TEST(Test_mm_cmpgt_epi8); + UNIT_TEST(Test_mm_cmpgt_epi16); + UNIT_TEST(Test_mm_cmpgt_epi32); + UNIT_TEST(Test_mm_cmpgt_ps); + + UNIT_TEST(Test_mm_cmplt_epi8); + UNIT_TEST(Test_mm_cmplt_epi16); + UNIT_TEST(Test_mm_cmplt_epi32); + + UNIT_TEST(Test_mm_set1_epi8); + UNIT_TEST(Test_mm_set1_epi16); + UNIT_TEST(Test_mm_set1_epi32); + UNIT_TEST(Test_mm_set1_ps); UNIT_TEST(Test_mm_set_ps1); - - UNIT_TEST(Test_mm_setzero_si128); - UNIT_TEST(Test_mm_setzero_ps); + + UNIT_TEST(Test_mm_setzero_si128); + UNIT_TEST(Test_mm_setzero_ps); UNIT_TEST(Test_mm_setzero_pd); - - UNIT_TEST(Test_mm_storel_epi64); - UNIT_TEST(Test_mm_loadl_epi64); - + + UNIT_TEST(Test_mm_storel_epi64); + UNIT_TEST(Test_mm_loadl_epi64); + UNIT_TEST(Test_mm_loadl_pd); UNIT_TEST(Test_mm_loadh_pd); UNIT_TEST(Test_mm_cvtsd_f64); - UNIT_TEST(Test_mm_shuffle_epi32); - UNIT_TEST(Test_mm_movemask_epi8); - UNIT_TEST(Test_mm_cvtsi128_si32); + UNIT_TEST(Test_mm_shuffle_epi32); + UNIT_TEST(Test_mm_movemask_epi8); + UNIT_TEST(Test_mm_cvtsi128_si32); UNIT_TEST(Test_mm_cvtsi128_si64); - - UNIT_TEST(Test_mm_set_epi16); - UNIT_TEST(Test_mm_set_epi32); - UNIT_TEST(Test_mm_set_ps); + + UNIT_TEST(Test_mm_set_epi16); + UNIT_TEST(Test_mm_set_epi32); + UNIT_TEST(Test_mm_set_ps); UNIT_TEST(Test_mm_set_pd); - - UNIT_TEST(Test_mm_cvtsi32_si128); + + UNIT_TEST(Test_mm_cvtsi32_si128); UNIT_TEST(Test_mm_cvtsi64_si128); - - UNIT_TEST(Test_mm_packs_epi16); - UNIT_TEST(Test_mm_packs_epi32); - UNIT_TEST(Test_mm_packus_epi16); - - UNIT_TEST(Test_mm_extract_epi16); + + UNIT_TEST(Test_mm_packs_epi16); + UNIT_TEST(Test_mm_packs_epi32); + UNIT_TEST(Test_mm_packus_epi16); + + UNIT_TEST(Test_mm_extract_epi16); UNIT_TEST(Test_mm_extract_epi8); UNIT_TEST(Test_mm_extract_epi32); UNIT_TEST(Test_mm_extract_epi64); - - UNIT_TEST(Test_MM_TRANSPOSE4_PS); - UNIT_TEST(Test_mm_movemask_ps); + + UNIT_TEST(Test_MM_TRANSPOSE4_PS); + UNIT_TEST(Test_mm_movemask_ps); UNIT_TEST(Test_mm_movemask_ps_2); - - UNIT_TEST(Test_mm_cvtepi32_ps); - UNIT_TEST(Test_mm_cvtps_epi32); - UNIT_TEST(Test_mm_cvttps_epi32); - - UNIT_TEST(Test_mm_castsi128_ps); - UNIT_TEST(Test_mm_castps_si128); - - UNIT_TEST(Test_mm_mul_epu32); - + + UNIT_TEST(Test_mm_cvtepi32_ps); + UNIT_TEST(Test_mm_cvtps_epi32); + UNIT_TEST(Test_mm_cvttps_epi32); + + UNIT_TEST(Test_mm_castsi128_ps); + UNIT_TEST(Test_mm_castps_si128); + + UNIT_TEST(Test_mm_mul_epu32); + UNIT_TEST(Test_mm_cmpunord_ps); UNIT_TEST(Test_mm_andnot_ps); UNIT_TEST(Test_mm_shuffle_ps); @@ -310,36 +310,36 @@ private: UNIT_TEST(Test_mm_rsqrt_ps); UNIT_TEST(Test_matrixnet_powerpc); - UNIT_TEST_SUITE_END(); - -public: - void Test_mm_load_si128(); - void Test_mm_loadu_si128(); + UNIT_TEST_SUITE_END(); + +public: + void Test_mm_load_si128(); + void Test_mm_loadu_si128(); void Test_mm_storeu_si128(); void Test_mm_loadu_si128_2(); void Test_mm_loadu_ps(); void Test_mm_storeu_ps(); - - template <typename TElem, int bits, int elemCount, + + template <typename TElem, int bits, int elemCount, typename TFunc, typename TShifter, typename TOp, typename TElemFunc> - void Test_mm_shifter_epiXX(); - + void Test_mm_shifter_epiXX(); + enum class EDirection { Left, Right }; - + struct TShiftRes { __m128i Value[17]; }; void Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo); - void Test_mm_slli_epi16(); - void Test_mm_slli_epi32(); - void Test_mm_slli_epi64(); + void Test_mm_slli_epi16(); + void Test_mm_slli_epi32(); + void Test_mm_slli_epi64(); void Test_mm_slli_si128(); - + void Test_mm_srli_epi16(); void Test_mm_srli_epi32(); void Test_mm_srli_epi64(); @@ -356,134 +356,134 @@ public: void Test_mm_srl_epi32(); void Test_mm_srl_epi64(); - void Test_mm_add_epi8(); - void Test_mm_add_epi16(); - void Test_mm_add_epi32(); - void Test_mm_add_epi64(); - void Test_mm_add_ps(); + void Test_mm_add_epi8(); + void Test_mm_add_epi16(); + void Test_mm_add_epi32(); + void Test_mm_add_epi64(); + void Test_mm_add_ps(); void Test_mm_add_pd(); - + void Test_mm_madd_epi16(); - void Test_mm_sub_epi8(); - void Test_mm_sub_epi16(); - void Test_mm_sub_epi32(); - void Test_mm_sub_epi64(); - void Test_mm_sub_ps(); + void Test_mm_sub_epi8(); + void Test_mm_sub_epi16(); + void Test_mm_sub_epi32(); + void Test_mm_sub_epi64(); + void Test_mm_sub_ps(); void Test_mm_sub_pd(); - - void Test_mm_mul_ps(); + + void Test_mm_mul_ps(); void Test_mm_mul_pd(); - void Test_mm_div_ps(); + void Test_mm_div_ps(); void Test_mm_div_pd(); - void Test_mm_max_ps(); - void Test_mm_min_ps(); - void Test_mm_and_ps(); - - template <typename TElem, int bits, int elemCount, int shift, - typename TFunc, typename TOp> - void Test_mm_unpack_epiXX(); - void Test_mm_unpacklo_epi8(); - void Test_mm_unpackhi_epi8(); - void Test_mm_unpacklo_epi16(); - void Test_mm_unpackhi_epi16(); - void Test_mm_unpacklo_epi32(); - void Test_mm_unpackhi_epi32(); - void Test_mm_unpacklo_epi64(); - void Test_mm_unpackhi_epi64(); - - template <typename TElem, unsigned elemCount, - typename TFunc, typename TElemFunc, + void Test_mm_max_ps(); + void Test_mm_min_ps(); + void Test_mm_and_ps(); + + template <typename TElem, int bits, int elemCount, int shift, + typename TFunc, typename TOp> + void Test_mm_unpack_epiXX(); + void Test_mm_unpacklo_epi8(); + void Test_mm_unpackhi_epi8(); + void Test_mm_unpacklo_epi16(); + void Test_mm_unpackhi_epi16(); + void Test_mm_unpacklo_epi32(); + void Test_mm_unpackhi_epi32(); + void Test_mm_unpacklo_epi64(); + void Test_mm_unpackhi_epi64(); + + template <typename TElem, unsigned elemCount, + typename TFunc, typename TElemFunc, typename TOp, typename TVectorType = __m128i> - void Test_mm_dualop(); - - template <typename TElem, unsigned elemCount, - typename TFunc, typename TElemFunc, + void Test_mm_dualop(); + + template <typename TElem, unsigned elemCount, + typename TFunc, typename TElemFunc, typename TOp, typename TVectorType = __m128i> - void Test_mm_dualcmp(); - - void Test_mm_or_si128(); - void Test_mm_and_si128(); - void Test_mm_andnot_si128(); - - void Test_mm_cmpeq_epi8(); - void Test_mm_cmpeq_epi16(); - void Test_mm_cmpeq_epi32(); - void Test_mm_cmpeq_ps(); - - void Test_mm_cmpgt_epi8(); - void Test_mm_cmpgt_epi16(); - void Test_mm_cmpgt_epi32(); - void Test_mm_cmpgt_ps(); - - void Test_mm_cmplt_epi8(); - void Test_mm_cmplt_epi16(); - void Test_mm_cmplt_epi32(); - - template <typename TElem, int elemCount, + void Test_mm_dualcmp(); + + void Test_mm_or_si128(); + void Test_mm_and_si128(); + void Test_mm_andnot_si128(); + + void Test_mm_cmpeq_epi8(); + void Test_mm_cmpeq_epi16(); + void Test_mm_cmpeq_epi32(); + void Test_mm_cmpeq_ps(); + + void Test_mm_cmpgt_epi8(); + void Test_mm_cmpgt_epi16(); + void Test_mm_cmpgt_epi32(); + void Test_mm_cmpgt_ps(); + + void Test_mm_cmplt_epi8(); + void Test_mm_cmplt_epi16(); + void Test_mm_cmplt_epi32(); + + template <typename TElem, int elemCount, typename TFunc, typename TOp, typename TVectorType> - void Test_mm_setter_epiXX(); - void Test_mm_set1_epi8(); - void Test_mm_set1_epi16(); - void Test_mm_set1_epi32(); - void Test_mm_set1_ps(); + void Test_mm_setter_epiXX(); + void Test_mm_set1_epi8(); + void Test_mm_set1_epi16(); + void Test_mm_set1_epi32(); + void Test_mm_set1_ps(); void Test_mm_set_ps1(); - - void Test_mm_setzero_si128(); - void Test_mm_setzero_ps(); + + void Test_mm_setzero_si128(); + void Test_mm_setzero_ps(); void Test_mm_setzero_pd(); - - void Test_mm_loadl_epi64(); - void Test_mm_storel_epi64(); - + + void Test_mm_loadl_epi64(); + void Test_mm_storel_epi64(); + void Test_mm_loadl_pd(); void Test_mm_loadh_pd(); void Test_mm_cvtsd_f64(); - void Test_mm_shuffle_epi32(); - void Test_mm_movemask_epi8(); - void Test_mm_cvtsi128_si32(); + void Test_mm_shuffle_epi32(); + void Test_mm_movemask_epi8(); + void Test_mm_cvtsi128_si32(); void Test_mm_cvtsi128_si64(); - - void Test_mm_set_epi16(); - void Test_mm_set_epi32(); - void Test_mm_set_ps(); + + void Test_mm_set_epi16(); + void Test_mm_set_epi32(); + void Test_mm_set_ps(); void Test_mm_set_pd(); - - void Test_mm_cvtsi32_si128(); + + void Test_mm_cvtsi32_si128(); void Test_mm_cvtsi64_si128(); - - template <typename TElem, typename TNarrow, unsigned elemCount, - typename TFunc> - void Test_mm_packs_epiXX(); - void Test_mm_packs_epi16(); - void Test_mm_packs_epi32(); - void Test_mm_packus_epi16(); - - void Test_mm_extract_epi16(); + + template <typename TElem, typename TNarrow, unsigned elemCount, + typename TFunc> + void Test_mm_packs_epiXX(); + void Test_mm_packs_epi16(); + void Test_mm_packs_epi32(); + void Test_mm_packus_epi16(); + + void Test_mm_extract_epi16(); void Test_mm_extract_epi8(); void Test_mm_extract_epi32(); void Test_mm_extract_epi64(); - - void Test_MM_TRANSPOSE4_PS(); - void Test_mm_movemask_ps(); + + void Test_MM_TRANSPOSE4_PS(); + void Test_mm_movemask_ps(); void Test_mm_movemask_ps_2(); - - template <typename TFrom, typename TTo, unsigned elemCount, - typename TLoadVector, typename TResultVector, - typename TElemFunc, typename TFunc, typename TOp> - void Test_mm_convertop(); - void Test_mm_cvtepi32_ps(); - void Test_mm_cvtps_epi32(); - void Test_mm_cvttps_epi32(); - - template <typename TLoadVector, typename TCastVector, - typename TFunc, TFunc* func> - void Test_mm_castXX(); - void Test_mm_castsi128_ps(); - void Test_mm_castps_si128(); - - void Test_mm_mul_epu32(); + + template <typename TFrom, typename TTo, unsigned elemCount, + typename TLoadVector, typename TResultVector, + typename TElemFunc, typename TFunc, typename TOp> + void Test_mm_convertop(); + void Test_mm_cvtepi32_ps(); + void Test_mm_cvtps_epi32(); + void Test_mm_cvttps_epi32(); + + template <typename TLoadVector, typename TCastVector, + typename TFunc, TFunc* func> + void Test_mm_castXX(); + void Test_mm_castsi128_ps(); + void Test_mm_castps_si128(); + + void Test_mm_mul_epu32(); void Test_mm_cmpunord_ps(); void Test_mm_store_ss(); @@ -497,30 +497,30 @@ public: void Test_mm_rsqrt_ps(); void Test_mm_rsqrt_ss(); void Test_matrixnet_powerpc(); -}; - -UNIT_TEST_SUITE_REGISTRATION(TSSEEmulTest); - -void TSSEEmulTest::Test_mm_load_si128() { +}; + +UNIT_TEST_SUITE_REGISTRATION(TSSEEmulTest); + +void TSSEEmulTest::Test_mm_load_si128() { alignas(16) char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - __m128i value = _mm_load_si128((__m128i*)&data); - UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[0], 0xAABB2211CCFF00AAUL); - UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[1], 0x1C66775588449933UL); -} - -void TSSEEmulTest::Test_mm_loadu_si128() { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + __m128i value = _mm_load_si128((__m128i*)&data); + UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[0], 0xAABB2211CCFF00AAUL); + UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[1], 0x1C66775588449933UL); +} + +void TSSEEmulTest::Test_mm_loadu_si128() { alignas(16) char data[17] = { - '\x66', - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - UNIT_ASSERT((ui64(&data[1]) & 0x1) == 0x1); - __m128i value = _mm_loadu_si128((__m128i*)&data[1]); - UNIT_ASSERT(TQType<uint64x2_t>::As(value)[0] == 0xAABB2211CCFF00AAUL); - UNIT_ASSERT(TQType<uint64x2_t>::As(value)[1] == 0x1C66775588449933UL); -} - + '\x66', + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + UNIT_ASSERT((ui64(&data[1]) & 0x1) == 0x1); + __m128i value = _mm_loadu_si128((__m128i*)&data[1]); + UNIT_ASSERT(TQType<uint64x2_t>::As(value)[0] == 0xAABB2211CCFF00AAUL); + UNIT_ASSERT(TQType<uint64x2_t>::As(value)[1] == 0x1C66775588449933UL); +} + void TSSEEmulTest::Test_mm_storeu_si128() { alignas(16) unsigned char stub[32] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -640,32 +640,32 @@ unsigned MakeNumber<unsigned>(unsigned number) { return number; } -template <typename TElem, int bits, int elemCount, +template <typename TElem, int bits, int elemCount, typename TFunc, typename TShifter, typename TOp, typename TElemFunc> -void TSSEEmulTest::Test_mm_shifter_epiXX() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - TElem* dataw = reinterpret_cast<TElem*>(&data); - - __m128i value = _mm_loadu_si128((__m128i*)&data); - +void TSSEEmulTest::Test_mm_shifter_epiXX() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + TElem* dataw = reinterpret_cast<TElem*>(&data); + + __m128i value = _mm_loadu_si128((__m128i*)&data); + for (unsigned shifter = 0; shifter <= bits; ++shifter) { - TElem shiftedData[elemCount]; + TElem shiftedData[elemCount]; for (unsigned i = 0; i < elemCount; ++i) { - shiftedData[i] = TElemFunc::Call(dataw[i], shifter); + shiftedData[i] = TElemFunc::Call(dataw[i], shifter); } - + const TShifter adhoc_shifter = MakeNumber<TShifter>(shifter); __m128i result = TFunc(value, adhoc_shifter); for (unsigned i = 0; i < elemCount; ++i) { - UNIT_ASSERT_EQUAL(shiftedData[i], TQType<TOp>::As(result)[i]); + UNIT_ASSERT_EQUAL(shiftedData[i], TQType<TOp>::As(result)[i]); } - } -} - + } +} + void TSSEEmulTest::Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo) { const char data[48] = { @@ -713,52 +713,52 @@ struct THelperASHR { } }; -template <typename TElem> -struct THelperSHR { - static TElem Call(const TElem op, const int shift) { +template <typename TElem> +struct THelperSHR { + static TElem Call(const TElem op, const int shift) { constexpr int nBitsInOp = sizeof(op) * CHAR_BIT; return shift < nBitsInOp ? op >> shift : 0; - } -}; - -void TSSEEmulTest::Test_mm_srli_epi16() { + } +}; + +void TSSEEmulTest::Test_mm_srli_epi16() { Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_srli_epi16), unsigned, uint16x8_t, - THelperSHR<ui16>>(); -} - -void TSSEEmulTest::Test_mm_srli_epi32() { + THelperSHR<ui16>>(); +} + +void TSSEEmulTest::Test_mm_srli_epi32() { Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_srli_epi32), unsigned, uint32x4_t, - THelperSHR<ui32>>(); -} - -void TSSEEmulTest::Test_mm_srli_epi64() { + THelperSHR<ui32>>(); +} + +void TSSEEmulTest::Test_mm_srli_epi64() { Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_srli_epi64), unsigned, uint64x2_t, - THelperSHR<ui64>>(); -} - -template <typename TElem> -struct THelperSHL { - static TElem Call(const TElem op, const int shift) { + THelperSHR<ui64>>(); +} + +template <typename TElem> +struct THelperSHL { + static TElem Call(const TElem op, const int shift) { constexpr int nBitsInOp = sizeof(op) * CHAR_BIT; return shift < nBitsInOp ? op << shift : 0; - } -}; - -void TSSEEmulTest::Test_mm_slli_epi16() { + } +}; + +void TSSEEmulTest::Test_mm_slli_epi16() { Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_slli_epi16), unsigned, uint16x8_t, - THelperSHL<ui16>>(); -} - -void TSSEEmulTest::Test_mm_slli_epi32() { + THelperSHL<ui16>>(); +} + +void TSSEEmulTest::Test_mm_slli_epi32() { Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_slli_epi32), unsigned, uint32x4_t, - THelperSHL<ui32>>(); -} - -void TSSEEmulTest::Test_mm_slli_epi64() { + THelperSHL<ui32>>(); +} + +void TSSEEmulTest::Test_mm_slli_epi64() { Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_slli_epi64), unsigned, uint64x2_t, - THelperSHL<ui64>>(); -} - + THelperSHL<ui64>>(); +} + void TSSEEmulTest::Test_mm_slli_si128() { Test_mm_byte_shifter(EDirection::Left, [] (__m128i a) -> TShiftRes { TShiftRes res; @@ -849,30 +849,30 @@ void TSSEEmulTest::Test_mm_sll_epi64() { THelperSHL<ui64>>(); } -template <typename TElem> -struct THelperAdd { - static TElem Call(const TElem op1, const TElem op2) { - return op1 + op2; - } -}; - -void TSSEEmulTest::Test_mm_add_epi16() { - Test_mm_dualop<ui16, 8, Wrap(_mm_add_epi16), THelperAdd<ui16>, uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_add_epi32() { - Test_mm_dualop<ui32, 4, Wrap(_mm_add_epi32), THelperAdd<ui32>, uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_add_epi64() { - Test_mm_dualop<ui64, 2, Wrap(_mm_add_epi64), THelperAdd<ui64>, uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_add_ps() { - Test_mm_dualop<float, 2, WrapF(_mm_add_ps), - THelperAdd<float>, float32x4_t, __m128>(); -} - +template <typename TElem> +struct THelperAdd { + static TElem Call(const TElem op1, const TElem op2) { + return op1 + op2; + } +}; + +void TSSEEmulTest::Test_mm_add_epi16() { + Test_mm_dualop<ui16, 8, Wrap(_mm_add_epi16), THelperAdd<ui16>, uint16x8_t>(); +} + +void TSSEEmulTest::Test_mm_add_epi32() { + Test_mm_dualop<ui32, 4, Wrap(_mm_add_epi32), THelperAdd<ui32>, uint32x4_t>(); +} + +void TSSEEmulTest::Test_mm_add_epi64() { + Test_mm_dualop<ui64, 2, Wrap(_mm_add_epi64), THelperAdd<ui64>, uint64x2_t>(); +} + +void TSSEEmulTest::Test_mm_add_ps() { + Test_mm_dualop<float, 2, WrapF(_mm_add_ps), + THelperAdd<float>, float32x4_t, __m128>(); +} + void TSSEEmulTest::Test_mm_add_pd() { Test_mm_dualop<double, 2, WrapD(_mm_add_pd), THelperAdd<double>, float64x2_t, __m128d>(); @@ -904,44 +904,44 @@ void TSSEEmulTest::Test_mm_madd_epi16() { } -template <typename TElem> -struct THelperSub { - static TElem Call(const TElem op1, const TElem op2) { - return op1 - op2; - } -}; - -void TSSEEmulTest::Test_mm_sub_epi16() { - Test_mm_dualop<ui16, 8, Wrap(_mm_sub_epi16), THelperSub<ui16>, uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_sub_epi32() { - Test_mm_dualop<ui32, 4, Wrap(_mm_sub_epi32), THelperSub<ui32>, uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_sub_epi64() { - Test_mm_dualop<ui64, 2, Wrap(_mm_sub_epi64), THelperSub<ui64>, uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_sub_ps() { - Test_mm_dualop<float, 4, WrapF(_mm_sub_ps), THelperSub<float>, - float32x4_t, __m128>(); -} - +template <typename TElem> +struct THelperSub { + static TElem Call(const TElem op1, const TElem op2) { + return op1 - op2; + } +}; + +void TSSEEmulTest::Test_mm_sub_epi16() { + Test_mm_dualop<ui16, 8, Wrap(_mm_sub_epi16), THelperSub<ui16>, uint16x8_t>(); +} + +void TSSEEmulTest::Test_mm_sub_epi32() { + Test_mm_dualop<ui32, 4, Wrap(_mm_sub_epi32), THelperSub<ui32>, uint32x4_t>(); +} + +void TSSEEmulTest::Test_mm_sub_epi64() { + Test_mm_dualop<ui64, 2, Wrap(_mm_sub_epi64), THelperSub<ui64>, uint64x2_t>(); +} + +void TSSEEmulTest::Test_mm_sub_ps() { + Test_mm_dualop<float, 4, WrapF(_mm_sub_ps), THelperSub<float>, + float32x4_t, __m128>(); +} + void TSSEEmulTest::Test_mm_sub_pd() { Test_mm_dualop<double, 2, WrapD(_mm_sub_pd), THelperSub<double>, float64x2_t, __m128d>(); } -void TSSEEmulTest::Test_mm_mul_ps() { - struct THelper { - static float Call(const float op1, const float op2) { - return op1 * op2; - } - }; - Test_mm_dualop<float, 4, WrapF(_mm_mul_ps), THelper, float32x4_t, __m128>(); -} - +void TSSEEmulTest::Test_mm_mul_ps() { + struct THelper { + static float Call(const float op1, const float op2) { + return op1 * op2; + } + }; + Test_mm_dualop<float, 4, WrapF(_mm_mul_ps), THelper, float32x4_t, __m128>(); +} + void TSSEEmulTest::Test_mm_mul_pd() { struct THelper { static double Call(const double op1, const double op2) { @@ -951,15 +951,15 @@ void TSSEEmulTest::Test_mm_mul_pd() { Test_mm_dualop<double, 2, WrapD(_mm_mul_pd), THelper, float64x2_t, __m128d>(); } -void TSSEEmulTest::Test_mm_div_ps() { - struct THelper { - static float Call(const float op1, const float op2) { - return op1 / op2; - } - }; - Test_mm_dualop<float, 4, WrapF(_mm_div_ps), THelper, float32x4_t, __m128>(); -} - +void TSSEEmulTest::Test_mm_div_ps() { + struct THelper { + static float Call(const float op1, const float op2) { + return op1 / op2; + } + }; + Test_mm_dualop<float, 4, WrapF(_mm_div_ps), THelper, float32x4_t, __m128>(); +} + void TSSEEmulTest::Test_mm_div_pd() { struct THelper { static double Call(const double op1, const double op2) { @@ -969,441 +969,441 @@ void TSSEEmulTest::Test_mm_div_pd() { Test_mm_dualop<double, 2, WrapD(_mm_div_pd), THelper, float64x2_t, __m128d>(); } -void TSSEEmulTest::Test_mm_max_ps() { - struct THelper { - static float Call(const float op1, const float op2) { - return std::max(op1, op2); - } - }; - Test_mm_dualop<float, 4, WrapF(_mm_max_ps), THelper, float32x4_t, __m128>(); -} - -void TSSEEmulTest::Test_mm_min_ps() { - struct THelper { - static float Call(const float op1, const float op2) { - return std::min(op1, op2); - } - }; - Test_mm_dualop<float, 4, WrapF(_mm_min_ps), THelper, float32x4_t, __m128>(); -} - -void TSSEEmulTest::Test_mm_and_ps() { - struct THelper { - static float Call(const float op1, const float op2) { - union Cast { - unsigned int AsUInt; - float AsFloat; - }; - Cast v1, v2, result; - v1.AsFloat = op1; - v2.AsFloat = op2; - result.AsUInt = v1.AsUInt & v2.AsUInt; - return result.AsFloat; - } - }; - Test_mm_dualcmp<float, 4, WrapF(_mm_and_ps), - THelper, float32x4_t, __m128>(); -} - -template <typename TElem, int bits, int elemCount, int shift, - typename TFunc, typename TOp> -void TSSEEmulTest::Test_mm_unpack_epiXX() { - char data1[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - char data2[16] = { - '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', - '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; - TElem* dataw1 = reinterpret_cast<TElem*>(&data1); - TElem* dataw2 = reinterpret_cast<TElem*>(&data2); - - __m128i value1 = _mm_loadu_si128((__m128i*)&data1); - __m128i value2 = _mm_loadu_si128((__m128i*)&data2); - - TElem zippedData[elemCount]; - for (unsigned i = 0; i < elemCount / 2; ++i) { - zippedData[i * 2] = dataw1[i + shift]; - zippedData[i * 2 + 1] = dataw2[i + shift]; - } - __m128i result = TFunc(value1, value2); - - for (unsigned i = 0; i < elemCount / 2; ++i) { - UNIT_ASSERT_EQUAL(zippedData[i * 2], TQType<TOp>::As(result)[i * 2]); - UNIT_ASSERT_EQUAL(zippedData[i * 2 + 1], - TQType<TOp>::As(result)[i * 2 + 1]); - } -} - -void TSSEEmulTest::Test_mm_unpacklo_epi8() { - Test_mm_unpack_epiXX<ui8, 8, 16, 0, Wrap(_mm_unpacklo_epi8), uint8x16_t>(); -} - -void TSSEEmulTest::Test_mm_unpackhi_epi8() { - Test_mm_unpack_epiXX<ui8, 8, 16, 8, Wrap(_mm_unpackhi_epi8), uint8x16_t>(); -} - -void TSSEEmulTest::Test_mm_unpacklo_epi16() { - Test_mm_unpack_epiXX<ui16, 16, 8, 0, Wrap(_mm_unpacklo_epi16), uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_unpackhi_epi16() { - Test_mm_unpack_epiXX<ui16, 16, 8, 4, Wrap(_mm_unpackhi_epi16), uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_unpacklo_epi32() { - Test_mm_unpack_epiXX<ui32, 32, 4, 0, Wrap(_mm_unpacklo_epi32), uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_unpackhi_epi32() { - Test_mm_unpack_epiXX<ui32, 32, 4, 2, Wrap(_mm_unpackhi_epi32), uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_unpacklo_epi64() { - Test_mm_unpack_epiXX<ui64, 64, 2, 0, Wrap(_mm_unpacklo_epi64), uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_unpackhi_epi64() { - Test_mm_unpack_epiXX<ui64, 64, 2, 1, Wrap(_mm_unpackhi_epi64), uint64x2_t>(); -} - -template <typename TElem, unsigned elemCount, - typename TFunc, typename TElemFunc, +void TSSEEmulTest::Test_mm_max_ps() { + struct THelper { + static float Call(const float op1, const float op2) { + return std::max(op1, op2); + } + }; + Test_mm_dualop<float, 4, WrapF(_mm_max_ps), THelper, float32x4_t, __m128>(); +} + +void TSSEEmulTest::Test_mm_min_ps() { + struct THelper { + static float Call(const float op1, const float op2) { + return std::min(op1, op2); + } + }; + Test_mm_dualop<float, 4, WrapF(_mm_min_ps), THelper, float32x4_t, __m128>(); +} + +void TSSEEmulTest::Test_mm_and_ps() { + struct THelper { + static float Call(const float op1, const float op2) { + union Cast { + unsigned int AsUInt; + float AsFloat; + }; + Cast v1, v2, result; + v1.AsFloat = op1; + v2.AsFloat = op2; + result.AsUInt = v1.AsUInt & v2.AsUInt; + return result.AsFloat; + } + }; + Test_mm_dualcmp<float, 4, WrapF(_mm_and_ps), + THelper, float32x4_t, __m128>(); +} + +template <typename TElem, int bits, int elemCount, int shift, + typename TFunc, typename TOp> +void TSSEEmulTest::Test_mm_unpack_epiXX() { + char data1[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + char data2[16] = { + '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', + '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; + TElem* dataw1 = reinterpret_cast<TElem*>(&data1); + TElem* dataw2 = reinterpret_cast<TElem*>(&data2); + + __m128i value1 = _mm_loadu_si128((__m128i*)&data1); + __m128i value2 = _mm_loadu_si128((__m128i*)&data2); + + TElem zippedData[elemCount]; + for (unsigned i = 0; i < elemCount / 2; ++i) { + zippedData[i * 2] = dataw1[i + shift]; + zippedData[i * 2 + 1] = dataw2[i + shift]; + } + __m128i result = TFunc(value1, value2); + + for (unsigned i = 0; i < elemCount / 2; ++i) { + UNIT_ASSERT_EQUAL(zippedData[i * 2], TQType<TOp>::As(result)[i * 2]); + UNIT_ASSERT_EQUAL(zippedData[i * 2 + 1], + TQType<TOp>::As(result)[i * 2 + 1]); + } +} + +void TSSEEmulTest::Test_mm_unpacklo_epi8() { + Test_mm_unpack_epiXX<ui8, 8, 16, 0, Wrap(_mm_unpacklo_epi8), uint8x16_t>(); +} + +void TSSEEmulTest::Test_mm_unpackhi_epi8() { + Test_mm_unpack_epiXX<ui8, 8, 16, 8, Wrap(_mm_unpackhi_epi8), uint8x16_t>(); +} + +void TSSEEmulTest::Test_mm_unpacklo_epi16() { + Test_mm_unpack_epiXX<ui16, 16, 8, 0, Wrap(_mm_unpacklo_epi16), uint16x8_t>(); +} + +void TSSEEmulTest::Test_mm_unpackhi_epi16() { + Test_mm_unpack_epiXX<ui16, 16, 8, 4, Wrap(_mm_unpackhi_epi16), uint16x8_t>(); +} + +void TSSEEmulTest::Test_mm_unpacklo_epi32() { + Test_mm_unpack_epiXX<ui32, 32, 4, 0, Wrap(_mm_unpacklo_epi32), uint32x4_t>(); +} + +void TSSEEmulTest::Test_mm_unpackhi_epi32() { + Test_mm_unpack_epiXX<ui32, 32, 4, 2, Wrap(_mm_unpackhi_epi32), uint32x4_t>(); +} + +void TSSEEmulTest::Test_mm_unpacklo_epi64() { + Test_mm_unpack_epiXX<ui64, 64, 2, 0, Wrap(_mm_unpacklo_epi64), uint64x2_t>(); +} + +void TSSEEmulTest::Test_mm_unpackhi_epi64() { + Test_mm_unpack_epiXX<ui64, 64, 2, 1, Wrap(_mm_unpackhi_epi64), uint64x2_t>(); +} + +template <typename TElem, unsigned elemCount, + typename TFunc, typename TElemFunc, typename TOp, typename TVectorType> -void TSSEEmulTest::Test_mm_dualop() { - char data1[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - char data2[16] = { - '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', - '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; - TElem* dataw1 = reinterpret_cast<TElem*>(&data1); - TElem* dataw2 = reinterpret_cast<TElem*>(&data2); - +void TSSEEmulTest::Test_mm_dualop() { + char data1[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + char data2[16] = { + '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', + '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; + TElem* dataw1 = reinterpret_cast<TElem*>(&data1); + TElem* dataw2 = reinterpret_cast<TElem*>(&data2); + TVectorType value1 = TFuncLoad<TVectorType>(&data1); TVectorType value2 = TFuncLoad<TVectorType>(&data2); - - TElem procData[elemCount]; - for (unsigned i = 0; i < elemCount; ++i) { - procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]); - } + + TElem procData[elemCount]; + for (unsigned i = 0; i < elemCount; ++i) { + procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]); + } TVectorType result = TFunc(value1, value2); - - for (unsigned i = 0; i < elemCount; ++i) { - UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]); - } -} - -/* This is almost the same as Test_mm_dualop, - but different data1 and data2 */ -template <typename TElem, unsigned elemCount, - typename TFunc, typename TElemFunc, + + for (unsigned i = 0; i < elemCount; ++i) { + UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]); + } +} + +/* This is almost the same as Test_mm_dualop, + but different data1 and data2 */ +template <typename TElem, unsigned elemCount, + typename TFunc, typename TElemFunc, typename TOp, typename TVectorType> -void TSSEEmulTest::Test_mm_dualcmp() { - char data1[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x66', '\x77', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C'}; - char data2[16] = { - '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', - '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; - TElem* dataw1 = reinterpret_cast<TElem*>(&data1); - TElem* dataw2 = reinterpret_cast<TElem*>(&data2); - +void TSSEEmulTest::Test_mm_dualcmp() { + char data1[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x66', '\x77', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C'}; + char data2[16] = { + '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', + '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; + TElem* dataw1 = reinterpret_cast<TElem*>(&data1); + TElem* dataw2 = reinterpret_cast<TElem*>(&data2); + TVectorType value1 = TFuncLoad<TVectorType>(&data1); TVectorType value2 = TFuncLoad<TVectorType>(&data2); - - TElem procData[elemCount]; - for (unsigned i = 0; i < elemCount; ++i) { - procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]); - } + + TElem procData[elemCount]; + for (unsigned i = 0; i < elemCount; ++i) { + procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]); + } TVectorType result = TFunc(value1, value2); - - for (unsigned i = 0; i < elemCount; ++i) { - /* memcmp is for compare to invalid floats in results */ + + for (unsigned i = 0; i < elemCount; ++i) { + /* memcmp is for compare to invalid floats in results */ const TElem value = TQType<TOp>::As(result)[i]; UNIT_ASSERT(memcmp(&(procData[i]), &value, sizeof(TElem)) == 0); - } -} - -void TSSEEmulTest::Test_mm_or_si128() { - struct THelper { - static ui64 Call(const ui64 op1, const ui64 op2) { - return op1 | op2; - } - }; - - Test_mm_dualop<ui64, 2, Wrap(_mm_or_si128), THelper, uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_and_si128() { - struct THelper { - static ui64 Call(const ui64 op1, const ui64 op2) { - return op1 & op2; - } - }; - - Test_mm_dualop<ui64, 2, Wrap(_mm_and_si128), THelper, uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_andnot_si128() { - struct THelper { - static ui64 Call(const ui64 op1, const ui64 op2) { - return (~op1) & op2; - } - }; - - Test_mm_dualop<ui64, 2, Wrap(_mm_andnot_si128), THelper, uint64x2_t>(); -} - -template <typename TElem> -struct THelperCMPEQ { - static TElem Call(const TElem op1, const TElem op2) { - return op1 == op2 ? ~TElem(0) : TElem(0); - } -}; - -void TSSEEmulTest::Test_mm_cmpeq_epi8() { - Test_mm_dualcmp<ui8, 16, Wrap(_mm_cmpeq_epi8), - THelperCMPEQ<ui8>, uint8x16_t>(); -} - -void TSSEEmulTest::Test_mm_cmpeq_epi16() { - Test_mm_dualcmp<ui16, 8, Wrap(_mm_cmpeq_epi16), - THelperCMPEQ<ui16>, uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_cmpeq_epi32() { - Test_mm_dualcmp<ui32, 4, Wrap(_mm_cmpeq_epi32), - THelperCMPEQ<ui32>, uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_cmpeq_ps() { - struct THelperFloat { - static float Call(const float op1, const float op2) { - union Cast { - unsigned int AsUInt; - float AsFloat; - }; - Cast value; - value.AsUInt = op1 == op2 ? 0xFFFFFFFF : 0; - return value.AsFloat; - } - }; - - Test_mm_dualcmp<float, 4, WrapF(_mm_cmpeq_ps), - THelperFloat, float32x4_t, __m128>(); -} - -template <typename TElem> -struct THelperCMPGT { - static TElem Call(const TElem op1, const TElem op2) { - return op1 > op2 ? ~TElem(0) : TElem(0); - } -}; - -void TSSEEmulTest::Test_mm_cmpgt_epi8() { - Test_mm_dualcmp<i8, 16, Wrap(_mm_cmpgt_epi8), - THelperCMPGT<i8>, int8x16_t>(); -} - -void TSSEEmulTest::Test_mm_cmpgt_epi16() { - Test_mm_dualcmp<i16, 8, Wrap(_mm_cmpgt_epi16), - THelperCMPGT<i16>, int16x8_t>(); -} - -void TSSEEmulTest::Test_mm_cmpgt_epi32() { - Test_mm_dualcmp<i32, 4, Wrap(_mm_cmpgt_epi32), - THelperCMPGT<i32>, int32x4_t>(); -} - -void TSSEEmulTest::Test_mm_cmpgt_ps() { - struct THelperFloat { - static float Call(const float op1, const float op2) { - union Cast { - unsigned int AsUInt; - float AsFloat; - }; - Cast value; - value.AsUInt = op1 > op2 ? 0xFFFFFFFF : 0; - return value.AsFloat; - } - }; - - Test_mm_dualcmp<float, 4, WrapF(_mm_cmpgt_ps), - THelperFloat, float32x4_t, __m128>(); -} - -template <typename TElem> -struct THelperCMPLT { - static TElem Call(const TElem op1, const TElem op2) { - return op1 < op2 ? ~TElem(0) : TElem(0); - } -}; - -void TSSEEmulTest::Test_mm_cmplt_epi8() { - Test_mm_dualcmp<i8, 16, Wrap(_mm_cmplt_epi8), - THelperCMPLT<i8>, int8x16_t>(); -} - -void TSSEEmulTest::Test_mm_cmplt_epi16() { - Test_mm_dualcmp<i16, 8, Wrap(_mm_cmplt_epi16), - THelperCMPLT<i16>, int16x8_t>(); -} - -void TSSEEmulTest::Test_mm_cmplt_epi32() { - Test_mm_dualcmp<i32, 4, Wrap(_mm_cmplt_epi32), - THelperCMPLT<i32>, int32x4_t>(); -} - -template <typename TElem, int elemCount, + } +} + +void TSSEEmulTest::Test_mm_or_si128() { + struct THelper { + static ui64 Call(const ui64 op1, const ui64 op2) { + return op1 | op2; + } + }; + + Test_mm_dualop<ui64, 2, Wrap(_mm_or_si128), THelper, uint64x2_t>(); +} + +void TSSEEmulTest::Test_mm_and_si128() { + struct THelper { + static ui64 Call(const ui64 op1, const ui64 op2) { + return op1 & op2; + } + }; + + Test_mm_dualop<ui64, 2, Wrap(_mm_and_si128), THelper, uint64x2_t>(); +} + +void TSSEEmulTest::Test_mm_andnot_si128() { + struct THelper { + static ui64 Call(const ui64 op1, const ui64 op2) { + return (~op1) & op2; + } + }; + + Test_mm_dualop<ui64, 2, Wrap(_mm_andnot_si128), THelper, uint64x2_t>(); +} + +template <typename TElem> +struct THelperCMPEQ { + static TElem Call(const TElem op1, const TElem op2) { + return op1 == op2 ? ~TElem(0) : TElem(0); + } +}; + +void TSSEEmulTest::Test_mm_cmpeq_epi8() { + Test_mm_dualcmp<ui8, 16, Wrap(_mm_cmpeq_epi8), + THelperCMPEQ<ui8>, uint8x16_t>(); +} + +void TSSEEmulTest::Test_mm_cmpeq_epi16() { + Test_mm_dualcmp<ui16, 8, Wrap(_mm_cmpeq_epi16), + THelperCMPEQ<ui16>, uint16x8_t>(); +} + +void TSSEEmulTest::Test_mm_cmpeq_epi32() { + Test_mm_dualcmp<ui32, 4, Wrap(_mm_cmpeq_epi32), + THelperCMPEQ<ui32>, uint32x4_t>(); +} + +void TSSEEmulTest::Test_mm_cmpeq_ps() { + struct THelperFloat { + static float Call(const float op1, const float op2) { + union Cast { + unsigned int AsUInt; + float AsFloat; + }; + Cast value; + value.AsUInt = op1 == op2 ? 0xFFFFFFFF : 0; + return value.AsFloat; + } + }; + + Test_mm_dualcmp<float, 4, WrapF(_mm_cmpeq_ps), + THelperFloat, float32x4_t, __m128>(); +} + +template <typename TElem> +struct THelperCMPGT { + static TElem Call(const TElem op1, const TElem op2) { + return op1 > op2 ? ~TElem(0) : TElem(0); + } +}; + +void TSSEEmulTest::Test_mm_cmpgt_epi8() { + Test_mm_dualcmp<i8, 16, Wrap(_mm_cmpgt_epi8), + THelperCMPGT<i8>, int8x16_t>(); +} + +void TSSEEmulTest::Test_mm_cmpgt_epi16() { + Test_mm_dualcmp<i16, 8, Wrap(_mm_cmpgt_epi16), + THelperCMPGT<i16>, int16x8_t>(); +} + +void TSSEEmulTest::Test_mm_cmpgt_epi32() { + Test_mm_dualcmp<i32, 4, Wrap(_mm_cmpgt_epi32), + THelperCMPGT<i32>, int32x4_t>(); +} + +void TSSEEmulTest::Test_mm_cmpgt_ps() { + struct THelperFloat { + static float Call(const float op1, const float op2) { + union Cast { + unsigned int AsUInt; + float AsFloat; + }; + Cast value; + value.AsUInt = op1 > op2 ? 0xFFFFFFFF : 0; + return value.AsFloat; + } + }; + + Test_mm_dualcmp<float, 4, WrapF(_mm_cmpgt_ps), + THelperFloat, float32x4_t, __m128>(); +} + +template <typename TElem> +struct THelperCMPLT { + static TElem Call(const TElem op1, const TElem op2) { + return op1 < op2 ? ~TElem(0) : TElem(0); + } +}; + +void TSSEEmulTest::Test_mm_cmplt_epi8() { + Test_mm_dualcmp<i8, 16, Wrap(_mm_cmplt_epi8), + THelperCMPLT<i8>, int8x16_t>(); +} + +void TSSEEmulTest::Test_mm_cmplt_epi16() { + Test_mm_dualcmp<i16, 8, Wrap(_mm_cmplt_epi16), + THelperCMPLT<i16>, int16x8_t>(); +} + +void TSSEEmulTest::Test_mm_cmplt_epi32() { + Test_mm_dualcmp<i32, 4, Wrap(_mm_cmplt_epi32), + THelperCMPLT<i32>, int32x4_t>(); +} + +template <typename TElem, int elemCount, typename TFunc, typename TOp, typename TVectorType> -void TSSEEmulTest::Test_mm_setter_epiXX() { - char data[64] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', - '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', - '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', - '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', - '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', - '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; - TElem* dataw = reinterpret_cast<TElem*>(&data); - - for (unsigned dataItem = 0; dataItem < elemCount * 4; ++dataItem) { +void TSSEEmulTest::Test_mm_setter_epiXX() { + char data[64] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', + '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', + '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', + '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', + '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', + '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; + TElem* dataw = reinterpret_cast<TElem*>(&data); + + for (unsigned dataItem = 0; dataItem < elemCount * 4; ++dataItem) { TVectorType value = TFunc(dataw[dataItem]); - - for (unsigned i = 0; i < elemCount; ++i) - UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<TOp>::As(value)[i]); - } -} - -void TSSEEmulTest::Test_mm_set1_epi8() { - Test_mm_setter_epiXX<i8, 16, Wrap(_mm_set1_epi8), int8x16_t, __m128i>(); -} -void TSSEEmulTest::Test_mm_set1_epi16() { - Test_mm_setter_epiXX<i16, 8, Wrap(_mm_set1_epi16), int16x8_t, __m128i>(); -} -void TSSEEmulTest::Test_mm_set1_epi32() { - Test_mm_setter_epiXX<i32, 4, Wrap(_mm_set1_epi32), int32x4_t, __m128i>(); -} -void TSSEEmulTest::Test_mm_set1_ps() { - Test_mm_setter_epiXX<float, 4, WrapF(_mm_set1_ps), float32x4_t, __m128>(); -} - + + for (unsigned i = 0; i < elemCount; ++i) + UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<TOp>::As(value)[i]); + } +} + +void TSSEEmulTest::Test_mm_set1_epi8() { + Test_mm_setter_epiXX<i8, 16, Wrap(_mm_set1_epi8), int8x16_t, __m128i>(); +} +void TSSEEmulTest::Test_mm_set1_epi16() { + Test_mm_setter_epiXX<i16, 8, Wrap(_mm_set1_epi16), int16x8_t, __m128i>(); +} +void TSSEEmulTest::Test_mm_set1_epi32() { + Test_mm_setter_epiXX<i32, 4, Wrap(_mm_set1_epi32), int32x4_t, __m128i>(); +} +void TSSEEmulTest::Test_mm_set1_ps() { + Test_mm_setter_epiXX<float, 4, WrapF(_mm_set1_ps), float32x4_t, __m128>(); +} + void TSSEEmulTest::Test_mm_set_ps1() { Test_mm_setter_epiXX<float, 4, WrapF(_mm_set_ps1), float32x4_t, __m128>(); } -void TSSEEmulTest::Test_mm_setzero_si128() { - __m128i value = _mm_setzero_si128(); - for (unsigned i = 0; i < 4; ++i) - UNIT_ASSERT_EQUAL(0, TQType<uint32x4_t>::As(value)[i]); -} - -void TSSEEmulTest::Test_mm_setzero_ps() { - __m128 value = _mm_setzero_ps(); - for (unsigned i = 0; i < 4; ++i) - UNIT_ASSERT_EQUAL(0.0, TQType<float32x4_t>::As(value)[i]); -} - +void TSSEEmulTest::Test_mm_setzero_si128() { + __m128i value = _mm_setzero_si128(); + for (unsigned i = 0; i < 4; ++i) + UNIT_ASSERT_EQUAL(0, TQType<uint32x4_t>::As(value)[i]); +} + +void TSSEEmulTest::Test_mm_setzero_ps() { + __m128 value = _mm_setzero_ps(); + for (unsigned i = 0; i < 4; ++i) + UNIT_ASSERT_EQUAL(0.0, TQType<float32x4_t>::As(value)[i]); +} + void TSSEEmulTest::Test_mm_setzero_pd() { __m128d value = _mm_setzero_pd(); for (unsigned i = 0; i < 2; ++i) UNIT_ASSERT_EQUAL(0.0, TQType<float64x2_t>::As(value)[i]); } -void TSSEEmulTest::Test_mm_loadl_epi64() { - char data[64] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', - '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', - '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', - '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', - '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', - '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; - ui64* dataw = reinterpret_cast<ui64*>(&data); - - for (unsigned dataItem = 0; dataItem < 8; ++dataItem) { - __m128i value = _mm_loadl_epi64((__m128i const*)&dataw[dataItem]); - - UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<uint64x2_t>::As(value)[0]); - UNIT_ASSERT_EQUAL(0, TQType<uint64x2_t>::As(value)[1]); - } -} - -void TSSEEmulTest::Test_mm_storel_epi64() { - char data[64] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', - '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', - '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', - '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', - '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', - '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; - ui64* dataw = reinterpret_cast<ui64*>(&data); - - for (unsigned dataItem = 0; dataItem < 4; ++dataItem) { - __m128i value = _mm_loadu_si128((__m128i*)&dataw[dataItem * 2]); - - ui64 buf[2] = {55, 81}; - _mm_storel_epi64((__m128i*)&buf, value); - - UNIT_ASSERT_EQUAL(dataw[dataItem * 2], buf[0]); - UNIT_ASSERT_EQUAL(81, buf[1]); - } -} - -void TSSEEmulTest::Test_mm_shuffle_epi32() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - ui32* dataw = reinterpret_cast<ui32*>(&data); - __m128i value = _mm_loadu_si128((__m128i*)&data); - - int coding[4] = {1, 3, 0, 2}; - __m128i result = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 0, 3, 1)); - - for (unsigned i = 0; i < 4; ++i) - UNIT_ASSERT_EQUAL(dataw[coding[i]], - TQType<uint32x4_t>::As(result)[i]); -} - -static int GetHighBitAt(char data, int at) { - ui8 udata = data & 0x80; - return int(udata >> 7) << at; -} - -void TSSEEmulTest::Test_mm_movemask_epi8() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - __m128i value = _mm_loadu_si128((__m128i*)&data); - - int result = _mm_movemask_epi8(value); - int verify = 0; - for (unsigned i = 0; i < 16; ++i) { - verify |= GetHighBitAt(data[i], i); - } - - UNIT_ASSERT_EQUAL(result, verify); -} - -void TSSEEmulTest::Test_mm_movemask_ps() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - __m128 value = _mm_loadu_ps((float*)&data); - - int result = _mm_movemask_ps(value); - int verify = 0; - for (unsigned i = 0; i < 4; ++i) { - verify |= GetHighBitAt(data[i * 4 + 3], i); - } - - UNIT_ASSERT_EQUAL(result, verify); -} - +void TSSEEmulTest::Test_mm_loadl_epi64() { + char data[64] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', + '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', + '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', + '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', + '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', + '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; + ui64* dataw = reinterpret_cast<ui64*>(&data); + + for (unsigned dataItem = 0; dataItem < 8; ++dataItem) { + __m128i value = _mm_loadl_epi64((__m128i const*)&dataw[dataItem]); + + UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<uint64x2_t>::As(value)[0]); + UNIT_ASSERT_EQUAL(0, TQType<uint64x2_t>::As(value)[1]); + } +} + +void TSSEEmulTest::Test_mm_storel_epi64() { + char data[64] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', + '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', + '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', + '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', + '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', + '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; + ui64* dataw = reinterpret_cast<ui64*>(&data); + + for (unsigned dataItem = 0; dataItem < 4; ++dataItem) { + __m128i value = _mm_loadu_si128((__m128i*)&dataw[dataItem * 2]); + + ui64 buf[2] = {55, 81}; + _mm_storel_epi64((__m128i*)&buf, value); + + UNIT_ASSERT_EQUAL(dataw[dataItem * 2], buf[0]); + UNIT_ASSERT_EQUAL(81, buf[1]); + } +} + +void TSSEEmulTest::Test_mm_shuffle_epi32() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + ui32* dataw = reinterpret_cast<ui32*>(&data); + __m128i value = _mm_loadu_si128((__m128i*)&data); + + int coding[4] = {1, 3, 0, 2}; + __m128i result = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 0, 3, 1)); + + for (unsigned i = 0; i < 4; ++i) + UNIT_ASSERT_EQUAL(dataw[coding[i]], + TQType<uint32x4_t>::As(result)[i]); +} + +static int GetHighBitAt(char data, int at) { + ui8 udata = data & 0x80; + return int(udata >> 7) << at; +} + +void TSSEEmulTest::Test_mm_movemask_epi8() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + __m128i value = _mm_loadu_si128((__m128i*)&data); + + int result = _mm_movemask_epi8(value); + int verify = 0; + for (unsigned i = 0; i < 16; ++i) { + verify |= GetHighBitAt(data[i], i); + } + + UNIT_ASSERT_EQUAL(result, verify); +} + +void TSSEEmulTest::Test_mm_movemask_ps() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + __m128 value = _mm_loadu_ps((float*)&data); + + int result = _mm_movemask_ps(value); + int verify = 0; + for (unsigned i = 0; i < 4; ++i) { + verify |= GetHighBitAt(data[i * 4 + 3], i); + } + + UNIT_ASSERT_EQUAL(result, verify); +} + void TSSEEmulTest::Test_mm_movemask_ps_2() { char data[16] = { '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', @@ -1414,19 +1414,19 @@ void TSSEEmulTest::Test_mm_movemask_ps_2() { UNIT_ASSERT_EQUAL(result, 0xf); } -void TSSEEmulTest::Test_mm_cvtsi128_si32() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - __m128i value = _mm_loadu_si128((__m128i*)&data); - - int result = _mm_cvtsi128_si32(value); - i32* datap = reinterpret_cast<i32*>(&data); - int verify = datap[0]; - - UNIT_ASSERT_EQUAL(result, verify); -} - +void TSSEEmulTest::Test_mm_cvtsi128_si32() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + __m128i value = _mm_loadu_si128((__m128i*)&data); + + int result = _mm_cvtsi128_si32(value); + i32* datap = reinterpret_cast<i32*>(&data); + int verify = datap[0]; + + UNIT_ASSERT_EQUAL(result, verify); +} + void TSSEEmulTest::Test_mm_cvtsi128_si64() { char data[16] = { '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1440,52 +1440,52 @@ void TSSEEmulTest::Test_mm_cvtsi128_si64() { UNIT_ASSERT_EQUAL(result, verify); } -void TSSEEmulTest::Test_mm_set_epi16() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - i16* dataw = reinterpret_cast<i16*>(&data); - ui64* dataq = reinterpret_cast<ui64*>(&data); - - __m128i result = _mm_set_epi16(dataw[7], dataw[6], dataw[5], dataw[4], - dataw[3], dataw[2], dataw[1], dataw[0]); - ui64 buf[2] = {53, 81}; - _mm_storeu_si128((__m128i*)&buf, result); - - UNIT_ASSERT_EQUAL(buf[0], dataq[0]); - UNIT_ASSERT_EQUAL(buf[1], dataq[1]); -} - -void TSSEEmulTest::Test_mm_set_epi32() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - i32* dataw = reinterpret_cast<i32*>(&data); - ui64* dataq = reinterpret_cast<ui64*>(&data); - - __m128i result = _mm_set_epi32(dataw[3], dataw[2], dataw[1], dataw[0]); - ui64 buf[2] = {53, 81}; - _mm_storeu_si128((__m128i*)&buf, result); - - UNIT_ASSERT_EQUAL(buf[0], dataq[0]); - UNIT_ASSERT_EQUAL(buf[1], dataq[1]); -} - -void TSSEEmulTest::Test_mm_set_ps() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - float* dataw = reinterpret_cast<float*>(&data); - ui64* dataq = reinterpret_cast<ui64*>(&data); - - __m128 result = _mm_set_ps(dataw[3], dataw[2], dataw[1], dataw[0]); - ui64 buf[2] = {53, 81}; - _mm_storeu_ps((float*)&buf, result); - - UNIT_ASSERT_EQUAL(buf[0], dataq[0]); - UNIT_ASSERT_EQUAL(buf[1], dataq[1]); -} - +void TSSEEmulTest::Test_mm_set_epi16() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + i16* dataw = reinterpret_cast<i16*>(&data); + ui64* dataq = reinterpret_cast<ui64*>(&data); + + __m128i result = _mm_set_epi16(dataw[7], dataw[6], dataw[5], dataw[4], + dataw[3], dataw[2], dataw[1], dataw[0]); + ui64 buf[2] = {53, 81}; + _mm_storeu_si128((__m128i*)&buf, result); + + UNIT_ASSERT_EQUAL(buf[0], dataq[0]); + UNIT_ASSERT_EQUAL(buf[1], dataq[1]); +} + +void TSSEEmulTest::Test_mm_set_epi32() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + i32* dataw = reinterpret_cast<i32*>(&data); + ui64* dataq = reinterpret_cast<ui64*>(&data); + + __m128i result = _mm_set_epi32(dataw[3], dataw[2], dataw[1], dataw[0]); + ui64 buf[2] = {53, 81}; + _mm_storeu_si128((__m128i*)&buf, result); + + UNIT_ASSERT_EQUAL(buf[0], dataq[0]); + UNIT_ASSERT_EQUAL(buf[1], dataq[1]); +} + +void TSSEEmulTest::Test_mm_set_ps() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + float* dataw = reinterpret_cast<float*>(&data); + ui64* dataq = reinterpret_cast<ui64*>(&data); + + __m128 result = _mm_set_ps(dataw[3], dataw[2], dataw[1], dataw[0]); + ui64 buf[2] = {53, 81}; + _mm_storeu_ps((float*)&buf, result); + + UNIT_ASSERT_EQUAL(buf[0], dataq[0]); + UNIT_ASSERT_EQUAL(buf[1], dataq[1]); +} + void TSSEEmulTest::Test_mm_set_pd() { char data[16] = { '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1501,22 +1501,22 @@ void TSSEEmulTest::Test_mm_set_pd() { UNIT_ASSERT_EQUAL(buf[1], dataq[1]); } -void TSSEEmulTest::Test_mm_cvtsi32_si128() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - i32* dataw = reinterpret_cast<i32*>(&data); - - __m128i result = _mm_cvtsi32_si128(dataw[0]); - i32 buf[4] = {53, 81, -43, 2132}; - _mm_storeu_si128((__m128i*)&buf, result); - - UNIT_ASSERT_EQUAL(buf[0], dataw[0]); - UNIT_ASSERT_EQUAL(buf[1], 0); - UNIT_ASSERT_EQUAL(buf[2], 0); - UNIT_ASSERT_EQUAL(buf[3], 0); -} - +void TSSEEmulTest::Test_mm_cvtsi32_si128() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + i32* dataw = reinterpret_cast<i32*>(&data); + + __m128i result = _mm_cvtsi32_si128(dataw[0]); + i32 buf[4] = {53, 81, -43, 2132}; + _mm_storeu_si128((__m128i*)&buf, result); + + UNIT_ASSERT_EQUAL(buf[0], dataw[0]); + UNIT_ASSERT_EQUAL(buf[1], 0); + UNIT_ASSERT_EQUAL(buf[2], 0); + UNIT_ASSERT_EQUAL(buf[3], 0); +} + void TSSEEmulTest::Test_mm_cvtsi64_si128() { char data[16] = { '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1531,44 +1531,44 @@ void TSSEEmulTest::Test_mm_cvtsi64_si128() { UNIT_ASSERT_EQUAL(buf[1], 0); } -template <typename TElem, typename TNarrow, unsigned elemCount, typename TFunc> -void TSSEEmulTest::Test_mm_packs_epiXX() { - char data[32] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x00', '\x66', '\x1C', - '\x99', '\x33', '\x1C', '\x55', '\x00', '\x00', '\x00', '\x00', - '\x00', '\xAA', '\x00', '\x00', '\xCC', '\xBB', '\x22', '\xFF'}; - __m128i value0 = _mm_loadu_si128((__m128i*)&data); - __m128i value1 = _mm_loadu_si128(((__m128i*)&data) + 1); - TElem* dataw = reinterpret_cast<TElem*>(&data); - - __m128i result = TFunc(value0, value1); - - TNarrow verify[elemCount]; - for (unsigned i = 0; i < elemCount; ++i) { - TElem sum = dataw[i]; - if (sum > std::numeric_limits<TNarrow>::max()) - sum = std::numeric_limits<TNarrow>::max(); - if (sum < std::numeric_limits<TNarrow>::min()) - sum = std::numeric_limits<TNarrow>::min(); - verify[i] = TNarrow(sum); - } - - ui64* verifyp = (ui64*)&verify; - UNIT_ASSERT_EQUAL(verifyp[0], TQType<uint64x2_t>::As(result)[0]); - UNIT_ASSERT_EQUAL(verifyp[1], TQType<uint64x2_t>::As(result)[1]); -} - -void TSSEEmulTest::Test_mm_packs_epi16() { - Test_mm_packs_epiXX<i16, i8, 16, Wrap(_mm_packs_epi16)>(); -} -void TSSEEmulTest::Test_mm_packs_epi32() { - Test_mm_packs_epiXX<i32, i16, 8, Wrap(_mm_packs_epi32)>(); -} -void TSSEEmulTest::Test_mm_packus_epi16() { - Test_mm_packs_epiXX<i16, ui8, 16, Wrap(_mm_packus_epi16)>(); -} - +template <typename TElem, typename TNarrow, unsigned elemCount, typename TFunc> +void TSSEEmulTest::Test_mm_packs_epiXX() { + char data[32] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x00', '\x66', '\x1C', + '\x99', '\x33', '\x1C', '\x55', '\x00', '\x00', '\x00', '\x00', + '\x00', '\xAA', '\x00', '\x00', '\xCC', '\xBB', '\x22', '\xFF'}; + __m128i value0 = _mm_loadu_si128((__m128i*)&data); + __m128i value1 = _mm_loadu_si128(((__m128i*)&data) + 1); + TElem* dataw = reinterpret_cast<TElem*>(&data); + + __m128i result = TFunc(value0, value1); + + TNarrow verify[elemCount]; + for (unsigned i = 0; i < elemCount; ++i) { + TElem sum = dataw[i]; + if (sum > std::numeric_limits<TNarrow>::max()) + sum = std::numeric_limits<TNarrow>::max(); + if (sum < std::numeric_limits<TNarrow>::min()) + sum = std::numeric_limits<TNarrow>::min(); + verify[i] = TNarrow(sum); + } + + ui64* verifyp = (ui64*)&verify; + UNIT_ASSERT_EQUAL(verifyp[0], TQType<uint64x2_t>::As(result)[0]); + UNIT_ASSERT_EQUAL(verifyp[1], TQType<uint64x2_t>::As(result)[1]); +} + +void TSSEEmulTest::Test_mm_packs_epi16() { + Test_mm_packs_epiXX<i16, i8, 16, Wrap(_mm_packs_epi16)>(); +} +void TSSEEmulTest::Test_mm_packs_epi32() { + Test_mm_packs_epiXX<i32, i16, 8, Wrap(_mm_packs_epi32)>(); +} +void TSSEEmulTest::Test_mm_packus_epi16() { + Test_mm_packs_epiXX<i16, ui8, 16, Wrap(_mm_packus_epi16)>(); +} + void TSSEEmulTest::Test_mm_extract_epi8() { alignas(16) char data[16] = { '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1594,23 +1594,23 @@ void TSSEEmulTest::Test_mm_extract_epi8() { UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 15)), int(dataw[15])); } -void TSSEEmulTest::Test_mm_extract_epi16() { +void TSSEEmulTest::Test_mm_extract_epi16() { alignas(16) char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; const ui16* dataw = reinterpret_cast<const ui16*>(&data); const __m128i value = _mm_loadu_si128((__m128i*)&data); - - UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0])); - UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 1)), int(dataw[1])); - UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 2)), int(dataw[2])); - UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 3)), int(dataw[3])); - UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 4)), int(dataw[4])); - UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 5)), int(dataw[5])); - UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 6)), int(dataw[6])); - UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 7)), int(dataw[7])); -} - + + UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0])); + UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 1)), int(dataw[1])); + UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 2)), int(dataw[2])); + UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 3)), int(dataw[3])); + UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 4)), int(dataw[4])); + UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 5)), int(dataw[5])); + UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 6)), int(dataw[6])); + UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 7)), int(dataw[7])); +} + void TSSEEmulTest::Test_mm_extract_epi64() { alignas(16) char data[16] = { '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1635,160 +1635,160 @@ void TSSEEmulTest::Test_mm_extract_epi32() { UNIT_ASSERT_EQUAL((_mm_extract_epi32(value, 3)), int(dataw[3])); } -void TSSEEmulTest::Test_MM_TRANSPOSE4_PS() { - char data0[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - char data1[16] = { - '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', - '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; - char data2[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - char data3[16] = { - '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', - '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; - - __m128 value0 = _mm_loadu_ps((float*)&data0); - __m128 value1 = _mm_loadu_ps((float*)&data1); - __m128 value2 = _mm_loadu_ps((float*)&data2); - __m128 value3 = _mm_loadu_ps((float*)&data3); - - _MM_TRANSPOSE4_PS(value0, value1, value2, value3); - - ui64 tbuf0[2] = {0, 0}; - ui64 tbuf1[2] = {0, 0}; - ui64 tbuf2[2] = {0, 0}; - ui64 tbuf3[2] = {0, 0}; - - _mm_storeu_ps((float*)&tbuf0, value0); - _mm_storeu_ps((float*)&tbuf1, value1); - _mm_storeu_ps((float*)&tbuf2, value2); - _mm_storeu_ps((float*)&tbuf3, value3); - - char tdata0[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55', - '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55'}; - char tdata1[16] = { - '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44', - '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44'}; - char tdata2[16] = { - '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11', - '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11'}; - char tdata3[16] = { - '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF', - '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF'}; - - UNIT_ASSERT(memcmp(tbuf0, tdata0, 16) == 0); - UNIT_ASSERT(memcmp(tbuf1, tdata1, 16) == 0); - UNIT_ASSERT(memcmp(tbuf2, tdata2, 16) == 0); - UNIT_ASSERT(memcmp(tbuf3, tdata3, 16) == 0); -} - -template <typename TFrom, typename TTo, unsigned elemCount, - typename TLoadVector, typename TResultVector, - typename TElemFunc, typename TFunc, typename TOp> -void TSSEEmulTest::Test_mm_convertop() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - TFrom* datap = reinterpret_cast<TFrom*>(&data); - - TLoadVector value = TFuncLoad<TLoadVector>(&data); - - TTo procData[elemCount]; - for (unsigned i = 0; i < elemCount; ++i) { - procData[i] = TElemFunc::Call(datap[i]); - } - - TResultVector result = TFunc(value); - - for (unsigned i = 0; i < elemCount; ++i) { - UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]); - } -} - -void TSSEEmulTest::Test_mm_cvtepi32_ps() { - struct THelper { - static float Call(const i32 op) { - return float(op); - } - }; - Test_mm_convertop<i32, float, 4, __m128i, __m128, - THelper, WrapF(_mm_cvtepi32_ps), float32x4_t>(); -}; - -void TSSEEmulTest::Test_mm_cvtps_epi32() { - struct THelper { - static i32 Call(const float op) { - return i32(op); - } - }; - Test_mm_convertop<float, i32, 4, __m128, __m128i, +void TSSEEmulTest::Test_MM_TRANSPOSE4_PS() { + char data0[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + char data1[16] = { + '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', + '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; + char data2[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + char data3[16] = { + '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', + '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; + + __m128 value0 = _mm_loadu_ps((float*)&data0); + __m128 value1 = _mm_loadu_ps((float*)&data1); + __m128 value2 = _mm_loadu_ps((float*)&data2); + __m128 value3 = _mm_loadu_ps((float*)&data3); + + _MM_TRANSPOSE4_PS(value0, value1, value2, value3); + + ui64 tbuf0[2] = {0, 0}; + ui64 tbuf1[2] = {0, 0}; + ui64 tbuf2[2] = {0, 0}; + ui64 tbuf3[2] = {0, 0}; + + _mm_storeu_ps((float*)&tbuf0, value0); + _mm_storeu_ps((float*)&tbuf1, value1); + _mm_storeu_ps((float*)&tbuf2, value2); + _mm_storeu_ps((float*)&tbuf3, value3); + + char tdata0[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55', + '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55'}; + char tdata1[16] = { + '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44', + '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44'}; + char tdata2[16] = { + '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11', + '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11'}; + char tdata3[16] = { + '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF', + '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF'}; + + UNIT_ASSERT(memcmp(tbuf0, tdata0, 16) == 0); + UNIT_ASSERT(memcmp(tbuf1, tdata1, 16) == 0); + UNIT_ASSERT(memcmp(tbuf2, tdata2, 16) == 0); + UNIT_ASSERT(memcmp(tbuf3, tdata3, 16) == 0); +} + +template <typename TFrom, typename TTo, unsigned elemCount, + typename TLoadVector, typename TResultVector, + typename TElemFunc, typename TFunc, typename TOp> +void TSSEEmulTest::Test_mm_convertop() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + TFrom* datap = reinterpret_cast<TFrom*>(&data); + + TLoadVector value = TFuncLoad<TLoadVector>(&data); + + TTo procData[elemCount]; + for (unsigned i = 0; i < elemCount; ++i) { + procData[i] = TElemFunc::Call(datap[i]); + } + + TResultVector result = TFunc(value); + + for (unsigned i = 0; i < elemCount; ++i) { + UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]); + } +} + +void TSSEEmulTest::Test_mm_cvtepi32_ps() { + struct THelper { + static float Call(const i32 op) { + return float(op); + } + }; + Test_mm_convertop<i32, float, 4, __m128i, __m128, + THelper, WrapF(_mm_cvtepi32_ps), float32x4_t>(); +}; + +void TSSEEmulTest::Test_mm_cvtps_epi32() { + struct THelper { + static i32 Call(const float op) { + return i32(op); + } + }; + Test_mm_convertop<float, i32, 4, __m128, __m128i, THelper, T_mm_CallWrapper<__m128i, decltype(_mm_cvtps_epi32), _mm_cvtps_epi32>, int32x4_t>(); -}; - -void TSSEEmulTest::Test_mm_cvttps_epi32() { - struct THelper { - static i32 Call(const float op) { - return i32(op); - } - }; - Test_mm_convertop<float, i32, 4, __m128, __m128i, - THelper, Wrap(_mm_cvttps_epi32), int32x4_t>(); -}; - -template <typename TLoadVector, typename TCastVector, - typename TFunc, TFunc* func> -void TSSEEmulTest::Test_mm_castXX() { - char data[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - - TLoadVector value = TFuncLoad<TLoadVector>(&data); - const TLoadVector constvalue = TFuncLoad<TLoadVector>(&data); - TCastVector casted = func(value); - const TCastVector constcasted = func(constvalue); - char verify[16]; - char constverify[16]; - TFuncStore<TCastVector>(&verify, casted); - TFuncStore<TCastVector>(&constverify, constcasted); - - UNIT_ASSERT(memcmp(&data, &verify, 16) == 0); - UNIT_ASSERT(memcmp(&data, &constverify, 16) == 0); -}; - -void TSSEEmulTest::Test_mm_castsi128_ps() { - Test_mm_castXX<__m128i, __m128, - decltype(_mm_castsi128_ps), _mm_castsi128_ps>(); -} - -void TSSEEmulTest::Test_mm_castps_si128() { - Test_mm_castXX<__m128, __m128i, - decltype(_mm_castps_si128), _mm_castps_si128>(); -} - -void TSSEEmulTest::Test_mm_mul_epu32() { - char data0[16] = { - '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - char data1[16] = { - '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', - '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; - ui32* dataw0 = reinterpret_cast<ui32*>(&data0); - ui32* dataw1 = reinterpret_cast<ui32*>(&data1); - - __m128i value0 = _mm_loadu_si128((__m128i*)&data0); - __m128i value1 = _mm_loadu_si128((__m128i*)&data1); - +}; + +void TSSEEmulTest::Test_mm_cvttps_epi32() { + struct THelper { + static i32 Call(const float op) { + return i32(op); + } + }; + Test_mm_convertop<float, i32, 4, __m128, __m128i, + THelper, Wrap(_mm_cvttps_epi32), int32x4_t>(); +}; + +template <typename TLoadVector, typename TCastVector, + typename TFunc, TFunc* func> +void TSSEEmulTest::Test_mm_castXX() { + char data[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + + TLoadVector value = TFuncLoad<TLoadVector>(&data); + const TLoadVector constvalue = TFuncLoad<TLoadVector>(&data); + TCastVector casted = func(value); + const TCastVector constcasted = func(constvalue); + char verify[16]; + char constverify[16]; + TFuncStore<TCastVector>(&verify, casted); + TFuncStore<TCastVector>(&constverify, constcasted); + + UNIT_ASSERT(memcmp(&data, &verify, 16) == 0); + UNIT_ASSERT(memcmp(&data, &constverify, 16) == 0); +}; + +void TSSEEmulTest::Test_mm_castsi128_ps() { + Test_mm_castXX<__m128i, __m128, + decltype(_mm_castsi128_ps), _mm_castsi128_ps>(); +} + +void TSSEEmulTest::Test_mm_castps_si128() { + Test_mm_castXX<__m128, __m128i, + decltype(_mm_castps_si128), _mm_castps_si128>(); +} + +void TSSEEmulTest::Test_mm_mul_epu32() { + char data0[16] = { + '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; + char data1[16] = { + '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', + '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; + ui32* dataw0 = reinterpret_cast<ui32*>(&data0); + ui32* dataw1 = reinterpret_cast<ui32*>(&data1); + + __m128i value0 = _mm_loadu_si128((__m128i*)&data0); + __m128i value1 = _mm_loadu_si128((__m128i*)&data1); + ui64 mul0 = (ui64) dataw0[0] * (ui64) dataw1[0]; ui64 mul1 = (ui64) dataw0[2] * (ui64) dataw1[2]; - - __m128i result = _mm_mul_epu32(value0, value1); - - UNIT_ASSERT_EQUAL(mul0, TQType<uint64x2_t>::As(result)[0]); - UNIT_ASSERT_EQUAL(mul1, TQType<uint64x2_t>::As(result)[1]); -} + + __m128i result = _mm_mul_epu32(value0, value1); + + UNIT_ASSERT_EQUAL(mul0, TQType<uint64x2_t>::As(result)[0]); + UNIT_ASSERT_EQUAL(mul1, TQType<uint64x2_t>::As(result)[1]); +} void TSSEEmulTest::Test_mm_cmpunord_ps() { alignas(16) float valuesBits[4] = {1.f, 2.f, 3.f, 4.f}; diff --git a/library/cpp/sse/ut/ya.make b/library/cpp/sse/ut/ya.make index 14cac6727a..45e104971e 100644 --- a/library/cpp/sse/ut/ya.make +++ b/library/cpp/sse/ut/ya.make @@ -1,13 +1,13 @@ UNITTEST_FOR(library/cpp/sse) - + OWNER(danlark) - -SRCS( + +SRCS( test.cpp -) - +) + IF (ARCH_X86_64) CFLAGS(-msse4.1 -msse4.2) ENDIF() -END() +END() |