aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/sse
diff options
context:
space:
mode:
authoragri <agri@yandex-team.ru>2022-02-10 16:48:12 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:48:12 +0300
commit2909866fbc652492b7d7cab3023cb19489dc4fd8 (patch)
treeb222e5ac2e2e98872661c51ccceee5da0d291e13 /library/cpp/sse
parentd3530b2692e400bd4d29bd4f07cafaee139164e7 (diff)
downloadydb-2909866fbc652492b7d7cab3023cb19489dc4fd8.tar.gz
Restoring authorship annotation for <agri@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/sse')
-rw-r--r--library/cpp/sse/sse.h28
-rw-r--r--library/cpp/sse/sse2neon.h1122
-rw-r--r--library/cpp/sse/ut/test.cpp2290
-rw-r--r--library/cpp/sse/ut/ya.make12
4 files changed, 1726 insertions, 1726 deletions
diff --git a/library/cpp/sse/sse.h b/library/cpp/sse/sse.h
index 918a942803..19bac17de0 100644
--- a/library/cpp/sse/sse.h
+++ b/library/cpp/sse/sse.h
@@ -1,18 +1,18 @@
-#pragma once
-
-/*
- The header chooses appropriate SSE support.
- On Intel: SSE intrinsics
- On ARM64: translation to NEON intrinsics or software emulation
+#pragma once
+
+/*
+ The header chooses appropriate SSE support.
+ On Intel: SSE intrinsics
+ On ARM64: translation to NEON intrinsics or software emulation
On PowerPc: translation to Altivec intrinsics or software emulation
-*/
+*/
/* Author: Vitaliy Manushkin <agri@yandex-team.ru>, Danila Kutenin <danlark@yandex-team.ru> */
-
-#include <util/system/platform.h>
+
+#include <util/system/platform.h>
#if (defined(_i386_) || defined(_x86_64_)) && defined(_sse_)
-#include <xmmintrin.h>
-#include <emmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
#include <pmmintrin.h>
#define ARCADIA_SSE
#if defined(_ssse3_)
@@ -24,10 +24,10 @@
#if defined(_sse4_2_)
#include <nmmintrin.h>
#endif
-#elif defined(_arm64_)
-#include "sse2neon.h"
+#elif defined(_arm64_)
+#include "sse2neon.h"
#define ARCADIA_SSE
#elif defined(_ppc64_)
#include "powerpc.h"
#define ARCADIA_SSE
-#endif
+#endif
diff --git a/library/cpp/sse/sse2neon.h b/library/cpp/sse/sse2neon.h
index af7f3ed242..695dbd3041 100644
--- a/library/cpp/sse/sse2neon.h
+++ b/library/cpp/sse/sse2neon.h
@@ -1,60 +1,60 @@
-#pragma once
-
-/*
- The header contains inlining code
- which translates SSE intrinsics to NEON intrinsics or software emulation.
- You are encouraged for commitments.
- Add missing intrinsics, add unittests, purify the implementation,
- merge and simplify templates.
- Warning: The code is made in deep nights, so it surely contains bugs,
- imperfections, flaws and all other kinds of errors and mistakes.
-*/
-/* Author: Vitaliy Manushkin <agri@yandex-team.ru> */
-
-#include <util/system/platform.h>
+#pragma once
+
+/*
+ The header contains inlining code
+ which translates SSE intrinsics to NEON intrinsics or software emulation.
+ You are encouraged for commitments.
+ Add missing intrinsics, add unittests, purify the implementation,
+ merge and simplify templates.
+ Warning: The code is made in deep nights, so it surely contains bugs,
+ imperfections, flaws and all other kinds of errors and mistakes.
+*/
+/* Author: Vitaliy Manushkin <agri@yandex-team.ru> */
+
+#include <util/system/platform.h>
#include <util/system/compiler.h>
-#include <util/system/types.h>
-
-#if !defined(_arm64_)
-#error "This header is for ARM64 (aarch64) platform only. " \
+#include <util/system/types.h>
+
+#if !defined(_arm64_)
+#error "This header is for ARM64 (aarch64) platform only. " \
"Include sse.h instead of including this header directly."
-#endif
-
-#include <arm_neon.h>
-
-union __m128i {
- uint64x2_t AsUi64x2;
- int64x2_t AsSi64x2;
-
- uint32x4_t AsUi32x4;
- int32x4_t AsSi32x4;
-
- uint16x8_t AsUi16x8;
- int16x8_t AsSi16x8;
-
- uint8x16_t AsUi8x16;
- int8x16_t AsSi8x16;
-
- float32x4_t AsFloat32x4;
- float64x2_t AsFloat64x2;
-};
-
-union __m128 {
- float32x4_t AsFloat32x4;
- float64x2_t AsFloat64x2;
+#endif
+
+#include <arm_neon.h>
+
+union __m128i {
+ uint64x2_t AsUi64x2;
+ int64x2_t AsSi64x2;
uint32x4_t AsUi32x4;
int32x4_t AsSi32x4;
- uint64x2_t AsUi64x2;
- int64x2_t AsSi64x2;
+ uint16x8_t AsUi16x8;
+ int16x8_t AsSi16x8;
- uint8x16_t AsUi8x16;
+ uint8x16_t AsUi8x16;
+ int8x16_t AsSi8x16;
+
+ float32x4_t AsFloat32x4;
+ float64x2_t AsFloat64x2;
+};
+
+union __m128 {
+ float32x4_t AsFloat32x4;
+ float64x2_t AsFloat64x2;
+
+ uint32x4_t AsUi32x4;
+ int32x4_t AsSi32x4;
+
+ uint64x2_t AsUi64x2;
+ int64x2_t AsSi64x2;
+
+ uint8x16_t AsUi8x16;
int8x16_t AsSi8x16;
__m128i As128i;
-};
-
+};
+
typedef float64x2_t __m128d;
enum _mm_hint
@@ -72,128 +72,128 @@ Y_FORCE_INLINE void _mm_prefetch(const void *p, enum _mm_hint) {
__builtin_prefetch(p);
}
-template <typename TType>
-struct TQType;
-
-template <>
-struct TQType<uint8x16_t> {
- static inline uint8x16_t& As(__m128i& value) {
- return value.AsUi8x16;
- }
- static inline const uint8x16_t& As(const __m128i& value) {
- return value.AsUi8x16;
- }
-};
-
-template <>
-struct TQType<int8x16_t> {
- static inline int8x16_t& As(__m128i& value) {
- return value.AsSi8x16;
- }
- static inline const int8x16_t& As(const __m128i& value) {
- return value.AsSi8x16;
- }
-};
-
-template <>
-struct TQType<uint16x8_t> {
- static inline uint16x8_t& As(__m128i& value) {
- return value.AsUi16x8;
- }
- static inline const uint16x8_t& As(const __m128i& value) {
- return value.AsUi16x8;
- }
-};
-
-template <>
-struct TQType<int16x8_t> {
- static inline int16x8_t& As(__m128i& value) {
- return value.AsSi16x8;
- }
- static inline const int16x8_t& As(const __m128i& value) {
- return value.AsSi16x8;
- }
-};
-
-template <>
-struct TQType<uint32x4_t> {
- static inline uint32x4_t& As(__m128i& value) {
- return value.AsUi32x4;
- }
- static inline const uint32x4_t& As(const __m128i& value) {
- return value.AsUi32x4;
- }
-};
-
-template <>
-struct TQType<int32x4_t> {
- static inline int32x4_t& As(__m128i& value) {
- return value.AsSi32x4;
- }
- static inline const int32x4_t& As(const __m128i& value) {
- return value.AsSi32x4;
- }
-};
-
-template <>
-struct TQType<uint64x2_t> {
- static inline uint64x2_t& As(__m128i& value) {
- return value.AsUi64x2;
- }
- static inline const uint64x2_t& As(const __m128i& value) {
- return value.AsUi64x2;
- }
- static inline uint64x2_t& As(__m128& value) {
- return value.AsUi64x2;
- }
- static inline const uint64x2_t& As(const __m128& value) {
- return value.AsUi64x2;
- }
-};
-
-template <>
-struct TQType<int64x2_t> {
- static inline int64x2_t& As(__m128i& value) {
- return value.AsSi64x2;
- }
- static inline const int64x2_t& As(const __m128i& value) {
- return value.AsSi64x2;
- }
-};
-
-template <typename TValue>
-struct TBaseWrapper {
- TValue Value;
-
+template <typename TType>
+struct TQType;
+
+template <>
+struct TQType<uint8x16_t> {
+ static inline uint8x16_t& As(__m128i& value) {
+ return value.AsUi8x16;
+ }
+ static inline const uint8x16_t& As(const __m128i& value) {
+ return value.AsUi8x16;
+ }
+};
+
+template <>
+struct TQType<int8x16_t> {
+ static inline int8x16_t& As(__m128i& value) {
+ return value.AsSi8x16;
+ }
+ static inline const int8x16_t& As(const __m128i& value) {
+ return value.AsSi8x16;
+ }
+};
+
+template <>
+struct TQType<uint16x8_t> {
+ static inline uint16x8_t& As(__m128i& value) {
+ return value.AsUi16x8;
+ }
+ static inline const uint16x8_t& As(const __m128i& value) {
+ return value.AsUi16x8;
+ }
+};
+
+template <>
+struct TQType<int16x8_t> {
+ static inline int16x8_t& As(__m128i& value) {
+ return value.AsSi16x8;
+ }
+ static inline const int16x8_t& As(const __m128i& value) {
+ return value.AsSi16x8;
+ }
+};
+
+template <>
+struct TQType<uint32x4_t> {
+ static inline uint32x4_t& As(__m128i& value) {
+ return value.AsUi32x4;
+ }
+ static inline const uint32x4_t& As(const __m128i& value) {
+ return value.AsUi32x4;
+ }
+};
+
+template <>
+struct TQType<int32x4_t> {
+ static inline int32x4_t& As(__m128i& value) {
+ return value.AsSi32x4;
+ }
+ static inline const int32x4_t& As(const __m128i& value) {
+ return value.AsSi32x4;
+ }
+};
+
+template <>
+struct TQType<uint64x2_t> {
+ static inline uint64x2_t& As(__m128i& value) {
+ return value.AsUi64x2;
+ }
+ static inline const uint64x2_t& As(const __m128i& value) {
+ return value.AsUi64x2;
+ }
+ static inline uint64x2_t& As(__m128& value) {
+ return value.AsUi64x2;
+ }
+ static inline const uint64x2_t& As(const __m128& value) {
+ return value.AsUi64x2;
+ }
+};
+
+template <>
+struct TQType<int64x2_t> {
+ static inline int64x2_t& As(__m128i& value) {
+ return value.AsSi64x2;
+ }
+ static inline const int64x2_t& As(const __m128i& value) {
+ return value.AsSi64x2;
+ }
+};
+
+template <typename TValue>
+struct TBaseWrapper {
+ TValue Value;
+
Y_FORCE_INLINE
- operator TValue&() {
- return Value;
- }
-
+ operator TValue&() {
+ return Value;
+ }
+
Y_FORCE_INLINE
- operator const TValue&() const {
- return Value;
- }
-};
-
-template <typename TOp, typename TFunc, TFunc* func,
- typename TDup, TDup* dupfunc>
-struct TWrapperSingleDup: public TBaseWrapper<__m128i> {
+ operator const TValue&() const {
+ return Value;
+ }
+};
+
+template <typename TOp, typename TFunc, TFunc* func,
+ typename TDup, TDup* dupfunc>
+struct TWrapperSingleDup: public TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperSingleDup(const __m128i& op, const int shift) {
- TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(shift));
- }
-};
-
-template <typename TOp, typename TFunc, TFunc* func,
- typename TDup, TDup* dupfunc>
-struct TWrapperSingleNegDup: public TBaseWrapper<__m128i> {
+ TWrapperSingleDup(const __m128i& op, const int shift) {
+ TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(shift));
+ }
+};
+
+template <typename TOp, typename TFunc, TFunc* func,
+ typename TDup, TDup* dupfunc>
+struct TWrapperSingleNegDup: public TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperSingleNegDup(const __m128i& op, const int shift) {
- TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(-shift));
- }
-};
-
+ TWrapperSingleNegDup(const __m128i& op, const int shift) {
+ TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(-shift));
+ }
+};
+
inline __m128i _mm_srl_epi16(__m128i a, __m128i count) {
__m128i res;
res.AsUi16x8 = vshlq_u16(a.AsUi16x8, vdupq_n_s16(-count.AsUi16x8[0]));
@@ -225,16 +225,16 @@ inline __m128i _mm_srai_epi32(__m128i a, int count) {
return res;
}
-using _mm_srli_epi16 =
- TWrapperSingleNegDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
- decltype(vdupq_n_s16), vdupq_n_s16>;
-using _mm_srli_epi32 =
- TWrapperSingleNegDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
- decltype(vdupq_n_s32), vdupq_n_s32>;
-using _mm_srli_epi64 =
- TWrapperSingleNegDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
- decltype(vdupq_n_s64), vdupq_n_s64>;
-
+using _mm_srli_epi16 =
+ TWrapperSingleNegDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
+ decltype(vdupq_n_s16), vdupq_n_s16>;
+using _mm_srli_epi32 =
+ TWrapperSingleNegDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
+ decltype(vdupq_n_s32), vdupq_n_s32>;
+using _mm_srli_epi64 =
+ TWrapperSingleNegDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
+ decltype(vdupq_n_s64), vdupq_n_s64>;
+
inline __m128i _mm_sll_epi16(__m128i a, __m128i count) {
__m128i res;
@@ -255,57 +255,57 @@ inline __m128i _mm_sll_epi64(__m128i a, __m128i count) {
return res;
}
-using _mm_slli_epi16 =
- TWrapperSingleDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
- decltype(vdupq_n_s16), vdupq_n_s16>;
-using _mm_slli_epi32 =
- TWrapperSingleDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
- decltype(vdupq_n_s32), vdupq_n_s32>;
-using _mm_slli_epi64 =
- TWrapperSingleDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
- decltype(vdupq_n_s64), vdupq_n_s64>;
-
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
-struct TWrapperDual : TBaseWrapper<__m128i> {
+using _mm_slli_epi16 =
+ TWrapperSingleDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
+ decltype(vdupq_n_s16), vdupq_n_s16>;
+using _mm_slli_epi32 =
+ TWrapperSingleDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
+ decltype(vdupq_n_s32), vdupq_n_s32>;
+using _mm_slli_epi64 =
+ TWrapperSingleDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
+ decltype(vdupq_n_s64), vdupq_n_s64>;
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperDual : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperDual(const __m128i& op1, const __m128i& op2, TParams... params) {
- TQType<TOp>::As(Value) = (TOp)
- func(TQType<TOp>::As(op1),
- TQType<TOp>::As(op2),
- params...);
- }
-};
-
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
-struct TWrapperDualSwap : TBaseWrapper<__m128i> {
+ TWrapperDual(const __m128i& op1, const __m128i& op2, TParams... params) {
+ TQType<TOp>::As(Value) = (TOp)
+ func(TQType<TOp>::As(op1),
+ TQType<TOp>::As(op2),
+ params...);
+ }
+};
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperDualSwap : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperDualSwap(const __m128i& op1, const __m128i& op2, TParams... params) {
- TQType<TOp>::As(Value) =
- func(TQType<TOp>::As(op2),
- TQType<TOp>::As(op1),
- params...);
- }
-};
-
+ TWrapperDualSwap(const __m128i& op1, const __m128i& op2, TParams... params) {
+ TQType<TOp>::As(Value) =
+ func(TQType<TOp>::As(op2),
+ TQType<TOp>::As(op1),
+ params...);
+ }
+};
+
template <typename TOp, typename TFunc, TFunc* func, typename TArgument = __m128>
struct TWrapperDualF : TBaseWrapper<TArgument> {
Y_FORCE_INLINE
TWrapperDualF(const TArgument& op1, const TArgument& op2) {
TQType<TOp>::As(TBaseWrapper<TArgument>::Value) = (TOp) func(TQType<TOp>::As(op1), TQType<TOp>::As(op2));
- }
-};
-
-using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>;
-using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>;
-using _mm_andnot_si128 =
- TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>;
+ }
+};
+
+using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>;
+using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>;
+using _mm_andnot_si128 =
+ TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>;
using _mm_xor_si128 = TWrapperDual<uint64x2_t, decltype(veorq_u64), veorq_u64>;
-
+
using _mm_add_epi8 = TWrapperDual<uint8x16_t, decltype(vaddq_u8), vaddq_u8>;
-using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>;
-using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>;
-using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>;
-
+using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>;
+using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>;
+using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>;
+
inline __m128i _mm_madd_epi16(__m128i a, __m128i b) {
int32x4_t aLow;
int32x4_t aHigh;
@@ -343,118 +343,118 @@ inline __m128i _mm_madd_epi16(__m128i a, __m128i b) {
}
using _mm_sub_epi8 = TWrapperDual<uint8x16_t, decltype(vsubq_u8), vsubq_u8>;
-using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>;
-using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>;
-using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>;
-
-using _mm_unpacklo_epi8 =
- TWrapperDual<uint8x16_t, decltype(vzip1q_u8), vzip1q_u8>;
-using _mm_unpackhi_epi8 =
- TWrapperDual<uint8x16_t, decltype(vzip2q_u8), vzip2q_u8>;
-using _mm_unpacklo_epi16 =
- TWrapperDual<uint16x8_t, decltype(vzip1q_u16), vzip1q_u16>;
-using _mm_unpackhi_epi16 =
- TWrapperDual<uint16x8_t, decltype(vzip2q_u16), vzip2q_u16>;
-using _mm_unpacklo_epi32 =
- TWrapperDual<uint32x4_t, decltype(vzip1q_u32), vzip1q_u32>;
-using _mm_unpackhi_epi32 =
- TWrapperDual<uint32x4_t, decltype(vzip2q_u32), vzip2q_u32>;
-using _mm_unpacklo_epi64 =
- TWrapperDual<uint64x2_t, decltype(vzip1q_u64), vzip1q_u64>;
-using _mm_unpackhi_epi64 =
- TWrapperDual<uint64x2_t, decltype(vzip2q_u64), vzip2q_u64>;
-
-using _mm_cmpeq_epi8 =
- TWrapperDual<uint8x16_t, decltype(vceqq_u8), vceqq_u8>;
-using _mm_cmpeq_epi16 =
- TWrapperDual<uint16x8_t, decltype(vceqq_u16), vceqq_u16>;
-using _mm_cmpeq_epi32 =
- TWrapperDual<uint32x4_t, decltype(vceqq_u32), vceqq_u32>;
-
-using _mm_cmpgt_epi8 =
- TWrapperDual<int8x16_t, decltype(vcgtq_s8), vcgtq_s8>;
-using _mm_cmpgt_epi16 =
- TWrapperDual<int16x8_t, decltype(vcgtq_s16), vcgtq_s16>;
-using _mm_cmpgt_epi32 =
- TWrapperDual<int32x4_t, decltype(vcgtq_s32), vcgtq_s32>;
-
-using _mm_cmplt_epi8 =
- TWrapperDual<int8x16_t, decltype(vcltq_s8), vcltq_s8>;
-using _mm_cmplt_epi16 =
- TWrapperDual<int16x8_t, decltype(vcltq_s16), vcltq_s16>;
-using _mm_cmplt_epi32 =
- TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>;
-
+using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>;
+using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>;
+using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>;
+
+using _mm_unpacklo_epi8 =
+ TWrapperDual<uint8x16_t, decltype(vzip1q_u8), vzip1q_u8>;
+using _mm_unpackhi_epi8 =
+ TWrapperDual<uint8x16_t, decltype(vzip2q_u8), vzip2q_u8>;
+using _mm_unpacklo_epi16 =
+ TWrapperDual<uint16x8_t, decltype(vzip1q_u16), vzip1q_u16>;
+using _mm_unpackhi_epi16 =
+ TWrapperDual<uint16x8_t, decltype(vzip2q_u16), vzip2q_u16>;
+using _mm_unpacklo_epi32 =
+ TWrapperDual<uint32x4_t, decltype(vzip1q_u32), vzip1q_u32>;
+using _mm_unpackhi_epi32 =
+ TWrapperDual<uint32x4_t, decltype(vzip2q_u32), vzip2q_u32>;
+using _mm_unpacklo_epi64 =
+ TWrapperDual<uint64x2_t, decltype(vzip1q_u64), vzip1q_u64>;
+using _mm_unpackhi_epi64 =
+ TWrapperDual<uint64x2_t, decltype(vzip2q_u64), vzip2q_u64>;
+
+using _mm_cmpeq_epi8 =
+ TWrapperDual<uint8x16_t, decltype(vceqq_u8), vceqq_u8>;
+using _mm_cmpeq_epi16 =
+ TWrapperDual<uint16x8_t, decltype(vceqq_u16), vceqq_u16>;
+using _mm_cmpeq_epi32 =
+ TWrapperDual<uint32x4_t, decltype(vceqq_u32), vceqq_u32>;
+
+using _mm_cmpgt_epi8 =
+ TWrapperDual<int8x16_t, decltype(vcgtq_s8), vcgtq_s8>;
+using _mm_cmpgt_epi16 =
+ TWrapperDual<int16x8_t, decltype(vcgtq_s16), vcgtq_s16>;
+using _mm_cmpgt_epi32 =
+ TWrapperDual<int32x4_t, decltype(vcgtq_s32), vcgtq_s32>;
+
+using _mm_cmplt_epi8 =
+ TWrapperDual<int8x16_t, decltype(vcltq_s8), vcltq_s8>;
+using _mm_cmplt_epi16 =
+ TWrapperDual<int16x8_t, decltype(vcltq_s16), vcltq_s16>;
+using _mm_cmplt_epi32 =
+ TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>;
+
Y_FORCE_INLINE __m128i _mm_load_si128(const __m128i* ptr) {
- __m128i result;
+ __m128i result;
result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
- return result;
-}
-
+ return result;
+}
+
Y_FORCE_INLINE __m128i _mm_loadu_si128(const __m128i* ptr) {
- __m128i result;
+ __m128i result;
result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
- return result;
-}
-
+ return result;
+}
+
Y_FORCE_INLINE __m128i _mm_lddqu_si128(const __m128i* ptr) {
return _mm_loadu_si128(ptr);
}
Y_FORCE_INLINE void _mm_storeu_si128(__m128i* ptr, const __m128i& op) {
vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
-}
-
+}
+
Y_FORCE_INLINE void
-_mm_store_si128(__m128i* ptr, const __m128i& op) {
+_mm_store_si128(__m128i* ptr, const __m128i& op) {
vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
-}
-
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
-struct TWrapperSimple : TBaseWrapper<__m128i> {
+}
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperSimple : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperSimple(TParams... params) {
- TQType<TOp>::As(Value) = func(params...);
- }
-};
-
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
-struct TWrapperSimpleF : TBaseWrapper<__m128> {
+ TWrapperSimple(TParams... params) {
+ TQType<TOp>::As(Value) = func(params...);
+ }
+};
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperSimpleF : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- TWrapperSimpleF(TParams... params) {
- TQType<TOp>::As(Value) = func(params...);
- }
-};
-
-using _mm_set1_epi8 =
- TWrapperSimple<int8x16_t, decltype(vdupq_n_s8), vdupq_n_s8, const char>;
-using _mm_set1_epi16 =
- TWrapperSimple<int16x8_t, decltype(vdupq_n_s16), vdupq_n_s16, const ui16>;
-using _mm_set1_epi32 =
- TWrapperSimple<int32x4_t, decltype(vdupq_n_s32), vdupq_n_s32, const ui32>;
-
-struct _mm_setzero_si128 : TBaseWrapper<__m128i> {
+ TWrapperSimpleF(TParams... params) {
+ TQType<TOp>::As(Value) = func(params...);
+ }
+};
+
+using _mm_set1_epi8 =
+ TWrapperSimple<int8x16_t, decltype(vdupq_n_s8), vdupq_n_s8, const char>;
+using _mm_set1_epi16 =
+ TWrapperSimple<int16x8_t, decltype(vdupq_n_s16), vdupq_n_s16, const ui16>;
+using _mm_set1_epi32 =
+ TWrapperSimple<int32x4_t, decltype(vdupq_n_s32), vdupq_n_s32, const ui32>;
+
+struct _mm_setzero_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_setzero_si128() {
- TQType<uint64x2_t>::As(Value) = vdupq_n_u64(0);
- }
-};
-
-struct _mm_loadl_epi64 : TBaseWrapper<__m128i> {
+ _mm_setzero_si128() {
+ TQType<uint64x2_t>::As(Value) = vdupq_n_u64(0);
+ }
+};
+
+struct _mm_loadl_epi64 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_loadl_epi64(const __m128i* p) {
+ _mm_loadl_epi64(const __m128i* p) {
uint64x1_t im = vld1_u64((const uint64_t*)p);
- TQType<uint64x2_t>::As(Value) = vcombine_u64(im, vdup_n_u64(0));
- }
-};
-
-struct _mm_storel_epi64 : TBaseWrapper<__m128i> {
+ TQType<uint64x2_t>::As(Value) = vcombine_u64(im, vdup_n_u64(0));
+ }
+};
+
+struct _mm_storel_epi64 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_storel_epi64(__m128i* a, __m128i op) {
+ _mm_storel_epi64(__m128i* a, __m128i op) {
vst1_u64((uint64_t*)a, vget_low_u64(op.AsUi64x2));
- }
-};
-
+ }
+};
+
struct ShuffleStruct4 {
ui8 x[4];
};
@@ -470,45 +470,45 @@ _MM_SHUFFLE(ui8 x4, ui8 x3, ui8 x2, ui8 x1) {
}
Y_FORCE_INLINE __m128i
-_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) {
- __m128i result;
+_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) {
+ __m128i result;
const ui8 xi[4] = {
ui8(op2.x[0] * 4), ui8(op2.x[1] * 4),
ui8(op2.x[2] * 4), ui8(op2.x[3] * 4)
};
const uint8x16_t transform = {
- ui8(xi[0]), ui8(xi[0] + 1), ui8(xi[0] + 2), ui8(xi[0] + 3),
- ui8(xi[1]), ui8(xi[1] + 1), ui8(xi[1] + 2), ui8(xi[1] + 3),
- ui8(xi[2]), ui8(xi[2] + 1), ui8(xi[2] + 2), ui8(xi[2] + 3),
+ ui8(xi[0]), ui8(xi[0] + 1), ui8(xi[0] + 2), ui8(xi[0] + 3),
+ ui8(xi[1]), ui8(xi[1] + 1), ui8(xi[1] + 2), ui8(xi[1] + 3),
+ ui8(xi[2]), ui8(xi[2] + 1), ui8(xi[2] + 2), ui8(xi[2] + 3),
ui8(xi[3]), ui8(xi[3] + 1), ui8(xi[3] + 2), ui8(xi[3] + 3)
};
- result.AsUi8x16 = vqtbl1q_u8(op1.AsUi8x16, transform);
- return result;
-}
-
+ result.AsUi8x16 = vqtbl1q_u8(op1.AsUi8x16, transform);
+ return result;
+}
+
Y_FORCE_INLINE int
-_mm_movemask_epi8(const __m128i& op) {
- uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
- uint8x16_t opmasked = vandq_u8(op.AsUi8x16, mask);
- int8x16_t byteshifter = {
- 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7};
- uint8x16_t opshifted = vshlq_u8(opmasked, byteshifter);
- int16x8_t wordshifter = {-7, -5, -3, -1, 1, 3, 5, 7};
- uint16x8_t wordshifted =
- vshlq_u16(vreinterpretq_u16_u8(opshifted), wordshifter);
- return vaddvq_u16(wordshifted);
-}
-
-template <int imm>
-struct THelper_mm_srli_si128 : TBaseWrapper<__m128i> {
+_mm_movemask_epi8(const __m128i& op) {
+ uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+ uint8x16_t opmasked = vandq_u8(op.AsUi8x16, mask);
+ int8x16_t byteshifter = {
+ 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7};
+ uint8x16_t opshifted = vshlq_u8(opmasked, byteshifter);
+ int16x8_t wordshifter = {-7, -5, -3, -1, 1, 3, 5, 7};
+ uint16x8_t wordshifted =
+ vshlq_u16(vreinterpretq_u16_u8(opshifted), wordshifter);
+ return vaddvq_u16(wordshifted);
+}
+
+template <int imm>
+struct THelper_mm_srli_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
THelper_mm_srli_si128(const __m128i a) {
const auto zero = vdupq_n_u8(0);
- TQType<uint8x16_t>::As(Value) = vextq_u8(a.AsUi8x16, zero, imm);
- }
-};
-
+ TQType<uint8x16_t>::As(Value) = vextq_u8(a.AsUi8x16, zero, imm);
+ }
+};
+
template <>
struct THelper_mm_srli_si128<16> : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
@@ -518,8 +518,8 @@ struct THelper_mm_srli_si128<16> : TBaseWrapper<__m128i> {
}
};
-#define _mm_srli_si128(a, imm) THelper_mm_srli_si128<imm>(a)
-
+#define _mm_srli_si128(a, imm) THelper_mm_srli_si128<imm>(a)
+
template<int imm>
inline uint8x16_t vextq_u8_function(uint8x16_t a, uint8x16_t b) {
return vextq_u8(a, b, imm);
@@ -531,33 +531,33 @@ inline uint8x16_t vextq_u8_function<16>(uint8x16_t /* a */, uint8x16_t b) {
}
-template <int imm>
-struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> {
+template <int imm>
+struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
THelper_mm_slli_si128(const __m128i a) {
- auto zero = vdupq_n_u8(0);
+ auto zero = vdupq_n_u8(0);
TQType<uint8x16_t>::As(Value) = vextq_u8_function<16 - imm>(zero, a.AsUi8x16);
- }
-};
-
-#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a)
-
+ }
+};
+
+#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a)
+
Y_FORCE_INLINE int _mm_cvtsi128_si32(const __m128i& op) {
- return vgetq_lane_s32(op.AsSi32x4, 0);
-}
-
-struct _mm_set_epi16 : TBaseWrapper<__m128i> {
+ return vgetq_lane_s32(op.AsSi32x4, 0);
+}
+
+struct _mm_set_epi16 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_set_epi16(const short w7, const short w6,
- const short w5, const short w4,
- const short w3, const short w2,
- const short w1, const short w0) {
- int16x4_t d0 = {w0, w1, w2, w3};
- int16x4_t d1 = {w4, w5, w6, w7};
- TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1);
- }
-};
-
+ _mm_set_epi16(const short w7, const short w6,
+ const short w5, const short w4,
+ const short w3, const short w2,
+ const short w1, const short w0) {
+ int16x4_t d0 = {w0, w1, w2, w3};
+ int16x4_t d1 = {w4, w5, w6, w7};
+ TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1);
+ }
+};
+
struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
_mm_setr_epi16(const short w7, const short w6,
@@ -570,16 +570,16 @@ struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
}
};
-struct _mm_set_epi32 : TBaseWrapper<__m128i> {
+struct _mm_set_epi32 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_set_epi32(const int x3, const int x2,
- const int x1, const int x0) {
- int32x2_t d0 = {x0, x1};
- int32x2_t d1 = {x2, x3};
- TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1);
- }
-};
-
+ _mm_set_epi32(const int x3, const int x2,
+ const int x1, const int x0) {
+ int32x2_t d0 = {x0, x1};
+ int32x2_t d1 = {x2, x3};
+ TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1);
+ }
+};
+
struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
_mm_setr_epi32(const int x3, const int x2,
@@ -590,14 +590,14 @@ struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
}
};
-struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> {
+struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_cvtsi32_si128(int op) {
- auto zero = vdupq_n_s32(0);
- TQType<int32x4_t>::As(Value) = vsetq_lane_s32(op, zero, 0);
- }
-};
-
+ _mm_cvtsi32_si128(int op) {
+ auto zero = vdupq_n_s32(0);
+ TQType<int32x4_t>::As(Value) = vsetq_lane_s32(op, zero, 0);
+ }
+};
+
struct _mm_cvtsi64_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
_mm_cvtsi64_si128(i64 op) {
@@ -606,41 +606,41 @@ struct _mm_cvtsi64_si128 : TBaseWrapper<__m128i> {
}
};
-template <typename TOpOut, typename TOpIn,
- typename TFunc, TFunc* func,
- typename TCombine, TCombine* combine>
-struct TCombineWrapper : TBaseWrapper<__m128i> {
+template <typename TOpOut, typename TOpIn,
+ typename TFunc, TFunc* func,
+ typename TCombine, TCombine* combine>
+struct TCombineWrapper : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TCombineWrapper(const __m128i op1, const __m128i op2) {
- TQType<TOpOut>::As(Value) =
- combine(func(TQType<TOpIn>::As(op1)),
- func(TQType<TOpIn>::As(op2)));
- }
-};
-
-using _mm_packs_epi16 =
- TCombineWrapper<int8x16_t, int16x8_t,
- decltype(vqmovn_s16), vqmovn_s16,
- decltype(vcombine_s8), vcombine_s8>;
-using _mm_packs_epi32 =
- TCombineWrapper<int16x8_t, int32x4_t,
- decltype(vqmovn_s32), vqmovn_s32,
- decltype(vcombine_s16), vcombine_s16>;
-using _mm_packus_epi16 =
- TCombineWrapper<uint8x16_t, int16x8_t,
- decltype(vqmovun_s16), vqmovun_s16,
- decltype(vcombine_u8), vcombine_u8>;
-
-template <typename TOpOut, typename TOpIn,
- typename TFunc, TFunc* func, typename... TParams>
-struct TScalarOutWrapper : TBaseWrapper<TOpOut> {
+ TCombineWrapper(const __m128i op1, const __m128i op2) {
+ TQType<TOpOut>::As(Value) =
+ combine(func(TQType<TOpIn>::As(op1)),
+ func(TQType<TOpIn>::As(op2)));
+ }
+};
+
+using _mm_packs_epi16 =
+ TCombineWrapper<int8x16_t, int16x8_t,
+ decltype(vqmovn_s16), vqmovn_s16,
+ decltype(vcombine_s8), vcombine_s8>;
+using _mm_packs_epi32 =
+ TCombineWrapper<int16x8_t, int32x4_t,
+ decltype(vqmovn_s32), vqmovn_s32,
+ decltype(vcombine_s16), vcombine_s16>;
+using _mm_packus_epi16 =
+ TCombineWrapper<uint8x16_t, int16x8_t,
+ decltype(vqmovun_s16), vqmovun_s16,
+ decltype(vcombine_u8), vcombine_u8>;
+
+template <typename TOpOut, typename TOpIn,
+ typename TFunc, TFunc* func, typename... TParams>
+struct TScalarOutWrapper : TBaseWrapper<TOpOut> {
Y_FORCE_INLINE
- TScalarOutWrapper(const __m128i op, TParams... params) {
- TBaseWrapper<TOpOut>::Value =
- func(TQType<TOpIn>::As(op), params...);
- }
-};
-
+ TScalarOutWrapper(const __m128i op, TParams... params) {
+ TBaseWrapper<TOpOut>::Value =
+ func(TQType<TOpIn>::As(op), params...);
+ }
+};
+
template<int imm>
int extract_epi8_arm(__m128i arg) {
return vgetq_lane_u8(arg.AsUi8x16, imm);
@@ -649,13 +649,13 @@ int extract_epi8_arm(__m128i arg) {
template<int imm>
int extract_epi16_arm(__m128i arg) {
return vgetq_lane_u16(arg.AsUi16x8, imm);
-}
-
+}
+
template<int imm>
int extract_epi32_arm(__m128i arg) {
return vgetq_lane_s32(arg.AsSi32x4, imm);
}
-
+
template<int imm>
long long extract_epi64_arm(__m128i arg) {
return vgetq_lane_s64(arg.AsSi64x2, imm);
@@ -669,49 +669,49 @@ long long extract_epi64_arm(__m128i arg) {
static Y_FORCE_INLINE
__m128i _mm_mul_epu32(__m128i op1, __m128i op2) {
- __m128i result;
- uint32x4_t r1 = vuzp1q_u32(op1.AsUi32x4, op2.AsUi32x4);
- uint32x4_t r2 = vuzp1q_u32(op2.AsUi32x4, op1.AsUi32x4);
- result.AsUi64x2 = vmull_u32(vget_low_u32(r1), vget_low_u32(r2));
- return result;
-}
-
-template <>
-struct TQType<float32x4_t> {
- static inline float32x4_t& As(__m128& value) {
- return value.AsFloat32x4;
- }
-
- static inline const float32x4_t& As(const __m128& value) {
- return value.AsFloat32x4;
- }
-
- static inline float32x4_t& As(__m128i& value) {
- return value.AsFloat32x4;
- }
-
- static inline const float32x4_t& As(const __m128i& value) {
- return value.AsFloat32x4;
- }
-};
-
-template <>
-struct TQType<float64x2_t> {
- static inline float64x2_t& As(__m128& value) {
- return value.AsFloat64x2;
- }
-
- static inline const float64x2_t& As(const __m128& value) {
- return value.AsFloat64x2;
- }
-
- static inline float64x2_t& As(__m128i& value) {
- return value.AsFloat64x2;
- }
-
- static inline const float64x2_t& As(const __m128i& value) {
- return value.AsFloat64x2;
- }
+ __m128i result;
+ uint32x4_t r1 = vuzp1q_u32(op1.AsUi32x4, op2.AsUi32x4);
+ uint32x4_t r2 = vuzp1q_u32(op2.AsUi32x4, op1.AsUi32x4);
+ result.AsUi64x2 = vmull_u32(vget_low_u32(r1), vget_low_u32(r2));
+ return result;
+}
+
+template <>
+struct TQType<float32x4_t> {
+ static inline float32x4_t& As(__m128& value) {
+ return value.AsFloat32x4;
+ }
+
+ static inline const float32x4_t& As(const __m128& value) {
+ return value.AsFloat32x4;
+ }
+
+ static inline float32x4_t& As(__m128i& value) {
+ return value.AsFloat32x4;
+ }
+
+ static inline const float32x4_t& As(const __m128i& value) {
+ return value.AsFloat32x4;
+ }
+};
+
+template <>
+struct TQType<float64x2_t> {
+ static inline float64x2_t& As(__m128& value) {
+ return value.AsFloat64x2;
+ }
+
+ static inline const float64x2_t& As(const __m128& value) {
+ return value.AsFloat64x2;
+ }
+
+ static inline float64x2_t& As(__m128i& value) {
+ return value.AsFloat64x2;
+ }
+
+ static inline const float64x2_t& As(const __m128i& value) {
+ return value.AsFloat64x2;
+ }
static inline float64x2_t& As(__m128d& value) {
return value;
@@ -720,30 +720,30 @@ struct TQType<float64x2_t> {
static inline const float64x2_t& As(const __m128d& value) {
return value;
}
-};
-
-using _mm_set1_ps = TWrapperSimpleF<float32x4_t,
- decltype(vdupq_n_f32), vdupq_n_f32, const float>;
-using _mm_set_ps1 = TWrapperSimpleF<float32x4_t,
- decltype(vdupq_n_f32), vdupq_n_f32, const float>;
-
-struct _mm_setzero_ps : TBaseWrapper<__m128> {
+};
+
+using _mm_set1_ps = TWrapperSimpleF<float32x4_t,
+ decltype(vdupq_n_f32), vdupq_n_f32, const float>;
+using _mm_set_ps1 = TWrapperSimpleF<float32x4_t,
+ decltype(vdupq_n_f32), vdupq_n_f32, const float>;
+
+struct _mm_setzero_ps : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- _mm_setzero_ps() {
- TQType<float32x4_t>::As(Value) = vdupq_n_f32(0.);
- }
-};
-
+ _mm_setzero_ps() {
+ TQType<float32x4_t>::As(Value) = vdupq_n_f32(0.);
+ }
+};
+
Y_FORCE_INLINE __m128d _mm_setzero_pd() {
return vdupq_n_f64(0.);
}
Y_FORCE_INLINE __m128 _mm_loadu_ps(const float* ptr) {
- __m128 result;
- result.AsFloat32x4 = vld1q_f32(ptr);
- return result;
-}
-
+ __m128 result;
+ result.AsFloat32x4 = vld1q_f32(ptr);
+ return result;
+}
+
Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
__m128 result;
result.AsFloat32x4 = vld1q_f32(ptr);
@@ -751,23 +751,23 @@ Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
}
Y_FORCE_INLINE void _mm_storeu_ps(float* ptr, const __m128& op) {
- vst1q_f32(ptr, op.AsFloat32x4);
-}
-
+ vst1q_f32(ptr, op.AsFloat32x4);
+}
+
Y_FORCE_INLINE void _mm_store_ps(float* ptr, const __m128& op) {
vst1q_f32(ptr, op.AsFloat32x4);
}
-struct _mm_set_ps : TBaseWrapper<__m128> {
+struct _mm_set_ps : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- _mm_set_ps(const float x3, const float x2,
- const float x1, const float x0) {
- float32x2_t d0 = {x0, x1};
- float32x2_t d1 = {x2, x3};
- TQType<float32x4_t>::As(Value) = vcombine_f32(d0, d1);
- }
-};
-
+ _mm_set_ps(const float x3, const float x2,
+ const float x1, const float x0) {
+ float32x2_t d0 = {x0, x1};
+ float32x2_t d1 = {x2, x3};
+ TQType<float32x4_t>::As(Value) = vcombine_f32(d0, d1);
+ }
+};
+
Y_FORCE_INLINE __m128d _mm_set_pd(double d1, double d0) {
const float64x1_t p0 = {d0};
const float64x1_t p1 = {d1};
@@ -788,81 +788,81 @@ Y_FORCE_INLINE void _mm_store_pd(double* res, __m128d a) {
vst1q_f64(res, a);
}
-using _mm_add_ps = TWrapperDualF<float32x4_t, decltype(vaddq_f32), vaddq_f32>;
-using _mm_sub_ps = TWrapperDualF<float32x4_t, decltype(vsubq_f32), vsubq_f32>;
-using _mm_mul_ps = TWrapperDualF<float32x4_t, decltype(vmulq_f32), vmulq_f32>;
-using _mm_div_ps = TWrapperDualF<float32x4_t, decltype(vdivq_f32), vdivq_f32>;
-using _mm_cmpeq_ps = TWrapperDualF<float32x4_t, decltype(vceqq_f32), vceqq_f32>;
-using _mm_cmpgt_ps = TWrapperDualF<float32x4_t, decltype(vcgtq_f32), vcgtq_f32>;
-using _mm_max_ps = TWrapperDualF<float32x4_t, decltype(vmaxq_f32), vmaxq_f32>;
-using _mm_min_ps = TWrapperDualF<float32x4_t, decltype(vminq_f32), vminq_f32>;
-
+using _mm_add_ps = TWrapperDualF<float32x4_t, decltype(vaddq_f32), vaddq_f32>;
+using _mm_sub_ps = TWrapperDualF<float32x4_t, decltype(vsubq_f32), vsubq_f32>;
+using _mm_mul_ps = TWrapperDualF<float32x4_t, decltype(vmulq_f32), vmulq_f32>;
+using _mm_div_ps = TWrapperDualF<float32x4_t, decltype(vdivq_f32), vdivq_f32>;
+using _mm_cmpeq_ps = TWrapperDualF<float32x4_t, decltype(vceqq_f32), vceqq_f32>;
+using _mm_cmpgt_ps = TWrapperDualF<float32x4_t, decltype(vcgtq_f32), vcgtq_f32>;
+using _mm_max_ps = TWrapperDualF<float32x4_t, decltype(vmaxq_f32), vmaxq_f32>;
+using _mm_min_ps = TWrapperDualF<float32x4_t, decltype(vminq_f32), vminq_f32>;
+
using _mm_add_pd = TWrapperDualF<float64x2_t, decltype(vaddq_f64), vaddq_f64, __m128d>;
using _mm_sub_pd = TWrapperDualF<float64x2_t, decltype(vsubq_f64), vsubq_f64, __m128d>;
using _mm_mul_pd = TWrapperDualF<float64x2_t, decltype(vmulq_f64), vmulq_f64, __m128d>;
using _mm_div_pd = TWrapperDualF<float64x2_t, decltype(vdivq_f64), vdivq_f64, __m128d>;
-struct _mm_and_ps : TBaseWrapper<__m128> {
+struct _mm_and_ps : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- _mm_and_ps(const __m128& op1, const __m128& op2) {
- TQType<uint64x2_t>::As(Value) =
- vandq_u64(TQType<uint64x2_t>::As(op1),
- TQType<uint64x2_t>::As(op2));
- }
-};
-
+ _mm_and_ps(const __m128& op1, const __m128& op2) {
+ TQType<uint64x2_t>::As(Value) =
+ vandq_u64(TQType<uint64x2_t>::As(op1),
+ TQType<uint64x2_t>::As(op2));
+ }
+};
+
Y_FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
return vandq_u64(a, b);
}
Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m128& op3) {
- float64x2_t im0 =
- (float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
- float64x2_t im1 =
- (float64x2_t)vtrn2q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
- float64x2_t im2 =
- (float64x2_t)vtrn1q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
- float64x2_t im3 =
- (float64x2_t)vtrn2q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
-
- TQType<float64x2_t>::As(op0) = vtrn1q_f64(im0, im2);
- TQType<float64x2_t>::As(op1) = vtrn1q_f64(im1, im3);
- TQType<float64x2_t>::As(op2) = vtrn2q_f64(im0, im2);
- TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3);
-};
-
+ float64x2_t im0 =
+ (float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
+ float64x2_t im1 =
+ (float64x2_t)vtrn2q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
+ float64x2_t im2 =
+ (float64x2_t)vtrn1q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
+ float64x2_t im3 =
+ (float64x2_t)vtrn2q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
+
+ TQType<float64x2_t>::As(op0) = vtrn1q_f64(im0, im2);
+ TQType<float64x2_t>::As(op1) = vtrn1q_f64(im1, im3);
+ TQType<float64x2_t>::As(op2) = vtrn2q_f64(im0, im2);
+ TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3);
+};
+
Y_FORCE_INLINE __m128 _mm_castsi128_ps(__m128i op) {
- return reinterpret_cast<__m128&>(op);
-}
-
+ return reinterpret_cast<__m128&>(op);
+}
+
Y_FORCE_INLINE __m128i _mm_castps_si128(__m128 op) {
- return reinterpret_cast<__m128i&>(op);
-}
-
-template <typename TOpOut, typename TOpIn,
- typename TFunc, TFunc* func, typename... TParams>
-struct TCvtS2FWrapperSingle : TBaseWrapper<__m128> {
+ return reinterpret_cast<__m128i&>(op);
+}
+
+template <typename TOpOut, typename TOpIn,
+ typename TFunc, TFunc* func, typename... TParams>
+struct TCvtS2FWrapperSingle : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- TCvtS2FWrapperSingle(const __m128i& op, TParams... params) {
- TQType<TOpOut>::As(Value) =
- func(TQType<TOpIn>::As(op), params...);
- }
-};
-
-using _mm_cvtepi32_ps =
- TCvtS2FWrapperSingle<float32x4_t, int32x4_t,
- decltype(vcvtq_f32_s32), vcvtq_f32_s32>;
-
-template <typename TOpOut, typename TOpIn,
- typename TFunc, TFunc* func, typename... TParams>
-struct TCvtF2SWrapperSingle : TBaseWrapper<__m128i> {
+ TCvtS2FWrapperSingle(const __m128i& op, TParams... params) {
+ TQType<TOpOut>::As(Value) =
+ func(TQType<TOpIn>::As(op), params...);
+ }
+};
+
+using _mm_cvtepi32_ps =
+ TCvtS2FWrapperSingle<float32x4_t, int32x4_t,
+ decltype(vcvtq_f32_s32), vcvtq_f32_s32>;
+
+template <typename TOpOut, typename TOpIn,
+ typename TFunc, TFunc* func, typename... TParams>
+struct TCvtF2SWrapperSingle : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TCvtF2SWrapperSingle(const __m128& op, TParams... params) {
- TQType<TOpOut>::As(Value) =
- func(TQType<TOpIn>::As(op), params...);
- }
-};
-
+ TCvtF2SWrapperSingle(const __m128& op, TParams... params) {
+ TQType<TOpOut>::As(Value) =
+ func(TQType<TOpIn>::As(op), params...);
+ }
+};
+
inline __m128i _mm_cvtps_epi32(__m128 a) {
/// vcvtq_s32_f32 rounds to zero, but we need to round to the nearest.
static const float32x4_t half = vdupq_n_f32(0.5f);
@@ -874,26 +874,26 @@ inline __m128i _mm_cvtps_epi32(__m128 a) {
return res;
}
-using _mm_cvttps_epi32 =
- TCvtF2SWrapperSingle<int32x4_t, float32x4_t,
- decltype(vcvtq_s32_f32), vcvtq_s32_f32>;
-
+using _mm_cvttps_epi32 =
+ TCvtF2SWrapperSingle<int32x4_t, float32x4_t,
+ decltype(vcvtq_s32_f32), vcvtq_s32_f32>;
+
Y_FORCE_INLINE int
-_mm_movemask_ps(const __m128& op) {
- uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
- uint32x4_t bits = vandq_u32(op.AsUi32x4, mask);
- int32x4_t shifts = {-31, -30, -29, -28};
- bits = vshlq_u32(bits, shifts);
- return vaddvq_u32(bits);
-}
+_mm_movemask_ps(const __m128& op) {
+ uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+ uint32x4_t bits = vandq_u32(op.AsUi32x4, mask);
+ int32x4_t shifts = {-31, -30, -29, -28};
+ bits = vshlq_u32(bits, shifts);
+ return vaddvq_u32(bits);
+}
Y_FORCE_INLINE i64 _mm_cvtsi128_si64(__m128i a) {
return vgetq_lane_s64(a.AsSi64x2, 0);
}
-
-static inline void _mm_pause() {
+
+static inline void _mm_pause() {
__asm__ ("YIELD");
-}
+}
static inline __m128 _mm_rsqrt_ps(__m128 a) {
__m128 res;
diff --git a/library/cpp/sse/ut/test.cpp b/library/cpp/sse/ut/test.cpp
index 42a82a8cfa..33c999d284 100644
--- a/library/cpp/sse/ut/test.cpp
+++ b/library/cpp/sse/ut/test.cpp
@@ -1,10 +1,10 @@
-/*
- Unittests for all SSE instrinsics translated to NEON instrinsics or
- software implementation.
- Should be tested both on Intel and ARM64.
- */
-/* Author: Vitaliy Manushkin <agri@yandex-team.ru */
-
+/*
+ Unittests for all SSE instrinsics translated to NEON instrinsics or
+ software implementation.
+ Should be tested both on Intel and ARM64.
+ */
+/* Author: Vitaliy Manushkin <agri@yandex-team.ru */
+
#include <library/cpp/testing/unittest/registar.h>
#include <util/generic/typetraits.h>
@@ -13,35 +13,35 @@
#include <util/stream/output.h>
#include <algorithm>
-#include <array>
-#include <limits>
+#include <array>
+#include <limits>
#include <memory>
#include <type_traits>
#include <utility>
-
-template <typename TResult, typename TFunc, TFunc* func>
-struct T_mm_CallWrapper {
- TResult Value;
-
- template <typename... TParams>
- T_mm_CallWrapper(TParams&&... params) {
- Value = func(std::forward<TParams>(params)...);
- }
-
- operator TResult&() {
- return Value;
- }
-
- operator const TResult&() const {
- return Value;
- }
-};
-
-#if defined(_arm64_)
+
+template <typename TResult, typename TFunc, TFunc* func>
+struct T_mm_CallWrapper {
+ TResult Value;
+
+ template <typename... TParams>
+ T_mm_CallWrapper(TParams&&... params) {
+ Value = func(std::forward<TParams>(params)...);
+ }
+
+ operator TResult&() {
+ return Value;
+ }
+
+ operator const TResult&() const {
+ return Value;
+ }
+};
+
+#if defined(_arm64_)
#include "library/cpp/sse/sse2neon.h"
#elif defined(_i386_) || defined(_x86_64_)
-#include <xmmintrin.h>
-#include <emmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
#include <smmintrin.h>
#elif defined(_ppc64_)
#include "library/cpp/sse/powerpc.h"
@@ -54,10 +54,10 @@ struct T_mm_CallWrapper {
#define WrapF(T_mm_func) T_mm_func
#define WrapD(T_mm_func) T_mm_func
#elif defined(_ppc64_) || defined(_i386_) || defined(_x86_64_)
-#define Wrap(_mm_func) \
- T_mm_CallWrapper<__m128i, decltype(_mm_func), _mm_func>
-#define WrapF(_mm_func) \
- T_mm_CallWrapper<__m128, decltype(_mm_func), _mm_func>
+#define Wrap(_mm_func) \
+ T_mm_CallWrapper<__m128i, decltype(_mm_func), _mm_func>
+#define WrapF(_mm_func) \
+ T_mm_CallWrapper<__m128, decltype(_mm_func), _mm_func>
#define WrapD(_mm_func) \
T_mm_CallWrapper<__m128d, decltype(_mm_func), _mm_func>
using int8x16_t = std::array<i8, 16>;
@@ -70,69 +70,69 @@ using uint32x4_t = std::array<ui32, 4>;
using uint64x2_t = std::array<ui64, 2>;
using float32x4_t = std::array<float, 4>;
using float64x2_t = std::array<double, 2>;
-
+
template <typename TVectorType>
-struct TQType {
+struct TQType {
static TVectorType As(__m128i param) {
TVectorType value;
- _mm_storeu_si128((__m128i*)&value, param);
- return value;
- }
+ _mm_storeu_si128((__m128i*)&value, param);
+ return value;
+ }
static TVectorType As(__m128 param) {
TVectorType value;
- _mm_storeu_ps((float*)&value, param);
- return value;
- }
+ _mm_storeu_ps((float*)&value, param);
+ return value;
+ }
static TVectorType As(__m128d param) {
TVectorType value;
_mm_storeu_pd((double*)&value, param);
return value;
}
-};
-#endif
-
+};
+#endif
+
template <typename TVectorType>
-struct TFuncLoad;
+struct TFuncLoad;
template <typename TVectorType>
-struct TFuncStore;
-
-template <>
-struct TFuncLoad<__m128i> {
- __m128i Value;
-
- template <typename TPointer>
- TFuncLoad(TPointer* ptr) {
- Value = _mm_loadu_si128((__m128i*)ptr);
- }
-
- operator __m128i&() {
- return Value;
- }
-
- operator const __m128i&() const {
- return Value;
- }
-};
-
-template <>
-struct TFuncLoad<__m128> {
- __m128 Value;
-
- template <typename TPointer>
- TFuncLoad(TPointer* ptr) {
- Value = _mm_loadu_ps((float*)ptr);
- }
-
- operator __m128&() {
- return Value;
- }
-
- operator const __m128&() const {
- return Value;
- }
-};
-
-template <>
+struct TFuncStore;
+
+template <>
+struct TFuncLoad<__m128i> {
+ __m128i Value;
+
+ template <typename TPointer>
+ TFuncLoad(TPointer* ptr) {
+ Value = _mm_loadu_si128((__m128i*)ptr);
+ }
+
+ operator __m128i&() {
+ return Value;
+ }
+
+ operator const __m128i&() const {
+ return Value;
+ }
+};
+
+template <>
+struct TFuncLoad<__m128> {
+ __m128 Value;
+
+ template <typename TPointer>
+ TFuncLoad(TPointer* ptr) {
+ Value = _mm_loadu_ps((float*)ptr);
+ }
+
+ operator __m128&() {
+ return Value;
+ }
+
+ operator const __m128&() const {
+ return Value;
+ }
+};
+
+template <>
struct TFuncLoad<__m128d> {
__m128d Value;
@@ -151,153 +151,153 @@ struct TFuncLoad<__m128d> {
};
template <>
-struct TFuncStore<__m128i> {
- template <typename TPointer>
- TFuncStore(TPointer* ptr, __m128i Value) {
- _mm_storeu_si128((__m128i*)ptr, Value);
- }
-};
-
-template <>
-struct TFuncStore<__m128> {
- template <typename TPointer>
- TFuncStore(TPointer* ptr, __m128 Value) {
- _mm_storeu_ps((float*)ptr, Value);
- }
-};
-
-class TSSEEmulTest: public TTestBase {
-private:
- UNIT_TEST_SUITE(TSSEEmulTest);
- UNIT_TEST(Test_mm_load_si128);
- UNIT_TEST(Test_mm_loadu_si128);
+struct TFuncStore<__m128i> {
+ template <typename TPointer>
+ TFuncStore(TPointer* ptr, __m128i Value) {
+ _mm_storeu_si128((__m128i*)ptr, Value);
+ }
+};
+
+template <>
+struct TFuncStore<__m128> {
+ template <typename TPointer>
+ TFuncStore(TPointer* ptr, __m128 Value) {
+ _mm_storeu_ps((float*)ptr, Value);
+ }
+};
+
+class TSSEEmulTest: public TTestBase {
+private:
+ UNIT_TEST_SUITE(TSSEEmulTest);
+ UNIT_TEST(Test_mm_load_si128);
+ UNIT_TEST(Test_mm_loadu_si128);
UNIT_TEST(Test_mm_storeu_si128);
UNIT_TEST(Test_mm_loadu_si128_2);
UNIT_TEST(Test_mm_loadu_ps);
UNIT_TEST(Test_mm_storeu_ps);
-
+
UNIT_TEST(Test_mm_slli_epi16);
UNIT_TEST(Test_mm_slli_epi32);
UNIT_TEST(Test_mm_slli_epi64);
UNIT_TEST(Test_mm_slli_si128);
- UNIT_TEST(Test_mm_srli_epi16);
- UNIT_TEST(Test_mm_srli_epi32);
- UNIT_TEST(Test_mm_srli_epi64);
+ UNIT_TEST(Test_mm_srli_epi16);
+ UNIT_TEST(Test_mm_srli_epi32);
+ UNIT_TEST(Test_mm_srli_epi64);
UNIT_TEST(Test_mm_srli_si128);
-
+
UNIT_TEST(Test_mm_srai_epi16);
UNIT_TEST(Test_mm_srai_epi32);
UNIT_TEST(Test_mm_sll_epi16);
UNIT_TEST(Test_mm_sll_epi32);
UNIT_TEST(Test_mm_sll_epi64);
-
+
UNIT_TEST(Test_mm_srl_epi16);
UNIT_TEST(Test_mm_srl_epi32);
UNIT_TEST(Test_mm_srl_epi64);
- UNIT_TEST(Test_mm_add_epi16);
- UNIT_TEST(Test_mm_add_epi32);
- UNIT_TEST(Test_mm_add_epi64);
- UNIT_TEST(Test_mm_add_ps);
+ UNIT_TEST(Test_mm_add_epi16);
+ UNIT_TEST(Test_mm_add_epi32);
+ UNIT_TEST(Test_mm_add_epi64);
+ UNIT_TEST(Test_mm_add_ps);
UNIT_TEST(Test_mm_add_pd);
-
+
UNIT_TEST(Test_mm_madd_epi16);
- UNIT_TEST(Test_mm_sub_epi16);
- UNIT_TEST(Test_mm_sub_epi32);
- UNIT_TEST(Test_mm_sub_epi64);
- UNIT_TEST(Test_mm_sub_ps);
+ UNIT_TEST(Test_mm_sub_epi16);
+ UNIT_TEST(Test_mm_sub_epi32);
+ UNIT_TEST(Test_mm_sub_epi64);
+ UNIT_TEST(Test_mm_sub_ps);
UNIT_TEST(Test_mm_sub_pd);
-
- UNIT_TEST(Test_mm_mul_ps);
+
+ UNIT_TEST(Test_mm_mul_ps);
UNIT_TEST(Test_mm_mul_pd);
- UNIT_TEST(Test_mm_div_ps);
+ UNIT_TEST(Test_mm_div_ps);
UNIT_TEST(Test_mm_div_pd);
- UNIT_TEST(Test_mm_max_ps);
- UNIT_TEST(Test_mm_min_ps);
- UNIT_TEST(Test_mm_and_ps);
-
- UNIT_TEST(Test_mm_unpacklo_epi8);
- UNIT_TEST(Test_mm_unpackhi_epi8);
- UNIT_TEST(Test_mm_unpacklo_epi16);
- UNIT_TEST(Test_mm_unpackhi_epi16);
- UNIT_TEST(Test_mm_unpacklo_epi32);
- UNIT_TEST(Test_mm_unpackhi_epi32);
- UNIT_TEST(Test_mm_unpacklo_epi64);
- UNIT_TEST(Test_mm_unpackhi_epi64);
-
- UNIT_TEST(Test_mm_or_si128);
- UNIT_TEST(Test_mm_and_si128);
- UNIT_TEST(Test_mm_andnot_si128);
-
- UNIT_TEST(Test_mm_cmpeq_epi8);
- UNIT_TEST(Test_mm_cmpeq_epi16);
- UNIT_TEST(Test_mm_cmpeq_epi32);
- UNIT_TEST(Test_mm_cmpeq_ps);
-
- UNIT_TEST(Test_mm_cmpgt_epi8);
- UNIT_TEST(Test_mm_cmpgt_epi16);
- UNIT_TEST(Test_mm_cmpgt_epi32);
- UNIT_TEST(Test_mm_cmpgt_ps);
-
- UNIT_TEST(Test_mm_cmplt_epi8);
- UNIT_TEST(Test_mm_cmplt_epi16);
- UNIT_TEST(Test_mm_cmplt_epi32);
-
- UNIT_TEST(Test_mm_set1_epi8);
- UNIT_TEST(Test_mm_set1_epi16);
- UNIT_TEST(Test_mm_set1_epi32);
- UNIT_TEST(Test_mm_set1_ps);
+ UNIT_TEST(Test_mm_max_ps);
+ UNIT_TEST(Test_mm_min_ps);
+ UNIT_TEST(Test_mm_and_ps);
+
+ UNIT_TEST(Test_mm_unpacklo_epi8);
+ UNIT_TEST(Test_mm_unpackhi_epi8);
+ UNIT_TEST(Test_mm_unpacklo_epi16);
+ UNIT_TEST(Test_mm_unpackhi_epi16);
+ UNIT_TEST(Test_mm_unpacklo_epi32);
+ UNIT_TEST(Test_mm_unpackhi_epi32);
+ UNIT_TEST(Test_mm_unpacklo_epi64);
+ UNIT_TEST(Test_mm_unpackhi_epi64);
+
+ UNIT_TEST(Test_mm_or_si128);
+ UNIT_TEST(Test_mm_and_si128);
+ UNIT_TEST(Test_mm_andnot_si128);
+
+ UNIT_TEST(Test_mm_cmpeq_epi8);
+ UNIT_TEST(Test_mm_cmpeq_epi16);
+ UNIT_TEST(Test_mm_cmpeq_epi32);
+ UNIT_TEST(Test_mm_cmpeq_ps);
+
+ UNIT_TEST(Test_mm_cmpgt_epi8);
+ UNIT_TEST(Test_mm_cmpgt_epi16);
+ UNIT_TEST(Test_mm_cmpgt_epi32);
+ UNIT_TEST(Test_mm_cmpgt_ps);
+
+ UNIT_TEST(Test_mm_cmplt_epi8);
+ UNIT_TEST(Test_mm_cmplt_epi16);
+ UNIT_TEST(Test_mm_cmplt_epi32);
+
+ UNIT_TEST(Test_mm_set1_epi8);
+ UNIT_TEST(Test_mm_set1_epi16);
+ UNIT_TEST(Test_mm_set1_epi32);
+ UNIT_TEST(Test_mm_set1_ps);
UNIT_TEST(Test_mm_set_ps1);
-
- UNIT_TEST(Test_mm_setzero_si128);
- UNIT_TEST(Test_mm_setzero_ps);
+
+ UNIT_TEST(Test_mm_setzero_si128);
+ UNIT_TEST(Test_mm_setzero_ps);
UNIT_TEST(Test_mm_setzero_pd);
-
- UNIT_TEST(Test_mm_storel_epi64);
- UNIT_TEST(Test_mm_loadl_epi64);
-
+
+ UNIT_TEST(Test_mm_storel_epi64);
+ UNIT_TEST(Test_mm_loadl_epi64);
+
UNIT_TEST(Test_mm_loadl_pd);
UNIT_TEST(Test_mm_loadh_pd);
UNIT_TEST(Test_mm_cvtsd_f64);
- UNIT_TEST(Test_mm_shuffle_epi32);
- UNIT_TEST(Test_mm_movemask_epi8);
- UNIT_TEST(Test_mm_cvtsi128_si32);
+ UNIT_TEST(Test_mm_shuffle_epi32);
+ UNIT_TEST(Test_mm_movemask_epi8);
+ UNIT_TEST(Test_mm_cvtsi128_si32);
UNIT_TEST(Test_mm_cvtsi128_si64);
-
- UNIT_TEST(Test_mm_set_epi16);
- UNIT_TEST(Test_mm_set_epi32);
- UNIT_TEST(Test_mm_set_ps);
+
+ UNIT_TEST(Test_mm_set_epi16);
+ UNIT_TEST(Test_mm_set_epi32);
+ UNIT_TEST(Test_mm_set_ps);
UNIT_TEST(Test_mm_set_pd);
-
- UNIT_TEST(Test_mm_cvtsi32_si128);
+
+ UNIT_TEST(Test_mm_cvtsi32_si128);
UNIT_TEST(Test_mm_cvtsi64_si128);
-
- UNIT_TEST(Test_mm_packs_epi16);
- UNIT_TEST(Test_mm_packs_epi32);
- UNIT_TEST(Test_mm_packus_epi16);
-
- UNIT_TEST(Test_mm_extract_epi16);
+
+ UNIT_TEST(Test_mm_packs_epi16);
+ UNIT_TEST(Test_mm_packs_epi32);
+ UNIT_TEST(Test_mm_packus_epi16);
+
+ UNIT_TEST(Test_mm_extract_epi16);
UNIT_TEST(Test_mm_extract_epi8);
UNIT_TEST(Test_mm_extract_epi32);
UNIT_TEST(Test_mm_extract_epi64);
-
- UNIT_TEST(Test_MM_TRANSPOSE4_PS);
- UNIT_TEST(Test_mm_movemask_ps);
+
+ UNIT_TEST(Test_MM_TRANSPOSE4_PS);
+ UNIT_TEST(Test_mm_movemask_ps);
UNIT_TEST(Test_mm_movemask_ps_2);
-
- UNIT_TEST(Test_mm_cvtepi32_ps);
- UNIT_TEST(Test_mm_cvtps_epi32);
- UNIT_TEST(Test_mm_cvttps_epi32);
-
- UNIT_TEST(Test_mm_castsi128_ps);
- UNIT_TEST(Test_mm_castps_si128);
-
- UNIT_TEST(Test_mm_mul_epu32);
-
+
+ UNIT_TEST(Test_mm_cvtepi32_ps);
+ UNIT_TEST(Test_mm_cvtps_epi32);
+ UNIT_TEST(Test_mm_cvttps_epi32);
+
+ UNIT_TEST(Test_mm_castsi128_ps);
+ UNIT_TEST(Test_mm_castps_si128);
+
+ UNIT_TEST(Test_mm_mul_epu32);
+
UNIT_TEST(Test_mm_cmpunord_ps);
UNIT_TEST(Test_mm_andnot_ps);
UNIT_TEST(Test_mm_shuffle_ps);
@@ -310,36 +310,36 @@ private:
UNIT_TEST(Test_mm_rsqrt_ps);
UNIT_TEST(Test_matrixnet_powerpc);
- UNIT_TEST_SUITE_END();
-
-public:
- void Test_mm_load_si128();
- void Test_mm_loadu_si128();
+ UNIT_TEST_SUITE_END();
+
+public:
+ void Test_mm_load_si128();
+ void Test_mm_loadu_si128();
void Test_mm_storeu_si128();
void Test_mm_loadu_si128_2();
void Test_mm_loadu_ps();
void Test_mm_storeu_ps();
-
- template <typename TElem, int bits, int elemCount,
+
+ template <typename TElem, int bits, int elemCount,
typename TFunc, typename TShifter, typename TOp, typename TElemFunc>
- void Test_mm_shifter_epiXX();
-
+ void Test_mm_shifter_epiXX();
+
enum class EDirection {
Left,
Right
};
-
+
struct TShiftRes {
__m128i Value[17];
};
void Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo);
- void Test_mm_slli_epi16();
- void Test_mm_slli_epi32();
- void Test_mm_slli_epi64();
+ void Test_mm_slli_epi16();
+ void Test_mm_slli_epi32();
+ void Test_mm_slli_epi64();
void Test_mm_slli_si128();
-
+
void Test_mm_srli_epi16();
void Test_mm_srli_epi32();
void Test_mm_srli_epi64();
@@ -356,134 +356,134 @@ public:
void Test_mm_srl_epi32();
void Test_mm_srl_epi64();
- void Test_mm_add_epi8();
- void Test_mm_add_epi16();
- void Test_mm_add_epi32();
- void Test_mm_add_epi64();
- void Test_mm_add_ps();
+ void Test_mm_add_epi8();
+ void Test_mm_add_epi16();
+ void Test_mm_add_epi32();
+ void Test_mm_add_epi64();
+ void Test_mm_add_ps();
void Test_mm_add_pd();
-
+
void Test_mm_madd_epi16();
- void Test_mm_sub_epi8();
- void Test_mm_sub_epi16();
- void Test_mm_sub_epi32();
- void Test_mm_sub_epi64();
- void Test_mm_sub_ps();
+ void Test_mm_sub_epi8();
+ void Test_mm_sub_epi16();
+ void Test_mm_sub_epi32();
+ void Test_mm_sub_epi64();
+ void Test_mm_sub_ps();
void Test_mm_sub_pd();
-
- void Test_mm_mul_ps();
+
+ void Test_mm_mul_ps();
void Test_mm_mul_pd();
- void Test_mm_div_ps();
+ void Test_mm_div_ps();
void Test_mm_div_pd();
- void Test_mm_max_ps();
- void Test_mm_min_ps();
- void Test_mm_and_ps();
-
- template <typename TElem, int bits, int elemCount, int shift,
- typename TFunc, typename TOp>
- void Test_mm_unpack_epiXX();
- void Test_mm_unpacklo_epi8();
- void Test_mm_unpackhi_epi8();
- void Test_mm_unpacklo_epi16();
- void Test_mm_unpackhi_epi16();
- void Test_mm_unpacklo_epi32();
- void Test_mm_unpackhi_epi32();
- void Test_mm_unpacklo_epi64();
- void Test_mm_unpackhi_epi64();
-
- template <typename TElem, unsigned elemCount,
- typename TFunc, typename TElemFunc,
+ void Test_mm_max_ps();
+ void Test_mm_min_ps();
+ void Test_mm_and_ps();
+
+ template <typename TElem, int bits, int elemCount, int shift,
+ typename TFunc, typename TOp>
+ void Test_mm_unpack_epiXX();
+ void Test_mm_unpacklo_epi8();
+ void Test_mm_unpackhi_epi8();
+ void Test_mm_unpacklo_epi16();
+ void Test_mm_unpackhi_epi16();
+ void Test_mm_unpacklo_epi32();
+ void Test_mm_unpackhi_epi32();
+ void Test_mm_unpacklo_epi64();
+ void Test_mm_unpackhi_epi64();
+
+ template <typename TElem, unsigned elemCount,
+ typename TFunc, typename TElemFunc,
typename TOp, typename TVectorType = __m128i>
- void Test_mm_dualop();
-
- template <typename TElem, unsigned elemCount,
- typename TFunc, typename TElemFunc,
+ void Test_mm_dualop();
+
+ template <typename TElem, unsigned elemCount,
+ typename TFunc, typename TElemFunc,
typename TOp, typename TVectorType = __m128i>
- void Test_mm_dualcmp();
-
- void Test_mm_or_si128();
- void Test_mm_and_si128();
- void Test_mm_andnot_si128();
-
- void Test_mm_cmpeq_epi8();
- void Test_mm_cmpeq_epi16();
- void Test_mm_cmpeq_epi32();
- void Test_mm_cmpeq_ps();
-
- void Test_mm_cmpgt_epi8();
- void Test_mm_cmpgt_epi16();
- void Test_mm_cmpgt_epi32();
- void Test_mm_cmpgt_ps();
-
- void Test_mm_cmplt_epi8();
- void Test_mm_cmplt_epi16();
- void Test_mm_cmplt_epi32();
-
- template <typename TElem, int elemCount,
+ void Test_mm_dualcmp();
+
+ void Test_mm_or_si128();
+ void Test_mm_and_si128();
+ void Test_mm_andnot_si128();
+
+ void Test_mm_cmpeq_epi8();
+ void Test_mm_cmpeq_epi16();
+ void Test_mm_cmpeq_epi32();
+ void Test_mm_cmpeq_ps();
+
+ void Test_mm_cmpgt_epi8();
+ void Test_mm_cmpgt_epi16();
+ void Test_mm_cmpgt_epi32();
+ void Test_mm_cmpgt_ps();
+
+ void Test_mm_cmplt_epi8();
+ void Test_mm_cmplt_epi16();
+ void Test_mm_cmplt_epi32();
+
+ template <typename TElem, int elemCount,
typename TFunc, typename TOp, typename TVectorType>
- void Test_mm_setter_epiXX();
- void Test_mm_set1_epi8();
- void Test_mm_set1_epi16();
- void Test_mm_set1_epi32();
- void Test_mm_set1_ps();
+ void Test_mm_setter_epiXX();
+ void Test_mm_set1_epi8();
+ void Test_mm_set1_epi16();
+ void Test_mm_set1_epi32();
+ void Test_mm_set1_ps();
void Test_mm_set_ps1();
-
- void Test_mm_setzero_si128();
- void Test_mm_setzero_ps();
+
+ void Test_mm_setzero_si128();
+ void Test_mm_setzero_ps();
void Test_mm_setzero_pd();
-
- void Test_mm_loadl_epi64();
- void Test_mm_storel_epi64();
-
+
+ void Test_mm_loadl_epi64();
+ void Test_mm_storel_epi64();
+
void Test_mm_loadl_pd();
void Test_mm_loadh_pd();
void Test_mm_cvtsd_f64();
- void Test_mm_shuffle_epi32();
- void Test_mm_movemask_epi8();
- void Test_mm_cvtsi128_si32();
+ void Test_mm_shuffle_epi32();
+ void Test_mm_movemask_epi8();
+ void Test_mm_cvtsi128_si32();
void Test_mm_cvtsi128_si64();
-
- void Test_mm_set_epi16();
- void Test_mm_set_epi32();
- void Test_mm_set_ps();
+
+ void Test_mm_set_epi16();
+ void Test_mm_set_epi32();
+ void Test_mm_set_ps();
void Test_mm_set_pd();
-
- void Test_mm_cvtsi32_si128();
+
+ void Test_mm_cvtsi32_si128();
void Test_mm_cvtsi64_si128();
-
- template <typename TElem, typename TNarrow, unsigned elemCount,
- typename TFunc>
- void Test_mm_packs_epiXX();
- void Test_mm_packs_epi16();
- void Test_mm_packs_epi32();
- void Test_mm_packus_epi16();
-
- void Test_mm_extract_epi16();
+
+ template <typename TElem, typename TNarrow, unsigned elemCount,
+ typename TFunc>
+ void Test_mm_packs_epiXX();
+ void Test_mm_packs_epi16();
+ void Test_mm_packs_epi32();
+ void Test_mm_packus_epi16();
+
+ void Test_mm_extract_epi16();
void Test_mm_extract_epi8();
void Test_mm_extract_epi32();
void Test_mm_extract_epi64();
-
- void Test_MM_TRANSPOSE4_PS();
- void Test_mm_movemask_ps();
+
+ void Test_MM_TRANSPOSE4_PS();
+ void Test_mm_movemask_ps();
void Test_mm_movemask_ps_2();
-
- template <typename TFrom, typename TTo, unsigned elemCount,
- typename TLoadVector, typename TResultVector,
- typename TElemFunc, typename TFunc, typename TOp>
- void Test_mm_convertop();
- void Test_mm_cvtepi32_ps();
- void Test_mm_cvtps_epi32();
- void Test_mm_cvttps_epi32();
-
- template <typename TLoadVector, typename TCastVector,
- typename TFunc, TFunc* func>
- void Test_mm_castXX();
- void Test_mm_castsi128_ps();
- void Test_mm_castps_si128();
-
- void Test_mm_mul_epu32();
+
+ template <typename TFrom, typename TTo, unsigned elemCount,
+ typename TLoadVector, typename TResultVector,
+ typename TElemFunc, typename TFunc, typename TOp>
+ void Test_mm_convertop();
+ void Test_mm_cvtepi32_ps();
+ void Test_mm_cvtps_epi32();
+ void Test_mm_cvttps_epi32();
+
+ template <typename TLoadVector, typename TCastVector,
+ typename TFunc, TFunc* func>
+ void Test_mm_castXX();
+ void Test_mm_castsi128_ps();
+ void Test_mm_castps_si128();
+
+ void Test_mm_mul_epu32();
void Test_mm_cmpunord_ps();
void Test_mm_store_ss();
@@ -497,30 +497,30 @@ public:
void Test_mm_rsqrt_ps();
void Test_mm_rsqrt_ss();
void Test_matrixnet_powerpc();
-};
-
-UNIT_TEST_SUITE_REGISTRATION(TSSEEmulTest);
-
-void TSSEEmulTest::Test_mm_load_si128() {
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TSSEEmulTest);
+
+void TSSEEmulTest::Test_mm_load_si128() {
alignas(16) char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- __m128i value = _mm_load_si128((__m128i*)&data);
- UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[0], 0xAABB2211CCFF00AAUL);
- UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[1], 0x1C66775588449933UL);
-}
-
-void TSSEEmulTest::Test_mm_loadu_si128() {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ __m128i value = _mm_load_si128((__m128i*)&data);
+ UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[0], 0xAABB2211CCFF00AAUL);
+ UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[1], 0x1C66775588449933UL);
+}
+
+void TSSEEmulTest::Test_mm_loadu_si128() {
alignas(16) char data[17] = {
- '\x66',
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- UNIT_ASSERT((ui64(&data[1]) & 0x1) == 0x1);
- __m128i value = _mm_loadu_si128((__m128i*)&data[1]);
- UNIT_ASSERT(TQType<uint64x2_t>::As(value)[0] == 0xAABB2211CCFF00AAUL);
- UNIT_ASSERT(TQType<uint64x2_t>::As(value)[1] == 0x1C66775588449933UL);
-}
-
+ '\x66',
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ UNIT_ASSERT((ui64(&data[1]) & 0x1) == 0x1);
+ __m128i value = _mm_loadu_si128((__m128i*)&data[1]);
+ UNIT_ASSERT(TQType<uint64x2_t>::As(value)[0] == 0xAABB2211CCFF00AAUL);
+ UNIT_ASSERT(TQType<uint64x2_t>::As(value)[1] == 0x1C66775588449933UL);
+}
+
void TSSEEmulTest::Test_mm_storeu_si128() {
alignas(16) unsigned char stub[32] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -640,32 +640,32 @@ unsigned MakeNumber<unsigned>(unsigned number) {
return number;
}
-template <typename TElem, int bits, int elemCount,
+template <typename TElem, int bits, int elemCount,
typename TFunc, typename TShifter, typename TOp, typename TElemFunc>
-void TSSEEmulTest::Test_mm_shifter_epiXX() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- TElem* dataw = reinterpret_cast<TElem*>(&data);
-
- __m128i value = _mm_loadu_si128((__m128i*)&data);
-
+void TSSEEmulTest::Test_mm_shifter_epiXX() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ TElem* dataw = reinterpret_cast<TElem*>(&data);
+
+ __m128i value = _mm_loadu_si128((__m128i*)&data);
+
for (unsigned shifter = 0; shifter <= bits; ++shifter) {
- TElem shiftedData[elemCount];
+ TElem shiftedData[elemCount];
for (unsigned i = 0; i < elemCount; ++i) {
- shiftedData[i] = TElemFunc::Call(dataw[i], shifter);
+ shiftedData[i] = TElemFunc::Call(dataw[i], shifter);
}
-
+
const TShifter adhoc_shifter = MakeNumber<TShifter>(shifter);
__m128i result = TFunc(value, adhoc_shifter);
for (unsigned i = 0; i < elemCount; ++i) {
- UNIT_ASSERT_EQUAL(shiftedData[i], TQType<TOp>::As(result)[i]);
+ UNIT_ASSERT_EQUAL(shiftedData[i], TQType<TOp>::As(result)[i]);
}
- }
-}
-
+ }
+}
+
void TSSEEmulTest::Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo) {
const char data[48] = {
@@ -713,52 +713,52 @@ struct THelperASHR {
}
};
-template <typename TElem>
-struct THelperSHR {
- static TElem Call(const TElem op, const int shift) {
+template <typename TElem>
+struct THelperSHR {
+ static TElem Call(const TElem op, const int shift) {
constexpr int nBitsInOp = sizeof(op) * CHAR_BIT;
return shift < nBitsInOp ? op >> shift : 0;
- }
-};
-
-void TSSEEmulTest::Test_mm_srli_epi16() {
+ }
+};
+
+void TSSEEmulTest::Test_mm_srli_epi16() {
Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_srli_epi16), unsigned, uint16x8_t,
- THelperSHR<ui16>>();
-}
-
-void TSSEEmulTest::Test_mm_srli_epi32() {
+ THelperSHR<ui16>>();
+}
+
+void TSSEEmulTest::Test_mm_srli_epi32() {
Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_srli_epi32), unsigned, uint32x4_t,
- THelperSHR<ui32>>();
-}
-
-void TSSEEmulTest::Test_mm_srli_epi64() {
+ THelperSHR<ui32>>();
+}
+
+void TSSEEmulTest::Test_mm_srli_epi64() {
Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_srli_epi64), unsigned, uint64x2_t,
- THelperSHR<ui64>>();
-}
-
-template <typename TElem>
-struct THelperSHL {
- static TElem Call(const TElem op, const int shift) {
+ THelperSHR<ui64>>();
+}
+
+template <typename TElem>
+struct THelperSHL {
+ static TElem Call(const TElem op, const int shift) {
constexpr int nBitsInOp = sizeof(op) * CHAR_BIT;
return shift < nBitsInOp ? op << shift : 0;
- }
-};
-
-void TSSEEmulTest::Test_mm_slli_epi16() {
+ }
+};
+
+void TSSEEmulTest::Test_mm_slli_epi16() {
Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_slli_epi16), unsigned, uint16x8_t,
- THelperSHL<ui16>>();
-}
-
-void TSSEEmulTest::Test_mm_slli_epi32() {
+ THelperSHL<ui16>>();
+}
+
+void TSSEEmulTest::Test_mm_slli_epi32() {
Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_slli_epi32), unsigned, uint32x4_t,
- THelperSHL<ui32>>();
-}
-
-void TSSEEmulTest::Test_mm_slli_epi64() {
+ THelperSHL<ui32>>();
+}
+
+void TSSEEmulTest::Test_mm_slli_epi64() {
Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_slli_epi64), unsigned, uint64x2_t,
- THelperSHL<ui64>>();
-}
-
+ THelperSHL<ui64>>();
+}
+
void TSSEEmulTest::Test_mm_slli_si128() {
Test_mm_byte_shifter(EDirection::Left, [] (__m128i a) -> TShiftRes {
TShiftRes res;
@@ -849,30 +849,30 @@ void TSSEEmulTest::Test_mm_sll_epi64() {
THelperSHL<ui64>>();
}
-template <typename TElem>
-struct THelperAdd {
- static TElem Call(const TElem op1, const TElem op2) {
- return op1 + op2;
- }
-};
-
-void TSSEEmulTest::Test_mm_add_epi16() {
- Test_mm_dualop<ui16, 8, Wrap(_mm_add_epi16), THelperAdd<ui16>, uint16x8_t>();
-}
-
-void TSSEEmulTest::Test_mm_add_epi32() {
- Test_mm_dualop<ui32, 4, Wrap(_mm_add_epi32), THelperAdd<ui32>, uint32x4_t>();
-}
-
-void TSSEEmulTest::Test_mm_add_epi64() {
- Test_mm_dualop<ui64, 2, Wrap(_mm_add_epi64), THelperAdd<ui64>, uint64x2_t>();
-}
-
-void TSSEEmulTest::Test_mm_add_ps() {
- Test_mm_dualop<float, 2, WrapF(_mm_add_ps),
- THelperAdd<float>, float32x4_t, __m128>();
-}
-
+template <typename TElem>
+struct THelperAdd {
+ static TElem Call(const TElem op1, const TElem op2) {
+ return op1 + op2;
+ }
+};
+
+void TSSEEmulTest::Test_mm_add_epi16() {
+ Test_mm_dualop<ui16, 8, Wrap(_mm_add_epi16), THelperAdd<ui16>, uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_add_epi32() {
+ Test_mm_dualop<ui32, 4, Wrap(_mm_add_epi32), THelperAdd<ui32>, uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_add_epi64() {
+ Test_mm_dualop<ui64, 2, Wrap(_mm_add_epi64), THelperAdd<ui64>, uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_add_ps() {
+ Test_mm_dualop<float, 2, WrapF(_mm_add_ps),
+ THelperAdd<float>, float32x4_t, __m128>();
+}
+
void TSSEEmulTest::Test_mm_add_pd() {
Test_mm_dualop<double, 2, WrapD(_mm_add_pd),
THelperAdd<double>, float64x2_t, __m128d>();
@@ -904,44 +904,44 @@ void TSSEEmulTest::Test_mm_madd_epi16() {
}
-template <typename TElem>
-struct THelperSub {
- static TElem Call(const TElem op1, const TElem op2) {
- return op1 - op2;
- }
-};
-
-void TSSEEmulTest::Test_mm_sub_epi16() {
- Test_mm_dualop<ui16, 8, Wrap(_mm_sub_epi16), THelperSub<ui16>, uint16x8_t>();
-}
-
-void TSSEEmulTest::Test_mm_sub_epi32() {
- Test_mm_dualop<ui32, 4, Wrap(_mm_sub_epi32), THelperSub<ui32>, uint32x4_t>();
-}
-
-void TSSEEmulTest::Test_mm_sub_epi64() {
- Test_mm_dualop<ui64, 2, Wrap(_mm_sub_epi64), THelperSub<ui64>, uint64x2_t>();
-}
-
-void TSSEEmulTest::Test_mm_sub_ps() {
- Test_mm_dualop<float, 4, WrapF(_mm_sub_ps), THelperSub<float>,
- float32x4_t, __m128>();
-}
-
+template <typename TElem>
+struct THelperSub {
+ static TElem Call(const TElem op1, const TElem op2) {
+ return op1 - op2;
+ }
+};
+
+void TSSEEmulTest::Test_mm_sub_epi16() {
+ Test_mm_dualop<ui16, 8, Wrap(_mm_sub_epi16), THelperSub<ui16>, uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_sub_epi32() {
+ Test_mm_dualop<ui32, 4, Wrap(_mm_sub_epi32), THelperSub<ui32>, uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_sub_epi64() {
+ Test_mm_dualop<ui64, 2, Wrap(_mm_sub_epi64), THelperSub<ui64>, uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_sub_ps() {
+ Test_mm_dualop<float, 4, WrapF(_mm_sub_ps), THelperSub<float>,
+ float32x4_t, __m128>();
+}
+
void TSSEEmulTest::Test_mm_sub_pd() {
Test_mm_dualop<double, 2, WrapD(_mm_sub_pd), THelperSub<double>,
float64x2_t, __m128d>();
}
-void TSSEEmulTest::Test_mm_mul_ps() {
- struct THelper {
- static float Call(const float op1, const float op2) {
- return op1 * op2;
- }
- };
- Test_mm_dualop<float, 4, WrapF(_mm_mul_ps), THelper, float32x4_t, __m128>();
-}
-
+void TSSEEmulTest::Test_mm_mul_ps() {
+ struct THelper {
+ static float Call(const float op1, const float op2) {
+ return op1 * op2;
+ }
+ };
+ Test_mm_dualop<float, 4, WrapF(_mm_mul_ps), THelper, float32x4_t, __m128>();
+}
+
void TSSEEmulTest::Test_mm_mul_pd() {
struct THelper {
static double Call(const double op1, const double op2) {
@@ -951,15 +951,15 @@ void TSSEEmulTest::Test_mm_mul_pd() {
Test_mm_dualop<double, 2, WrapD(_mm_mul_pd), THelper, float64x2_t, __m128d>();
}
-void TSSEEmulTest::Test_mm_div_ps() {
- struct THelper {
- static float Call(const float op1, const float op2) {
- return op1 / op2;
- }
- };
- Test_mm_dualop<float, 4, WrapF(_mm_div_ps), THelper, float32x4_t, __m128>();
-}
-
+void TSSEEmulTest::Test_mm_div_ps() {
+ struct THelper {
+ static float Call(const float op1, const float op2) {
+ return op1 / op2;
+ }
+ };
+ Test_mm_dualop<float, 4, WrapF(_mm_div_ps), THelper, float32x4_t, __m128>();
+}
+
void TSSEEmulTest::Test_mm_div_pd() {
struct THelper {
static double Call(const double op1, const double op2) {
@@ -969,441 +969,441 @@ void TSSEEmulTest::Test_mm_div_pd() {
Test_mm_dualop<double, 2, WrapD(_mm_div_pd), THelper, float64x2_t, __m128d>();
}
-void TSSEEmulTest::Test_mm_max_ps() {
- struct THelper {
- static float Call(const float op1, const float op2) {
- return std::max(op1, op2);
- }
- };
- Test_mm_dualop<float, 4, WrapF(_mm_max_ps), THelper, float32x4_t, __m128>();
-}
-
-void TSSEEmulTest::Test_mm_min_ps() {
- struct THelper {
- static float Call(const float op1, const float op2) {
- return std::min(op1, op2);
- }
- };
- Test_mm_dualop<float, 4, WrapF(_mm_min_ps), THelper, float32x4_t, __m128>();
-}
-
-void TSSEEmulTest::Test_mm_and_ps() {
- struct THelper {
- static float Call(const float op1, const float op2) {
- union Cast {
- unsigned int AsUInt;
- float AsFloat;
- };
- Cast v1, v2, result;
- v1.AsFloat = op1;
- v2.AsFloat = op2;
- result.AsUInt = v1.AsUInt & v2.AsUInt;
- return result.AsFloat;
- }
- };
- Test_mm_dualcmp<float, 4, WrapF(_mm_and_ps),
- THelper, float32x4_t, __m128>();
-}
-
-template <typename TElem, int bits, int elemCount, int shift,
- typename TFunc, typename TOp>
-void TSSEEmulTest::Test_mm_unpack_epiXX() {
- char data1[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- char data2[16] = {
- '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
- '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
- TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
- TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
-
- __m128i value1 = _mm_loadu_si128((__m128i*)&data1);
- __m128i value2 = _mm_loadu_si128((__m128i*)&data2);
-
- TElem zippedData[elemCount];
- for (unsigned i = 0; i < elemCount / 2; ++i) {
- zippedData[i * 2] = dataw1[i + shift];
- zippedData[i * 2 + 1] = dataw2[i + shift];
- }
- __m128i result = TFunc(value1, value2);
-
- for (unsigned i = 0; i < elemCount / 2; ++i) {
- UNIT_ASSERT_EQUAL(zippedData[i * 2], TQType<TOp>::As(result)[i * 2]);
- UNIT_ASSERT_EQUAL(zippedData[i * 2 + 1],
- TQType<TOp>::As(result)[i * 2 + 1]);
- }
-}
-
-void TSSEEmulTest::Test_mm_unpacklo_epi8() {
- Test_mm_unpack_epiXX<ui8, 8, 16, 0, Wrap(_mm_unpacklo_epi8), uint8x16_t>();
-}
-
-void TSSEEmulTest::Test_mm_unpackhi_epi8() {
- Test_mm_unpack_epiXX<ui8, 8, 16, 8, Wrap(_mm_unpackhi_epi8), uint8x16_t>();
-}
-
-void TSSEEmulTest::Test_mm_unpacklo_epi16() {
- Test_mm_unpack_epiXX<ui16, 16, 8, 0, Wrap(_mm_unpacklo_epi16), uint16x8_t>();
-}
-
-void TSSEEmulTest::Test_mm_unpackhi_epi16() {
- Test_mm_unpack_epiXX<ui16, 16, 8, 4, Wrap(_mm_unpackhi_epi16), uint16x8_t>();
-}
-
-void TSSEEmulTest::Test_mm_unpacklo_epi32() {
- Test_mm_unpack_epiXX<ui32, 32, 4, 0, Wrap(_mm_unpacklo_epi32), uint32x4_t>();
-}
-
-void TSSEEmulTest::Test_mm_unpackhi_epi32() {
- Test_mm_unpack_epiXX<ui32, 32, 4, 2, Wrap(_mm_unpackhi_epi32), uint32x4_t>();
-}
-
-void TSSEEmulTest::Test_mm_unpacklo_epi64() {
- Test_mm_unpack_epiXX<ui64, 64, 2, 0, Wrap(_mm_unpacklo_epi64), uint64x2_t>();
-}
-
-void TSSEEmulTest::Test_mm_unpackhi_epi64() {
- Test_mm_unpack_epiXX<ui64, 64, 2, 1, Wrap(_mm_unpackhi_epi64), uint64x2_t>();
-}
-
-template <typename TElem, unsigned elemCount,
- typename TFunc, typename TElemFunc,
+void TSSEEmulTest::Test_mm_max_ps() {
+ struct THelper {
+ static float Call(const float op1, const float op2) {
+ return std::max(op1, op2);
+ }
+ };
+ Test_mm_dualop<float, 4, WrapF(_mm_max_ps), THelper, float32x4_t, __m128>();
+}
+
+void TSSEEmulTest::Test_mm_min_ps() {
+ struct THelper {
+ static float Call(const float op1, const float op2) {
+ return std::min(op1, op2);
+ }
+ };
+ Test_mm_dualop<float, 4, WrapF(_mm_min_ps), THelper, float32x4_t, __m128>();
+}
+
+void TSSEEmulTest::Test_mm_and_ps() {
+ struct THelper {
+ static float Call(const float op1, const float op2) {
+ union Cast {
+ unsigned int AsUInt;
+ float AsFloat;
+ };
+ Cast v1, v2, result;
+ v1.AsFloat = op1;
+ v2.AsFloat = op2;
+ result.AsUInt = v1.AsUInt & v2.AsUInt;
+ return result.AsFloat;
+ }
+ };
+ Test_mm_dualcmp<float, 4, WrapF(_mm_and_ps),
+ THelper, float32x4_t, __m128>();
+}
+
+template <typename TElem, int bits, int elemCount, int shift,
+ typename TFunc, typename TOp>
+void TSSEEmulTest::Test_mm_unpack_epiXX() {
+ char data1[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ char data2[16] = {
+ '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+ '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+ TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
+ TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
+
+ __m128i value1 = _mm_loadu_si128((__m128i*)&data1);
+ __m128i value2 = _mm_loadu_si128((__m128i*)&data2);
+
+ TElem zippedData[elemCount];
+ for (unsigned i = 0; i < elemCount / 2; ++i) {
+ zippedData[i * 2] = dataw1[i + shift];
+ zippedData[i * 2 + 1] = dataw2[i + shift];
+ }
+ __m128i result = TFunc(value1, value2);
+
+ for (unsigned i = 0; i < elemCount / 2; ++i) {
+ UNIT_ASSERT_EQUAL(zippedData[i * 2], TQType<TOp>::As(result)[i * 2]);
+ UNIT_ASSERT_EQUAL(zippedData[i * 2 + 1],
+ TQType<TOp>::As(result)[i * 2 + 1]);
+ }
+}
+
+void TSSEEmulTest::Test_mm_unpacklo_epi8() {
+ Test_mm_unpack_epiXX<ui8, 8, 16, 0, Wrap(_mm_unpacklo_epi8), uint8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpackhi_epi8() {
+ Test_mm_unpack_epiXX<ui8, 8, 16, 8, Wrap(_mm_unpackhi_epi8), uint8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpacklo_epi16() {
+ Test_mm_unpack_epiXX<ui16, 16, 8, 0, Wrap(_mm_unpacklo_epi16), uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpackhi_epi16() {
+ Test_mm_unpack_epiXX<ui16, 16, 8, 4, Wrap(_mm_unpackhi_epi16), uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpacklo_epi32() {
+ Test_mm_unpack_epiXX<ui32, 32, 4, 0, Wrap(_mm_unpacklo_epi32), uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpackhi_epi32() {
+ Test_mm_unpack_epiXX<ui32, 32, 4, 2, Wrap(_mm_unpackhi_epi32), uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpacklo_epi64() {
+ Test_mm_unpack_epiXX<ui64, 64, 2, 0, Wrap(_mm_unpacklo_epi64), uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpackhi_epi64() {
+ Test_mm_unpack_epiXX<ui64, 64, 2, 1, Wrap(_mm_unpackhi_epi64), uint64x2_t>();
+}
+
+template <typename TElem, unsigned elemCount,
+ typename TFunc, typename TElemFunc,
typename TOp, typename TVectorType>
-void TSSEEmulTest::Test_mm_dualop() {
- char data1[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- char data2[16] = {
- '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
- '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
- TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
- TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
-
+void TSSEEmulTest::Test_mm_dualop() {
+ char data1[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ char data2[16] = {
+ '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+ '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+ TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
+ TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
+
TVectorType value1 = TFuncLoad<TVectorType>(&data1);
TVectorType value2 = TFuncLoad<TVectorType>(&data2);
-
- TElem procData[elemCount];
- for (unsigned i = 0; i < elemCount; ++i) {
- procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);
- }
+
+ TElem procData[elemCount];
+ for (unsigned i = 0; i < elemCount; ++i) {
+ procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);
+ }
TVectorType result = TFunc(value1, value2);
-
- for (unsigned i = 0; i < elemCount; ++i) {
- UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);
- }
-}
-
-/* This is almost the same as Test_mm_dualop,
- but different data1 and data2 */
-template <typename TElem, unsigned elemCount,
- typename TFunc, typename TElemFunc,
+
+ for (unsigned i = 0; i < elemCount; ++i) {
+ UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);
+ }
+}
+
+/* This is almost the same as Test_mm_dualop,
+ but different data1 and data2 */
+template <typename TElem, unsigned elemCount,
+ typename TFunc, typename TElemFunc,
typename TOp, typename TVectorType>
-void TSSEEmulTest::Test_mm_dualcmp() {
- char data1[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x66', '\x77', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C'};
- char data2[16] = {
- '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
- '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
- TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
- TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
-
+void TSSEEmulTest::Test_mm_dualcmp() {
+ char data1[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x66', '\x77', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C'};
+ char data2[16] = {
+ '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
+ '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
+ TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
+ TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
+
TVectorType value1 = TFuncLoad<TVectorType>(&data1);
TVectorType value2 = TFuncLoad<TVectorType>(&data2);
-
- TElem procData[elemCount];
- for (unsigned i = 0; i < elemCount; ++i) {
- procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);
- }
+
+ TElem procData[elemCount];
+ for (unsigned i = 0; i < elemCount; ++i) {
+ procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);
+ }
TVectorType result = TFunc(value1, value2);
-
- for (unsigned i = 0; i < elemCount; ++i) {
- /* memcmp is for compare to invalid floats in results */
+
+ for (unsigned i = 0; i < elemCount; ++i) {
+ /* memcmp is for compare to invalid floats in results */
const TElem value = TQType<TOp>::As(result)[i];
UNIT_ASSERT(memcmp(&(procData[i]), &value, sizeof(TElem)) == 0);
- }
-}
-
-void TSSEEmulTest::Test_mm_or_si128() {
- struct THelper {
- static ui64 Call(const ui64 op1, const ui64 op2) {
- return op1 | op2;
- }
- };
-
- Test_mm_dualop<ui64, 2, Wrap(_mm_or_si128), THelper, uint64x2_t>();
-}
-
-void TSSEEmulTest::Test_mm_and_si128() {
- struct THelper {
- static ui64 Call(const ui64 op1, const ui64 op2) {
- return op1 & op2;
- }
- };
-
- Test_mm_dualop<ui64, 2, Wrap(_mm_and_si128), THelper, uint64x2_t>();
-}
-
-void TSSEEmulTest::Test_mm_andnot_si128() {
- struct THelper {
- static ui64 Call(const ui64 op1, const ui64 op2) {
- return (~op1) & op2;
- }
- };
-
- Test_mm_dualop<ui64, 2, Wrap(_mm_andnot_si128), THelper, uint64x2_t>();
-}
-
-template <typename TElem>
-struct THelperCMPEQ {
- static TElem Call(const TElem op1, const TElem op2) {
- return op1 == op2 ? ~TElem(0) : TElem(0);
- }
-};
-
-void TSSEEmulTest::Test_mm_cmpeq_epi8() {
- Test_mm_dualcmp<ui8, 16, Wrap(_mm_cmpeq_epi8),
- THelperCMPEQ<ui8>, uint8x16_t>();
-}
-
-void TSSEEmulTest::Test_mm_cmpeq_epi16() {
- Test_mm_dualcmp<ui16, 8, Wrap(_mm_cmpeq_epi16),
- THelperCMPEQ<ui16>, uint16x8_t>();
-}
-
-void TSSEEmulTest::Test_mm_cmpeq_epi32() {
- Test_mm_dualcmp<ui32, 4, Wrap(_mm_cmpeq_epi32),
- THelperCMPEQ<ui32>, uint32x4_t>();
-}
-
-void TSSEEmulTest::Test_mm_cmpeq_ps() {
- struct THelperFloat {
- static float Call(const float op1, const float op2) {
- union Cast {
- unsigned int AsUInt;
- float AsFloat;
- };
- Cast value;
- value.AsUInt = op1 == op2 ? 0xFFFFFFFF : 0;
- return value.AsFloat;
- }
- };
-
- Test_mm_dualcmp<float, 4, WrapF(_mm_cmpeq_ps),
- THelperFloat, float32x4_t, __m128>();
-}
-
-template <typename TElem>
-struct THelperCMPGT {
- static TElem Call(const TElem op1, const TElem op2) {
- return op1 > op2 ? ~TElem(0) : TElem(0);
- }
-};
-
-void TSSEEmulTest::Test_mm_cmpgt_epi8() {
- Test_mm_dualcmp<i8, 16, Wrap(_mm_cmpgt_epi8),
- THelperCMPGT<i8>, int8x16_t>();
-}
-
-void TSSEEmulTest::Test_mm_cmpgt_epi16() {
- Test_mm_dualcmp<i16, 8, Wrap(_mm_cmpgt_epi16),
- THelperCMPGT<i16>, int16x8_t>();
-}
-
-void TSSEEmulTest::Test_mm_cmpgt_epi32() {
- Test_mm_dualcmp<i32, 4, Wrap(_mm_cmpgt_epi32),
- THelperCMPGT<i32>, int32x4_t>();
-}
-
-void TSSEEmulTest::Test_mm_cmpgt_ps() {
- struct THelperFloat {
- static float Call(const float op1, const float op2) {
- union Cast {
- unsigned int AsUInt;
- float AsFloat;
- };
- Cast value;
- value.AsUInt = op1 > op2 ? 0xFFFFFFFF : 0;
- return value.AsFloat;
- }
- };
-
- Test_mm_dualcmp<float, 4, WrapF(_mm_cmpgt_ps),
- THelperFloat, float32x4_t, __m128>();
-}
-
-template <typename TElem>
-struct THelperCMPLT {
- static TElem Call(const TElem op1, const TElem op2) {
- return op1 < op2 ? ~TElem(0) : TElem(0);
- }
-};
-
-void TSSEEmulTest::Test_mm_cmplt_epi8() {
- Test_mm_dualcmp<i8, 16, Wrap(_mm_cmplt_epi8),
- THelperCMPLT<i8>, int8x16_t>();
-}
-
-void TSSEEmulTest::Test_mm_cmplt_epi16() {
- Test_mm_dualcmp<i16, 8, Wrap(_mm_cmplt_epi16),
- THelperCMPLT<i16>, int16x8_t>();
-}
-
-void TSSEEmulTest::Test_mm_cmplt_epi32() {
- Test_mm_dualcmp<i32, 4, Wrap(_mm_cmplt_epi32),
- THelperCMPLT<i32>, int32x4_t>();
-}
-
-template <typename TElem, int elemCount,
+ }
+}
+
+void TSSEEmulTest::Test_mm_or_si128() {
+ struct THelper {
+ static ui64 Call(const ui64 op1, const ui64 op2) {
+ return op1 | op2;
+ }
+ };
+
+ Test_mm_dualop<ui64, 2, Wrap(_mm_or_si128), THelper, uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_and_si128() {
+ struct THelper {
+ static ui64 Call(const ui64 op1, const ui64 op2) {
+ return op1 & op2;
+ }
+ };
+
+ Test_mm_dualop<ui64, 2, Wrap(_mm_and_si128), THelper, uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_andnot_si128() {
+ struct THelper {
+ static ui64 Call(const ui64 op1, const ui64 op2) {
+ return (~op1) & op2;
+ }
+ };
+
+ Test_mm_dualop<ui64, 2, Wrap(_mm_andnot_si128), THelper, uint64x2_t>();
+}
+
+template <typename TElem>
+struct THelperCMPEQ {
+ static TElem Call(const TElem op1, const TElem op2) {
+ return op1 == op2 ? ~TElem(0) : TElem(0);
+ }
+};
+
+void TSSEEmulTest::Test_mm_cmpeq_epi8() {
+ Test_mm_dualcmp<ui8, 16, Wrap(_mm_cmpeq_epi8),
+ THelperCMPEQ<ui8>, uint8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpeq_epi16() {
+ Test_mm_dualcmp<ui16, 8, Wrap(_mm_cmpeq_epi16),
+ THelperCMPEQ<ui16>, uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpeq_epi32() {
+ Test_mm_dualcmp<ui32, 4, Wrap(_mm_cmpeq_epi32),
+ THelperCMPEQ<ui32>, uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpeq_ps() {
+ struct THelperFloat {
+ static float Call(const float op1, const float op2) {
+ union Cast {
+ unsigned int AsUInt;
+ float AsFloat;
+ };
+ Cast value;
+ value.AsUInt = op1 == op2 ? 0xFFFFFFFF : 0;
+ return value.AsFloat;
+ }
+ };
+
+ Test_mm_dualcmp<float, 4, WrapF(_mm_cmpeq_ps),
+ THelperFloat, float32x4_t, __m128>();
+}
+
+template <typename TElem>
+struct THelperCMPGT {
+ static TElem Call(const TElem op1, const TElem op2) {
+ return op1 > op2 ? ~TElem(0) : TElem(0);
+ }
+};
+
+void TSSEEmulTest::Test_mm_cmpgt_epi8() {
+ Test_mm_dualcmp<i8, 16, Wrap(_mm_cmpgt_epi8),
+ THelperCMPGT<i8>, int8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpgt_epi16() {
+ Test_mm_dualcmp<i16, 8, Wrap(_mm_cmpgt_epi16),
+ THelperCMPGT<i16>, int16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpgt_epi32() {
+ Test_mm_dualcmp<i32, 4, Wrap(_mm_cmpgt_epi32),
+ THelperCMPGT<i32>, int32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpgt_ps() {
+ struct THelperFloat {
+ static float Call(const float op1, const float op2) {
+ union Cast {
+ unsigned int AsUInt;
+ float AsFloat;
+ };
+ Cast value;
+ value.AsUInt = op1 > op2 ? 0xFFFFFFFF : 0;
+ return value.AsFloat;
+ }
+ };
+
+ Test_mm_dualcmp<float, 4, WrapF(_mm_cmpgt_ps),
+ THelperFloat, float32x4_t, __m128>();
+}
+
+template <typename TElem>
+struct THelperCMPLT {
+ static TElem Call(const TElem op1, const TElem op2) {
+ return op1 < op2 ? ~TElem(0) : TElem(0);
+ }
+};
+
+void TSSEEmulTest::Test_mm_cmplt_epi8() {
+ Test_mm_dualcmp<i8, 16, Wrap(_mm_cmplt_epi8),
+ THelperCMPLT<i8>, int8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmplt_epi16() {
+ Test_mm_dualcmp<i16, 8, Wrap(_mm_cmplt_epi16),
+ THelperCMPLT<i16>, int16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmplt_epi32() {
+ Test_mm_dualcmp<i32, 4, Wrap(_mm_cmplt_epi32),
+ THelperCMPLT<i32>, int32x4_t>();
+}
+
+template <typename TElem, int elemCount,
typename TFunc, typename TOp, typename TVectorType>
-void TSSEEmulTest::Test_mm_setter_epiXX() {
- char data[64] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
- '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
- '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
- '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
- '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
- '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
- TElem* dataw = reinterpret_cast<TElem*>(&data);
-
- for (unsigned dataItem = 0; dataItem < elemCount * 4; ++dataItem) {
+void TSSEEmulTest::Test_mm_setter_epiXX() {
+ char data[64] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
+ '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+ '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
+ '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
+ '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
+ '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
+ TElem* dataw = reinterpret_cast<TElem*>(&data);
+
+ for (unsigned dataItem = 0; dataItem < elemCount * 4; ++dataItem) {
TVectorType value = TFunc(dataw[dataItem]);
-
- for (unsigned i = 0; i < elemCount; ++i)
- UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<TOp>::As(value)[i]);
- }
-}
-
-void TSSEEmulTest::Test_mm_set1_epi8() {
- Test_mm_setter_epiXX<i8, 16, Wrap(_mm_set1_epi8), int8x16_t, __m128i>();
-}
-void TSSEEmulTest::Test_mm_set1_epi16() {
- Test_mm_setter_epiXX<i16, 8, Wrap(_mm_set1_epi16), int16x8_t, __m128i>();
-}
-void TSSEEmulTest::Test_mm_set1_epi32() {
- Test_mm_setter_epiXX<i32, 4, Wrap(_mm_set1_epi32), int32x4_t, __m128i>();
-}
-void TSSEEmulTest::Test_mm_set1_ps() {
- Test_mm_setter_epiXX<float, 4, WrapF(_mm_set1_ps), float32x4_t, __m128>();
-}
-
+
+ for (unsigned i = 0; i < elemCount; ++i)
+ UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<TOp>::As(value)[i]);
+ }
+}
+
+void TSSEEmulTest::Test_mm_set1_epi8() {
+ Test_mm_setter_epiXX<i8, 16, Wrap(_mm_set1_epi8), int8x16_t, __m128i>();
+}
+void TSSEEmulTest::Test_mm_set1_epi16() {
+ Test_mm_setter_epiXX<i16, 8, Wrap(_mm_set1_epi16), int16x8_t, __m128i>();
+}
+void TSSEEmulTest::Test_mm_set1_epi32() {
+ Test_mm_setter_epiXX<i32, 4, Wrap(_mm_set1_epi32), int32x4_t, __m128i>();
+}
+void TSSEEmulTest::Test_mm_set1_ps() {
+ Test_mm_setter_epiXX<float, 4, WrapF(_mm_set1_ps), float32x4_t, __m128>();
+}
+
void TSSEEmulTest::Test_mm_set_ps1() {
Test_mm_setter_epiXX<float, 4, WrapF(_mm_set_ps1), float32x4_t, __m128>();
}
-void TSSEEmulTest::Test_mm_setzero_si128() {
- __m128i value = _mm_setzero_si128();
- for (unsigned i = 0; i < 4; ++i)
- UNIT_ASSERT_EQUAL(0, TQType<uint32x4_t>::As(value)[i]);
-}
-
-void TSSEEmulTest::Test_mm_setzero_ps() {
- __m128 value = _mm_setzero_ps();
- for (unsigned i = 0; i < 4; ++i)
- UNIT_ASSERT_EQUAL(0.0, TQType<float32x4_t>::As(value)[i]);
-}
-
+void TSSEEmulTest::Test_mm_setzero_si128() {
+ __m128i value = _mm_setzero_si128();
+ for (unsigned i = 0; i < 4; ++i)
+ UNIT_ASSERT_EQUAL(0, TQType<uint32x4_t>::As(value)[i]);
+}
+
+void TSSEEmulTest::Test_mm_setzero_ps() {
+ __m128 value = _mm_setzero_ps();
+ for (unsigned i = 0; i < 4; ++i)
+ UNIT_ASSERT_EQUAL(0.0, TQType<float32x4_t>::As(value)[i]);
+}
+
void TSSEEmulTest::Test_mm_setzero_pd() {
__m128d value = _mm_setzero_pd();
for (unsigned i = 0; i < 2; ++i)
UNIT_ASSERT_EQUAL(0.0, TQType<float64x2_t>::As(value)[i]);
}
-void TSSEEmulTest::Test_mm_loadl_epi64() {
- char data[64] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
- '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
- '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
- '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
- '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
- '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
- ui64* dataw = reinterpret_cast<ui64*>(&data);
-
- for (unsigned dataItem = 0; dataItem < 8; ++dataItem) {
- __m128i value = _mm_loadl_epi64((__m128i const*)&dataw[dataItem]);
-
- UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<uint64x2_t>::As(value)[0]);
- UNIT_ASSERT_EQUAL(0, TQType<uint64x2_t>::As(value)[1]);
- }
-}
-
-void TSSEEmulTest::Test_mm_storel_epi64() {
- char data[64] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
- '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
- '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
- '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
- '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
- '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
- ui64* dataw = reinterpret_cast<ui64*>(&data);
-
- for (unsigned dataItem = 0; dataItem < 4; ++dataItem) {
- __m128i value = _mm_loadu_si128((__m128i*)&dataw[dataItem * 2]);
-
- ui64 buf[2] = {55, 81};
- _mm_storel_epi64((__m128i*)&buf, value);
-
- UNIT_ASSERT_EQUAL(dataw[dataItem * 2], buf[0]);
- UNIT_ASSERT_EQUAL(81, buf[1]);
- }
-}
-
-void TSSEEmulTest::Test_mm_shuffle_epi32() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- ui32* dataw = reinterpret_cast<ui32*>(&data);
- __m128i value = _mm_loadu_si128((__m128i*)&data);
-
- int coding[4] = {1, 3, 0, 2};
- __m128i result = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 0, 3, 1));
-
- for (unsigned i = 0; i < 4; ++i)
- UNIT_ASSERT_EQUAL(dataw[coding[i]],
- TQType<uint32x4_t>::As(result)[i]);
-}
-
-static int GetHighBitAt(char data, int at) {
- ui8 udata = data & 0x80;
- return int(udata >> 7) << at;
-}
-
-void TSSEEmulTest::Test_mm_movemask_epi8() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- __m128i value = _mm_loadu_si128((__m128i*)&data);
-
- int result = _mm_movemask_epi8(value);
- int verify = 0;
- for (unsigned i = 0; i < 16; ++i) {
- verify |= GetHighBitAt(data[i], i);
- }
-
- UNIT_ASSERT_EQUAL(result, verify);
-}
-
-void TSSEEmulTest::Test_mm_movemask_ps() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- __m128 value = _mm_loadu_ps((float*)&data);
-
- int result = _mm_movemask_ps(value);
- int verify = 0;
- for (unsigned i = 0; i < 4; ++i) {
- verify |= GetHighBitAt(data[i * 4 + 3], i);
- }
-
- UNIT_ASSERT_EQUAL(result, verify);
-}
-
+void TSSEEmulTest::Test_mm_loadl_epi64() {
+ char data[64] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
+ '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+ '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
+ '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
+ '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
+ '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
+ ui64* dataw = reinterpret_cast<ui64*>(&data);
+
+ for (unsigned dataItem = 0; dataItem < 8; ++dataItem) {
+ __m128i value = _mm_loadl_epi64((__m128i const*)&dataw[dataItem]);
+
+ UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<uint64x2_t>::As(value)[0]);
+ UNIT_ASSERT_EQUAL(0, TQType<uint64x2_t>::As(value)[1]);
+ }
+}
+
+void TSSEEmulTest::Test_mm_storel_epi64() {
+ char data[64] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
+ '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+ '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
+ '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
+ '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
+ '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
+ ui64* dataw = reinterpret_cast<ui64*>(&data);
+
+ for (unsigned dataItem = 0; dataItem < 4; ++dataItem) {
+ __m128i value = _mm_loadu_si128((__m128i*)&dataw[dataItem * 2]);
+
+ ui64 buf[2] = {55, 81};
+ _mm_storel_epi64((__m128i*)&buf, value);
+
+ UNIT_ASSERT_EQUAL(dataw[dataItem * 2], buf[0]);
+ UNIT_ASSERT_EQUAL(81, buf[1]);
+ }
+}
+
+void TSSEEmulTest::Test_mm_shuffle_epi32() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ ui32* dataw = reinterpret_cast<ui32*>(&data);
+ __m128i value = _mm_loadu_si128((__m128i*)&data);
+
+ int coding[4] = {1, 3, 0, 2};
+ __m128i result = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 0, 3, 1));
+
+ for (unsigned i = 0; i < 4; ++i)
+ UNIT_ASSERT_EQUAL(dataw[coding[i]],
+ TQType<uint32x4_t>::As(result)[i]);
+}
+
+static int GetHighBitAt(char data, int at) {
+ ui8 udata = data & 0x80;
+ return int(udata >> 7) << at;
+}
+
+void TSSEEmulTest::Test_mm_movemask_epi8() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ __m128i value = _mm_loadu_si128((__m128i*)&data);
+
+ int result = _mm_movemask_epi8(value);
+ int verify = 0;
+ for (unsigned i = 0; i < 16; ++i) {
+ verify |= GetHighBitAt(data[i], i);
+ }
+
+ UNIT_ASSERT_EQUAL(result, verify);
+}
+
+void TSSEEmulTest::Test_mm_movemask_ps() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ __m128 value = _mm_loadu_ps((float*)&data);
+
+ int result = _mm_movemask_ps(value);
+ int verify = 0;
+ for (unsigned i = 0; i < 4; ++i) {
+ verify |= GetHighBitAt(data[i * 4 + 3], i);
+ }
+
+ UNIT_ASSERT_EQUAL(result, verify);
+}
+
void TSSEEmulTest::Test_mm_movemask_ps_2() {
char data[16] = {
'\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF',
@@ -1414,19 +1414,19 @@ void TSSEEmulTest::Test_mm_movemask_ps_2() {
UNIT_ASSERT_EQUAL(result, 0xf);
}
-void TSSEEmulTest::Test_mm_cvtsi128_si32() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- __m128i value = _mm_loadu_si128((__m128i*)&data);
-
- int result = _mm_cvtsi128_si32(value);
- i32* datap = reinterpret_cast<i32*>(&data);
- int verify = datap[0];
-
- UNIT_ASSERT_EQUAL(result, verify);
-}
-
+void TSSEEmulTest::Test_mm_cvtsi128_si32() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ __m128i value = _mm_loadu_si128((__m128i*)&data);
+
+ int result = _mm_cvtsi128_si32(value);
+ i32* datap = reinterpret_cast<i32*>(&data);
+ int verify = datap[0];
+
+ UNIT_ASSERT_EQUAL(result, verify);
+}
+
void TSSEEmulTest::Test_mm_cvtsi128_si64() {
char data[16] = {
'\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1440,52 +1440,52 @@ void TSSEEmulTest::Test_mm_cvtsi128_si64() {
UNIT_ASSERT_EQUAL(result, verify);
}
-void TSSEEmulTest::Test_mm_set_epi16() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- i16* dataw = reinterpret_cast<i16*>(&data);
- ui64* dataq = reinterpret_cast<ui64*>(&data);
-
- __m128i result = _mm_set_epi16(dataw[7], dataw[6], dataw[5], dataw[4],
- dataw[3], dataw[2], dataw[1], dataw[0]);
- ui64 buf[2] = {53, 81};
- _mm_storeu_si128((__m128i*)&buf, result);
-
- UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
- UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
-}
-
-void TSSEEmulTest::Test_mm_set_epi32() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- i32* dataw = reinterpret_cast<i32*>(&data);
- ui64* dataq = reinterpret_cast<ui64*>(&data);
-
- __m128i result = _mm_set_epi32(dataw[3], dataw[2], dataw[1], dataw[0]);
- ui64 buf[2] = {53, 81};
- _mm_storeu_si128((__m128i*)&buf, result);
-
- UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
- UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
-}
-
-void TSSEEmulTest::Test_mm_set_ps() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- float* dataw = reinterpret_cast<float*>(&data);
- ui64* dataq = reinterpret_cast<ui64*>(&data);
-
- __m128 result = _mm_set_ps(dataw[3], dataw[2], dataw[1], dataw[0]);
- ui64 buf[2] = {53, 81};
- _mm_storeu_ps((float*)&buf, result);
-
- UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
- UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
-}
-
+void TSSEEmulTest::Test_mm_set_epi16() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ i16* dataw = reinterpret_cast<i16*>(&data);
+ ui64* dataq = reinterpret_cast<ui64*>(&data);
+
+ __m128i result = _mm_set_epi16(dataw[7], dataw[6], dataw[5], dataw[4],
+ dataw[3], dataw[2], dataw[1], dataw[0]);
+ ui64 buf[2] = {53, 81};
+ _mm_storeu_si128((__m128i*)&buf, result);
+
+ UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
+ UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
+}
+
+void TSSEEmulTest::Test_mm_set_epi32() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ i32* dataw = reinterpret_cast<i32*>(&data);
+ ui64* dataq = reinterpret_cast<ui64*>(&data);
+
+ __m128i result = _mm_set_epi32(dataw[3], dataw[2], dataw[1], dataw[0]);
+ ui64 buf[2] = {53, 81};
+ _mm_storeu_si128((__m128i*)&buf, result);
+
+ UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
+ UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
+}
+
+void TSSEEmulTest::Test_mm_set_ps() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ float* dataw = reinterpret_cast<float*>(&data);
+ ui64* dataq = reinterpret_cast<ui64*>(&data);
+
+ __m128 result = _mm_set_ps(dataw[3], dataw[2], dataw[1], dataw[0]);
+ ui64 buf[2] = {53, 81};
+ _mm_storeu_ps((float*)&buf, result);
+
+ UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
+ UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
+}
+
void TSSEEmulTest::Test_mm_set_pd() {
char data[16] = {
'\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1501,22 +1501,22 @@ void TSSEEmulTest::Test_mm_set_pd() {
UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
}
-void TSSEEmulTest::Test_mm_cvtsi32_si128() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- i32* dataw = reinterpret_cast<i32*>(&data);
-
- __m128i result = _mm_cvtsi32_si128(dataw[0]);
- i32 buf[4] = {53, 81, -43, 2132};
- _mm_storeu_si128((__m128i*)&buf, result);
-
- UNIT_ASSERT_EQUAL(buf[0], dataw[0]);
- UNIT_ASSERT_EQUAL(buf[1], 0);
- UNIT_ASSERT_EQUAL(buf[2], 0);
- UNIT_ASSERT_EQUAL(buf[3], 0);
-}
-
+void TSSEEmulTest::Test_mm_cvtsi32_si128() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ i32* dataw = reinterpret_cast<i32*>(&data);
+
+ __m128i result = _mm_cvtsi32_si128(dataw[0]);
+ i32 buf[4] = {53, 81, -43, 2132};
+ _mm_storeu_si128((__m128i*)&buf, result);
+
+ UNIT_ASSERT_EQUAL(buf[0], dataw[0]);
+ UNIT_ASSERT_EQUAL(buf[1], 0);
+ UNIT_ASSERT_EQUAL(buf[2], 0);
+ UNIT_ASSERT_EQUAL(buf[3], 0);
+}
+
void TSSEEmulTest::Test_mm_cvtsi64_si128() {
char data[16] = {
'\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1531,44 +1531,44 @@ void TSSEEmulTest::Test_mm_cvtsi64_si128() {
UNIT_ASSERT_EQUAL(buf[1], 0);
}
-template <typename TElem, typename TNarrow, unsigned elemCount, typename TFunc>
-void TSSEEmulTest::Test_mm_packs_epiXX() {
- char data[32] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x00', '\x66', '\x1C',
- '\x99', '\x33', '\x1C', '\x55', '\x00', '\x00', '\x00', '\x00',
- '\x00', '\xAA', '\x00', '\x00', '\xCC', '\xBB', '\x22', '\xFF'};
- __m128i value0 = _mm_loadu_si128((__m128i*)&data);
- __m128i value1 = _mm_loadu_si128(((__m128i*)&data) + 1);
- TElem* dataw = reinterpret_cast<TElem*>(&data);
-
- __m128i result = TFunc(value0, value1);
-
- TNarrow verify[elemCount];
- for (unsigned i = 0; i < elemCount; ++i) {
- TElem sum = dataw[i];
- if (sum > std::numeric_limits<TNarrow>::max())
- sum = std::numeric_limits<TNarrow>::max();
- if (sum < std::numeric_limits<TNarrow>::min())
- sum = std::numeric_limits<TNarrow>::min();
- verify[i] = TNarrow(sum);
- }
-
- ui64* verifyp = (ui64*)&verify;
- UNIT_ASSERT_EQUAL(verifyp[0], TQType<uint64x2_t>::As(result)[0]);
- UNIT_ASSERT_EQUAL(verifyp[1], TQType<uint64x2_t>::As(result)[1]);
-}
-
-void TSSEEmulTest::Test_mm_packs_epi16() {
- Test_mm_packs_epiXX<i16, i8, 16, Wrap(_mm_packs_epi16)>();
-}
-void TSSEEmulTest::Test_mm_packs_epi32() {
- Test_mm_packs_epiXX<i32, i16, 8, Wrap(_mm_packs_epi32)>();
-}
-void TSSEEmulTest::Test_mm_packus_epi16() {
- Test_mm_packs_epiXX<i16, ui8, 16, Wrap(_mm_packus_epi16)>();
-}
-
+template <typename TElem, typename TNarrow, unsigned elemCount, typename TFunc>
+void TSSEEmulTest::Test_mm_packs_epiXX() {
+ char data[32] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x00', '\x66', '\x1C',
+ '\x99', '\x33', '\x1C', '\x55', '\x00', '\x00', '\x00', '\x00',
+ '\x00', '\xAA', '\x00', '\x00', '\xCC', '\xBB', '\x22', '\xFF'};
+ __m128i value0 = _mm_loadu_si128((__m128i*)&data);
+ __m128i value1 = _mm_loadu_si128(((__m128i*)&data) + 1);
+ TElem* dataw = reinterpret_cast<TElem*>(&data);
+
+ __m128i result = TFunc(value0, value1);
+
+ TNarrow verify[elemCount];
+ for (unsigned i = 0; i < elemCount; ++i) {
+ TElem sum = dataw[i];
+ if (sum > std::numeric_limits<TNarrow>::max())
+ sum = std::numeric_limits<TNarrow>::max();
+ if (sum < std::numeric_limits<TNarrow>::min())
+ sum = std::numeric_limits<TNarrow>::min();
+ verify[i] = TNarrow(sum);
+ }
+
+ ui64* verifyp = (ui64*)&verify;
+ UNIT_ASSERT_EQUAL(verifyp[0], TQType<uint64x2_t>::As(result)[0]);
+ UNIT_ASSERT_EQUAL(verifyp[1], TQType<uint64x2_t>::As(result)[1]);
+}
+
+void TSSEEmulTest::Test_mm_packs_epi16() {
+ Test_mm_packs_epiXX<i16, i8, 16, Wrap(_mm_packs_epi16)>();
+}
+void TSSEEmulTest::Test_mm_packs_epi32() {
+ Test_mm_packs_epiXX<i32, i16, 8, Wrap(_mm_packs_epi32)>();
+}
+void TSSEEmulTest::Test_mm_packus_epi16() {
+ Test_mm_packs_epiXX<i16, ui8, 16, Wrap(_mm_packus_epi16)>();
+}
+
void TSSEEmulTest::Test_mm_extract_epi8() {
alignas(16) char data[16] = {
'\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1594,23 +1594,23 @@ void TSSEEmulTest::Test_mm_extract_epi8() {
UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 15)), int(dataw[15]));
}
-void TSSEEmulTest::Test_mm_extract_epi16() {
+void TSSEEmulTest::Test_mm_extract_epi16() {
alignas(16) char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
const ui16* dataw = reinterpret_cast<const ui16*>(&data);
const __m128i value = _mm_loadu_si128((__m128i*)&data);
-
- UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0]));
- UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 1)), int(dataw[1]));
- UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 2)), int(dataw[2]));
- UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 3)), int(dataw[3]));
- UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 4)), int(dataw[4]));
- UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 5)), int(dataw[5]));
- UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 6)), int(dataw[6]));
- UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 7)), int(dataw[7]));
-}
-
+
+ UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0]));
+ UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 1)), int(dataw[1]));
+ UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 2)), int(dataw[2]));
+ UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 3)), int(dataw[3]));
+ UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 4)), int(dataw[4]));
+ UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 5)), int(dataw[5]));
+ UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 6)), int(dataw[6]));
+ UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 7)), int(dataw[7]));
+}
+
void TSSEEmulTest::Test_mm_extract_epi64() {
alignas(16) char data[16] = {
'\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1635,160 +1635,160 @@ void TSSEEmulTest::Test_mm_extract_epi32() {
UNIT_ASSERT_EQUAL((_mm_extract_epi32(value, 3)), int(dataw[3]));
}
-void TSSEEmulTest::Test_MM_TRANSPOSE4_PS() {
- char data0[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- char data1[16] = {
- '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
- '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
- char data2[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- char data3[16] = {
- '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
- '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
-
- __m128 value0 = _mm_loadu_ps((float*)&data0);
- __m128 value1 = _mm_loadu_ps((float*)&data1);
- __m128 value2 = _mm_loadu_ps((float*)&data2);
- __m128 value3 = _mm_loadu_ps((float*)&data3);
-
- _MM_TRANSPOSE4_PS(value0, value1, value2, value3);
-
- ui64 tbuf0[2] = {0, 0};
- ui64 tbuf1[2] = {0, 0};
- ui64 tbuf2[2] = {0, 0};
- ui64 tbuf3[2] = {0, 0};
-
- _mm_storeu_ps((float*)&tbuf0, value0);
- _mm_storeu_ps((float*)&tbuf1, value1);
- _mm_storeu_ps((float*)&tbuf2, value2);
- _mm_storeu_ps((float*)&tbuf3, value3);
-
- char tdata0[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55',
- '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55'};
- char tdata1[16] = {
- '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44',
- '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44'};
- char tdata2[16] = {
- '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11',
- '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11'};
- char tdata3[16] = {
- '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF',
- '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF'};
-
- UNIT_ASSERT(memcmp(tbuf0, tdata0, 16) == 0);
- UNIT_ASSERT(memcmp(tbuf1, tdata1, 16) == 0);
- UNIT_ASSERT(memcmp(tbuf2, tdata2, 16) == 0);
- UNIT_ASSERT(memcmp(tbuf3, tdata3, 16) == 0);
-}
-
-template <typename TFrom, typename TTo, unsigned elemCount,
- typename TLoadVector, typename TResultVector,
- typename TElemFunc, typename TFunc, typename TOp>
-void TSSEEmulTest::Test_mm_convertop() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- TFrom* datap = reinterpret_cast<TFrom*>(&data);
-
- TLoadVector value = TFuncLoad<TLoadVector>(&data);
-
- TTo procData[elemCount];
- for (unsigned i = 0; i < elemCount; ++i) {
- procData[i] = TElemFunc::Call(datap[i]);
- }
-
- TResultVector result = TFunc(value);
-
- for (unsigned i = 0; i < elemCount; ++i) {
- UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);
- }
-}
-
-void TSSEEmulTest::Test_mm_cvtepi32_ps() {
- struct THelper {
- static float Call(const i32 op) {
- return float(op);
- }
- };
- Test_mm_convertop<i32, float, 4, __m128i, __m128,
- THelper, WrapF(_mm_cvtepi32_ps), float32x4_t>();
-};
-
-void TSSEEmulTest::Test_mm_cvtps_epi32() {
- struct THelper {
- static i32 Call(const float op) {
- return i32(op);
- }
- };
- Test_mm_convertop<float, i32, 4, __m128, __m128i,
+void TSSEEmulTest::Test_MM_TRANSPOSE4_PS() {
+ char data0[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ char data1[16] = {
+ '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+ '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+ char data2[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ char data3[16] = {
+ '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+ '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+
+ __m128 value0 = _mm_loadu_ps((float*)&data0);
+ __m128 value1 = _mm_loadu_ps((float*)&data1);
+ __m128 value2 = _mm_loadu_ps((float*)&data2);
+ __m128 value3 = _mm_loadu_ps((float*)&data3);
+
+ _MM_TRANSPOSE4_PS(value0, value1, value2, value3);
+
+ ui64 tbuf0[2] = {0, 0};
+ ui64 tbuf1[2] = {0, 0};
+ ui64 tbuf2[2] = {0, 0};
+ ui64 tbuf3[2] = {0, 0};
+
+ _mm_storeu_ps((float*)&tbuf0, value0);
+ _mm_storeu_ps((float*)&tbuf1, value1);
+ _mm_storeu_ps((float*)&tbuf2, value2);
+ _mm_storeu_ps((float*)&tbuf3, value3);
+
+ char tdata0[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55',
+ '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55'};
+ char tdata1[16] = {
+ '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44',
+ '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44'};
+ char tdata2[16] = {
+ '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11',
+ '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11'};
+ char tdata3[16] = {
+ '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF',
+ '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF'};
+
+ UNIT_ASSERT(memcmp(tbuf0, tdata0, 16) == 0);
+ UNIT_ASSERT(memcmp(tbuf1, tdata1, 16) == 0);
+ UNIT_ASSERT(memcmp(tbuf2, tdata2, 16) == 0);
+ UNIT_ASSERT(memcmp(tbuf3, tdata3, 16) == 0);
+}
+
+template <typename TFrom, typename TTo, unsigned elemCount,
+ typename TLoadVector, typename TResultVector,
+ typename TElemFunc, typename TFunc, typename TOp>
+void TSSEEmulTest::Test_mm_convertop() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ TFrom* datap = reinterpret_cast<TFrom*>(&data);
+
+ TLoadVector value = TFuncLoad<TLoadVector>(&data);
+
+ TTo procData[elemCount];
+ for (unsigned i = 0; i < elemCount; ++i) {
+ procData[i] = TElemFunc::Call(datap[i]);
+ }
+
+ TResultVector result = TFunc(value);
+
+ for (unsigned i = 0; i < elemCount; ++i) {
+ UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);
+ }
+}
+
+void TSSEEmulTest::Test_mm_cvtepi32_ps() {
+ struct THelper {
+ static float Call(const i32 op) {
+ return float(op);
+ }
+ };
+ Test_mm_convertop<i32, float, 4, __m128i, __m128,
+ THelper, WrapF(_mm_cvtepi32_ps), float32x4_t>();
+};
+
+void TSSEEmulTest::Test_mm_cvtps_epi32() {
+ struct THelper {
+ static i32 Call(const float op) {
+ return i32(op);
+ }
+ };
+ Test_mm_convertop<float, i32, 4, __m128, __m128i,
THelper, T_mm_CallWrapper<__m128i, decltype(_mm_cvtps_epi32), _mm_cvtps_epi32>, int32x4_t>();
-};
-
-void TSSEEmulTest::Test_mm_cvttps_epi32() {
- struct THelper {
- static i32 Call(const float op) {
- return i32(op);
- }
- };
- Test_mm_convertop<float, i32, 4, __m128, __m128i,
- THelper, Wrap(_mm_cvttps_epi32), int32x4_t>();
-};
-
-template <typename TLoadVector, typename TCastVector,
- typename TFunc, TFunc* func>
-void TSSEEmulTest::Test_mm_castXX() {
- char data[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
-
- TLoadVector value = TFuncLoad<TLoadVector>(&data);
- const TLoadVector constvalue = TFuncLoad<TLoadVector>(&data);
- TCastVector casted = func(value);
- const TCastVector constcasted = func(constvalue);
- char verify[16];
- char constverify[16];
- TFuncStore<TCastVector>(&verify, casted);
- TFuncStore<TCastVector>(&constverify, constcasted);
-
- UNIT_ASSERT(memcmp(&data, &verify, 16) == 0);
- UNIT_ASSERT(memcmp(&data, &constverify, 16) == 0);
-};
-
-void TSSEEmulTest::Test_mm_castsi128_ps() {
- Test_mm_castXX<__m128i, __m128,
- decltype(_mm_castsi128_ps), _mm_castsi128_ps>();
-}
-
-void TSSEEmulTest::Test_mm_castps_si128() {
- Test_mm_castXX<__m128, __m128i,
- decltype(_mm_castps_si128), _mm_castps_si128>();
-}
-
-void TSSEEmulTest::Test_mm_mul_epu32() {
- char data0[16] = {
- '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
- '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
- char data1[16] = {
- '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
- '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
- ui32* dataw0 = reinterpret_cast<ui32*>(&data0);
- ui32* dataw1 = reinterpret_cast<ui32*>(&data1);
-
- __m128i value0 = _mm_loadu_si128((__m128i*)&data0);
- __m128i value1 = _mm_loadu_si128((__m128i*)&data1);
-
+};
+
+void TSSEEmulTest::Test_mm_cvttps_epi32() {
+ struct THelper {
+ static i32 Call(const float op) {
+ return i32(op);
+ }
+ };
+ Test_mm_convertop<float, i32, 4, __m128, __m128i,
+ THelper, Wrap(_mm_cvttps_epi32), int32x4_t>();
+};
+
+template <typename TLoadVector, typename TCastVector,
+ typename TFunc, TFunc* func>
+void TSSEEmulTest::Test_mm_castXX() {
+ char data[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+
+ TLoadVector value = TFuncLoad<TLoadVector>(&data);
+ const TLoadVector constvalue = TFuncLoad<TLoadVector>(&data);
+ TCastVector casted = func(value);
+ const TCastVector constcasted = func(constvalue);
+ char verify[16];
+ char constverify[16];
+ TFuncStore<TCastVector>(&verify, casted);
+ TFuncStore<TCastVector>(&constverify, constcasted);
+
+ UNIT_ASSERT(memcmp(&data, &verify, 16) == 0);
+ UNIT_ASSERT(memcmp(&data, &constverify, 16) == 0);
+};
+
+void TSSEEmulTest::Test_mm_castsi128_ps() {
+ Test_mm_castXX<__m128i, __m128,
+ decltype(_mm_castsi128_ps), _mm_castsi128_ps>();
+}
+
+void TSSEEmulTest::Test_mm_castps_si128() {
+ Test_mm_castXX<__m128, __m128i,
+ decltype(_mm_castps_si128), _mm_castps_si128>();
+}
+
+void TSSEEmulTest::Test_mm_mul_epu32() {
+ char data0[16] = {
+ '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+ '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+ char data1[16] = {
+ '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+ '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+ ui32* dataw0 = reinterpret_cast<ui32*>(&data0);
+ ui32* dataw1 = reinterpret_cast<ui32*>(&data1);
+
+ __m128i value0 = _mm_loadu_si128((__m128i*)&data0);
+ __m128i value1 = _mm_loadu_si128((__m128i*)&data1);
+
ui64 mul0 = (ui64) dataw0[0] * (ui64) dataw1[0];
ui64 mul1 = (ui64) dataw0[2] * (ui64) dataw1[2];
-
- __m128i result = _mm_mul_epu32(value0, value1);
-
- UNIT_ASSERT_EQUAL(mul0, TQType<uint64x2_t>::As(result)[0]);
- UNIT_ASSERT_EQUAL(mul1, TQType<uint64x2_t>::As(result)[1]);
-}
+
+ __m128i result = _mm_mul_epu32(value0, value1);
+
+ UNIT_ASSERT_EQUAL(mul0, TQType<uint64x2_t>::As(result)[0]);
+ UNIT_ASSERT_EQUAL(mul1, TQType<uint64x2_t>::As(result)[1]);
+}
void TSSEEmulTest::Test_mm_cmpunord_ps() {
alignas(16) float valuesBits[4] = {1.f, 2.f, 3.f, 4.f};
diff --git a/library/cpp/sse/ut/ya.make b/library/cpp/sse/ut/ya.make
index 14cac6727a..45e104971e 100644
--- a/library/cpp/sse/ut/ya.make
+++ b/library/cpp/sse/ut/ya.make
@@ -1,13 +1,13 @@
UNITTEST_FOR(library/cpp/sse)
-
+
OWNER(danlark)
-
-SRCS(
+
+SRCS(
test.cpp
-)
-
+)
+
IF (ARCH_X86_64)
CFLAGS(-msse4.1 -msse4.2)
ENDIF()
-END()
+END()