aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/sse/sse2neon.h
diff options
context:
space:
mode:
authoragri <agri@yandex-team.ru>2022-02-10 16:48:12 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:48:12 +0300
commit2909866fbc652492b7d7cab3023cb19489dc4fd8 (patch)
treeb222e5ac2e2e98872661c51ccceee5da0d291e13 /library/cpp/sse/sse2neon.h
parentd3530b2692e400bd4d29bd4f07cafaee139164e7 (diff)
downloadydb-2909866fbc652492b7d7cab3023cb19489dc4fd8.tar.gz
Restoring authorship annotation for <agri@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/sse/sse2neon.h')
-rw-r--r--library/cpp/sse/sse2neon.h1122
1 files changed, 561 insertions, 561 deletions
diff --git a/library/cpp/sse/sse2neon.h b/library/cpp/sse/sse2neon.h
index af7f3ed242..695dbd3041 100644
--- a/library/cpp/sse/sse2neon.h
+++ b/library/cpp/sse/sse2neon.h
@@ -1,60 +1,60 @@
-#pragma once
-
-/*
- The header contains inlining code
- which translates SSE intrinsics to NEON intrinsics or software emulation.
- You are encouraged for commitments.
- Add missing intrinsics, add unittests, purify the implementation,
- merge and simplify templates.
- Warning: The code is made in deep nights, so it surely contains bugs,
- imperfections, flaws and all other kinds of errors and mistakes.
-*/
-/* Author: Vitaliy Manushkin <agri@yandex-team.ru> */
-
-#include <util/system/platform.h>
+#pragma once
+
+/*
+ The header contains inlining code
+ which translates SSE intrinsics to NEON intrinsics or software emulation.
+ You are encouraged for commitments.
+ Add missing intrinsics, add unittests, purify the implementation,
+ merge and simplify templates.
+ Warning: The code is made in deep nights, so it surely contains bugs,
+ imperfections, flaws and all other kinds of errors and mistakes.
+*/
+/* Author: Vitaliy Manushkin <agri@yandex-team.ru> */
+
+#include <util/system/platform.h>
#include <util/system/compiler.h>
-#include <util/system/types.h>
-
-#if !defined(_arm64_)
-#error "This header is for ARM64 (aarch64) platform only. " \
+#include <util/system/types.h>
+
+#if !defined(_arm64_)
+#error "This header is for ARM64 (aarch64) platform only. " \
"Include sse.h instead of including this header directly."
-#endif
-
-#include <arm_neon.h>
-
-union __m128i {
- uint64x2_t AsUi64x2;
- int64x2_t AsSi64x2;
-
- uint32x4_t AsUi32x4;
- int32x4_t AsSi32x4;
-
- uint16x8_t AsUi16x8;
- int16x8_t AsSi16x8;
-
- uint8x16_t AsUi8x16;
- int8x16_t AsSi8x16;
-
- float32x4_t AsFloat32x4;
- float64x2_t AsFloat64x2;
-};
-
-union __m128 {
- float32x4_t AsFloat32x4;
- float64x2_t AsFloat64x2;
+#endif
+
+#include <arm_neon.h>
+
+union __m128i {
+ uint64x2_t AsUi64x2;
+ int64x2_t AsSi64x2;
uint32x4_t AsUi32x4;
int32x4_t AsSi32x4;
- uint64x2_t AsUi64x2;
- int64x2_t AsSi64x2;
+ uint16x8_t AsUi16x8;
+ int16x8_t AsSi16x8;
- uint8x16_t AsUi8x16;
+ uint8x16_t AsUi8x16;
+ int8x16_t AsSi8x16;
+
+ float32x4_t AsFloat32x4;
+ float64x2_t AsFloat64x2;
+};
+
+union __m128 {
+ float32x4_t AsFloat32x4;
+ float64x2_t AsFloat64x2;
+
+ uint32x4_t AsUi32x4;
+ int32x4_t AsSi32x4;
+
+ uint64x2_t AsUi64x2;
+ int64x2_t AsSi64x2;
+
+ uint8x16_t AsUi8x16;
int8x16_t AsSi8x16;
__m128i As128i;
-};
-
+};
+
typedef float64x2_t __m128d;
enum _mm_hint
@@ -72,128 +72,128 @@ Y_FORCE_INLINE void _mm_prefetch(const void *p, enum _mm_hint) {
__builtin_prefetch(p);
}
-template <typename TType>
-struct TQType;
-
-template <>
-struct TQType<uint8x16_t> {
- static inline uint8x16_t& As(__m128i& value) {
- return value.AsUi8x16;
- }
- static inline const uint8x16_t& As(const __m128i& value) {
- return value.AsUi8x16;
- }
-};
-
-template <>
-struct TQType<int8x16_t> {
- static inline int8x16_t& As(__m128i& value) {
- return value.AsSi8x16;
- }
- static inline const int8x16_t& As(const __m128i& value) {
- return value.AsSi8x16;
- }
-};
-
-template <>
-struct TQType<uint16x8_t> {
- static inline uint16x8_t& As(__m128i& value) {
- return value.AsUi16x8;
- }
- static inline const uint16x8_t& As(const __m128i& value) {
- return value.AsUi16x8;
- }
-};
-
-template <>
-struct TQType<int16x8_t> {
- static inline int16x8_t& As(__m128i& value) {
- return value.AsSi16x8;
- }
- static inline const int16x8_t& As(const __m128i& value) {
- return value.AsSi16x8;
- }
-};
-
-template <>
-struct TQType<uint32x4_t> {
- static inline uint32x4_t& As(__m128i& value) {
- return value.AsUi32x4;
- }
- static inline const uint32x4_t& As(const __m128i& value) {
- return value.AsUi32x4;
- }
-};
-
-template <>
-struct TQType<int32x4_t> {
- static inline int32x4_t& As(__m128i& value) {
- return value.AsSi32x4;
- }
- static inline const int32x4_t& As(const __m128i& value) {
- return value.AsSi32x4;
- }
-};
-
-template <>
-struct TQType<uint64x2_t> {
- static inline uint64x2_t& As(__m128i& value) {
- return value.AsUi64x2;
- }
- static inline const uint64x2_t& As(const __m128i& value) {
- return value.AsUi64x2;
- }
- static inline uint64x2_t& As(__m128& value) {
- return value.AsUi64x2;
- }
- static inline const uint64x2_t& As(const __m128& value) {
- return value.AsUi64x2;
- }
-};
-
-template <>
-struct TQType<int64x2_t> {
- static inline int64x2_t& As(__m128i& value) {
- return value.AsSi64x2;
- }
- static inline const int64x2_t& As(const __m128i& value) {
- return value.AsSi64x2;
- }
-};
-
-template <typename TValue>
-struct TBaseWrapper {
- TValue Value;
-
+template <typename TType>
+struct TQType;
+
+template <>
+struct TQType<uint8x16_t> {
+ static inline uint8x16_t& As(__m128i& value) {
+ return value.AsUi8x16;
+ }
+ static inline const uint8x16_t& As(const __m128i& value) {
+ return value.AsUi8x16;
+ }
+};
+
+template <>
+struct TQType<int8x16_t> {
+ static inline int8x16_t& As(__m128i& value) {
+ return value.AsSi8x16;
+ }
+ static inline const int8x16_t& As(const __m128i& value) {
+ return value.AsSi8x16;
+ }
+};
+
+template <>
+struct TQType<uint16x8_t> {
+ static inline uint16x8_t& As(__m128i& value) {
+ return value.AsUi16x8;
+ }
+ static inline const uint16x8_t& As(const __m128i& value) {
+ return value.AsUi16x8;
+ }
+};
+
+template <>
+struct TQType<int16x8_t> {
+ static inline int16x8_t& As(__m128i& value) {
+ return value.AsSi16x8;
+ }
+ static inline const int16x8_t& As(const __m128i& value) {
+ return value.AsSi16x8;
+ }
+};
+
+template <>
+struct TQType<uint32x4_t> {
+ static inline uint32x4_t& As(__m128i& value) {
+ return value.AsUi32x4;
+ }
+ static inline const uint32x4_t& As(const __m128i& value) {
+ return value.AsUi32x4;
+ }
+};
+
+template <>
+struct TQType<int32x4_t> {
+ static inline int32x4_t& As(__m128i& value) {
+ return value.AsSi32x4;
+ }
+ static inline const int32x4_t& As(const __m128i& value) {
+ return value.AsSi32x4;
+ }
+};
+
+template <>
+struct TQType<uint64x2_t> {
+ static inline uint64x2_t& As(__m128i& value) {
+ return value.AsUi64x2;
+ }
+ static inline const uint64x2_t& As(const __m128i& value) {
+ return value.AsUi64x2;
+ }
+ static inline uint64x2_t& As(__m128& value) {
+ return value.AsUi64x2;
+ }
+ static inline const uint64x2_t& As(const __m128& value) {
+ return value.AsUi64x2;
+ }
+};
+
+template <>
+struct TQType<int64x2_t> {
+ static inline int64x2_t& As(__m128i& value) {
+ return value.AsSi64x2;
+ }
+ static inline const int64x2_t& As(const __m128i& value) {
+ return value.AsSi64x2;
+ }
+};
+
+template <typename TValue>
+struct TBaseWrapper {
+ TValue Value;
+
Y_FORCE_INLINE
- operator TValue&() {
- return Value;
- }
-
+ operator TValue&() {
+ return Value;
+ }
+
Y_FORCE_INLINE
- operator const TValue&() const {
- return Value;
- }
-};
-
-template <typename TOp, typename TFunc, TFunc* func,
- typename TDup, TDup* dupfunc>
-struct TWrapperSingleDup: public TBaseWrapper<__m128i> {
+ operator const TValue&() const {
+ return Value;
+ }
+};
+
+template <typename TOp, typename TFunc, TFunc* func,
+ typename TDup, TDup* dupfunc>
+struct TWrapperSingleDup: public TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperSingleDup(const __m128i& op, const int shift) {
- TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(shift));
- }
-};
-
-template <typename TOp, typename TFunc, TFunc* func,
- typename TDup, TDup* dupfunc>
-struct TWrapperSingleNegDup: public TBaseWrapper<__m128i> {
+ TWrapperSingleDup(const __m128i& op, const int shift) {
+ TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(shift));
+ }
+};
+
+template <typename TOp, typename TFunc, TFunc* func,
+ typename TDup, TDup* dupfunc>
+struct TWrapperSingleNegDup: public TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperSingleNegDup(const __m128i& op, const int shift) {
- TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(-shift));
- }
-};
-
+ TWrapperSingleNegDup(const __m128i& op, const int shift) {
+ TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(-shift));
+ }
+};
+
inline __m128i _mm_srl_epi16(__m128i a, __m128i count) {
__m128i res;
res.AsUi16x8 = vshlq_u16(a.AsUi16x8, vdupq_n_s16(-count.AsUi16x8[0]));
@@ -225,16 +225,16 @@ inline __m128i _mm_srai_epi32(__m128i a, int count) {
return res;
}
-using _mm_srli_epi16 =
- TWrapperSingleNegDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
- decltype(vdupq_n_s16), vdupq_n_s16>;
-using _mm_srli_epi32 =
- TWrapperSingleNegDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
- decltype(vdupq_n_s32), vdupq_n_s32>;
-using _mm_srli_epi64 =
- TWrapperSingleNegDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
- decltype(vdupq_n_s64), vdupq_n_s64>;
-
+using _mm_srli_epi16 =
+ TWrapperSingleNegDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
+ decltype(vdupq_n_s16), vdupq_n_s16>;
+using _mm_srli_epi32 =
+ TWrapperSingleNegDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
+ decltype(vdupq_n_s32), vdupq_n_s32>;
+using _mm_srli_epi64 =
+ TWrapperSingleNegDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
+ decltype(vdupq_n_s64), vdupq_n_s64>;
+
inline __m128i _mm_sll_epi16(__m128i a, __m128i count) {
__m128i res;
@@ -255,57 +255,57 @@ inline __m128i _mm_sll_epi64(__m128i a, __m128i count) {
return res;
}
-using _mm_slli_epi16 =
- TWrapperSingleDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
- decltype(vdupq_n_s16), vdupq_n_s16>;
-using _mm_slli_epi32 =
- TWrapperSingleDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
- decltype(vdupq_n_s32), vdupq_n_s32>;
-using _mm_slli_epi64 =
- TWrapperSingleDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
- decltype(vdupq_n_s64), vdupq_n_s64>;
-
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
-struct TWrapperDual : TBaseWrapper<__m128i> {
+using _mm_slli_epi16 =
+ TWrapperSingleDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
+ decltype(vdupq_n_s16), vdupq_n_s16>;
+using _mm_slli_epi32 =
+ TWrapperSingleDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
+ decltype(vdupq_n_s32), vdupq_n_s32>;
+using _mm_slli_epi64 =
+ TWrapperSingleDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
+ decltype(vdupq_n_s64), vdupq_n_s64>;
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperDual : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperDual(const __m128i& op1, const __m128i& op2, TParams... params) {
- TQType<TOp>::As(Value) = (TOp)
- func(TQType<TOp>::As(op1),
- TQType<TOp>::As(op2),
- params...);
- }
-};
-
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
-struct TWrapperDualSwap : TBaseWrapper<__m128i> {
+ TWrapperDual(const __m128i& op1, const __m128i& op2, TParams... params) {
+ TQType<TOp>::As(Value) = (TOp)
+ func(TQType<TOp>::As(op1),
+ TQType<TOp>::As(op2),
+ params...);
+ }
+};
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperDualSwap : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperDualSwap(const __m128i& op1, const __m128i& op2, TParams... params) {
- TQType<TOp>::As(Value) =
- func(TQType<TOp>::As(op2),
- TQType<TOp>::As(op1),
- params...);
- }
-};
-
+ TWrapperDualSwap(const __m128i& op1, const __m128i& op2, TParams... params) {
+ TQType<TOp>::As(Value) =
+ func(TQType<TOp>::As(op2),
+ TQType<TOp>::As(op1),
+ params...);
+ }
+};
+
template <typename TOp, typename TFunc, TFunc* func, typename TArgument = __m128>
struct TWrapperDualF : TBaseWrapper<TArgument> {
Y_FORCE_INLINE
TWrapperDualF(const TArgument& op1, const TArgument& op2) {
TQType<TOp>::As(TBaseWrapper<TArgument>::Value) = (TOp) func(TQType<TOp>::As(op1), TQType<TOp>::As(op2));
- }
-};
-
-using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>;
-using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>;
-using _mm_andnot_si128 =
- TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>;
+ }
+};
+
+using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>;
+using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>;
+using _mm_andnot_si128 =
+ TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>;
using _mm_xor_si128 = TWrapperDual<uint64x2_t, decltype(veorq_u64), veorq_u64>;
-
+
using _mm_add_epi8 = TWrapperDual<uint8x16_t, decltype(vaddq_u8), vaddq_u8>;
-using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>;
-using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>;
-using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>;
-
+using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>;
+using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>;
+using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>;
+
inline __m128i _mm_madd_epi16(__m128i a, __m128i b) {
int32x4_t aLow;
int32x4_t aHigh;
@@ -343,118 +343,118 @@ inline __m128i _mm_madd_epi16(__m128i a, __m128i b) {
}
using _mm_sub_epi8 = TWrapperDual<uint8x16_t, decltype(vsubq_u8), vsubq_u8>;
-using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>;
-using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>;
-using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>;
-
-using _mm_unpacklo_epi8 =
- TWrapperDual<uint8x16_t, decltype(vzip1q_u8), vzip1q_u8>;
-using _mm_unpackhi_epi8 =
- TWrapperDual<uint8x16_t, decltype(vzip2q_u8), vzip2q_u8>;
-using _mm_unpacklo_epi16 =
- TWrapperDual<uint16x8_t, decltype(vzip1q_u16), vzip1q_u16>;
-using _mm_unpackhi_epi16 =
- TWrapperDual<uint16x8_t, decltype(vzip2q_u16), vzip2q_u16>;
-using _mm_unpacklo_epi32 =
- TWrapperDual<uint32x4_t, decltype(vzip1q_u32), vzip1q_u32>;
-using _mm_unpackhi_epi32 =
- TWrapperDual<uint32x4_t, decltype(vzip2q_u32), vzip2q_u32>;
-using _mm_unpacklo_epi64 =
- TWrapperDual<uint64x2_t, decltype(vzip1q_u64), vzip1q_u64>;
-using _mm_unpackhi_epi64 =
- TWrapperDual<uint64x2_t, decltype(vzip2q_u64), vzip2q_u64>;
-
-using _mm_cmpeq_epi8 =
- TWrapperDual<uint8x16_t, decltype(vceqq_u8), vceqq_u8>;
-using _mm_cmpeq_epi16 =
- TWrapperDual<uint16x8_t, decltype(vceqq_u16), vceqq_u16>;
-using _mm_cmpeq_epi32 =
- TWrapperDual<uint32x4_t, decltype(vceqq_u32), vceqq_u32>;
-
-using _mm_cmpgt_epi8 =
- TWrapperDual<int8x16_t, decltype(vcgtq_s8), vcgtq_s8>;
-using _mm_cmpgt_epi16 =
- TWrapperDual<int16x8_t, decltype(vcgtq_s16), vcgtq_s16>;
-using _mm_cmpgt_epi32 =
- TWrapperDual<int32x4_t, decltype(vcgtq_s32), vcgtq_s32>;
-
-using _mm_cmplt_epi8 =
- TWrapperDual<int8x16_t, decltype(vcltq_s8), vcltq_s8>;
-using _mm_cmplt_epi16 =
- TWrapperDual<int16x8_t, decltype(vcltq_s16), vcltq_s16>;
-using _mm_cmplt_epi32 =
- TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>;
-
+using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>;
+using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>;
+using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>;
+
+using _mm_unpacklo_epi8 =
+ TWrapperDual<uint8x16_t, decltype(vzip1q_u8), vzip1q_u8>;
+using _mm_unpackhi_epi8 =
+ TWrapperDual<uint8x16_t, decltype(vzip2q_u8), vzip2q_u8>;
+using _mm_unpacklo_epi16 =
+ TWrapperDual<uint16x8_t, decltype(vzip1q_u16), vzip1q_u16>;
+using _mm_unpackhi_epi16 =
+ TWrapperDual<uint16x8_t, decltype(vzip2q_u16), vzip2q_u16>;
+using _mm_unpacklo_epi32 =
+ TWrapperDual<uint32x4_t, decltype(vzip1q_u32), vzip1q_u32>;
+using _mm_unpackhi_epi32 =
+ TWrapperDual<uint32x4_t, decltype(vzip2q_u32), vzip2q_u32>;
+using _mm_unpacklo_epi64 =
+ TWrapperDual<uint64x2_t, decltype(vzip1q_u64), vzip1q_u64>;
+using _mm_unpackhi_epi64 =
+ TWrapperDual<uint64x2_t, decltype(vzip2q_u64), vzip2q_u64>;
+
+using _mm_cmpeq_epi8 =
+ TWrapperDual<uint8x16_t, decltype(vceqq_u8), vceqq_u8>;
+using _mm_cmpeq_epi16 =
+ TWrapperDual<uint16x8_t, decltype(vceqq_u16), vceqq_u16>;
+using _mm_cmpeq_epi32 =
+ TWrapperDual<uint32x4_t, decltype(vceqq_u32), vceqq_u32>;
+
+using _mm_cmpgt_epi8 =
+ TWrapperDual<int8x16_t, decltype(vcgtq_s8), vcgtq_s8>;
+using _mm_cmpgt_epi16 =
+ TWrapperDual<int16x8_t, decltype(vcgtq_s16), vcgtq_s16>;
+using _mm_cmpgt_epi32 =
+ TWrapperDual<int32x4_t, decltype(vcgtq_s32), vcgtq_s32>;
+
+using _mm_cmplt_epi8 =
+ TWrapperDual<int8x16_t, decltype(vcltq_s8), vcltq_s8>;
+using _mm_cmplt_epi16 =
+ TWrapperDual<int16x8_t, decltype(vcltq_s16), vcltq_s16>;
+using _mm_cmplt_epi32 =
+ TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>;
+
Y_FORCE_INLINE __m128i _mm_load_si128(const __m128i* ptr) {
- __m128i result;
+ __m128i result;
result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
- return result;
-}
-
+ return result;
+}
+
Y_FORCE_INLINE __m128i _mm_loadu_si128(const __m128i* ptr) {
- __m128i result;
+ __m128i result;
result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
- return result;
-}
-
+ return result;
+}
+
Y_FORCE_INLINE __m128i _mm_lddqu_si128(const __m128i* ptr) {
return _mm_loadu_si128(ptr);
}
Y_FORCE_INLINE void _mm_storeu_si128(__m128i* ptr, const __m128i& op) {
vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
-}
-
+}
+
Y_FORCE_INLINE void
-_mm_store_si128(__m128i* ptr, const __m128i& op) {
+_mm_store_si128(__m128i* ptr, const __m128i& op) {
vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
-}
-
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
-struct TWrapperSimple : TBaseWrapper<__m128i> {
+}
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperSimple : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TWrapperSimple(TParams... params) {
- TQType<TOp>::As(Value) = func(params...);
- }
-};
-
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
-struct TWrapperSimpleF : TBaseWrapper<__m128> {
+ TWrapperSimple(TParams... params) {
+ TQType<TOp>::As(Value) = func(params...);
+ }
+};
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperSimpleF : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- TWrapperSimpleF(TParams... params) {
- TQType<TOp>::As(Value) = func(params...);
- }
-};
-
-using _mm_set1_epi8 =
- TWrapperSimple<int8x16_t, decltype(vdupq_n_s8), vdupq_n_s8, const char>;
-using _mm_set1_epi16 =
- TWrapperSimple<int16x8_t, decltype(vdupq_n_s16), vdupq_n_s16, const ui16>;
-using _mm_set1_epi32 =
- TWrapperSimple<int32x4_t, decltype(vdupq_n_s32), vdupq_n_s32, const ui32>;
-
-struct _mm_setzero_si128 : TBaseWrapper<__m128i> {
+ TWrapperSimpleF(TParams... params) {
+ TQType<TOp>::As(Value) = func(params...);
+ }
+};
+
+using _mm_set1_epi8 =
+ TWrapperSimple<int8x16_t, decltype(vdupq_n_s8), vdupq_n_s8, const char>;
+using _mm_set1_epi16 =
+ TWrapperSimple<int16x8_t, decltype(vdupq_n_s16), vdupq_n_s16, const ui16>;
+using _mm_set1_epi32 =
+ TWrapperSimple<int32x4_t, decltype(vdupq_n_s32), vdupq_n_s32, const ui32>;
+
+struct _mm_setzero_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_setzero_si128() {
- TQType<uint64x2_t>::As(Value) = vdupq_n_u64(0);
- }
-};
-
-struct _mm_loadl_epi64 : TBaseWrapper<__m128i> {
+ _mm_setzero_si128() {
+ TQType<uint64x2_t>::As(Value) = vdupq_n_u64(0);
+ }
+};
+
+struct _mm_loadl_epi64 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_loadl_epi64(const __m128i* p) {
+ _mm_loadl_epi64(const __m128i* p) {
uint64x1_t im = vld1_u64((const uint64_t*)p);
- TQType<uint64x2_t>::As(Value) = vcombine_u64(im, vdup_n_u64(0));
- }
-};
-
-struct _mm_storel_epi64 : TBaseWrapper<__m128i> {
+ TQType<uint64x2_t>::As(Value) = vcombine_u64(im, vdup_n_u64(0));
+ }
+};
+
+struct _mm_storel_epi64 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_storel_epi64(__m128i* a, __m128i op) {
+ _mm_storel_epi64(__m128i* a, __m128i op) {
vst1_u64((uint64_t*)a, vget_low_u64(op.AsUi64x2));
- }
-};
-
+ }
+};
+
struct ShuffleStruct4 {
ui8 x[4];
};
@@ -470,45 +470,45 @@ _MM_SHUFFLE(ui8 x4, ui8 x3, ui8 x2, ui8 x1) {
}
Y_FORCE_INLINE __m128i
-_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) {
- __m128i result;
+_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) {
+ __m128i result;
const ui8 xi[4] = {
ui8(op2.x[0] * 4), ui8(op2.x[1] * 4),
ui8(op2.x[2] * 4), ui8(op2.x[3] * 4)
};
const uint8x16_t transform = {
- ui8(xi[0]), ui8(xi[0] + 1), ui8(xi[0] + 2), ui8(xi[0] + 3),
- ui8(xi[1]), ui8(xi[1] + 1), ui8(xi[1] + 2), ui8(xi[1] + 3),
- ui8(xi[2]), ui8(xi[2] + 1), ui8(xi[2] + 2), ui8(xi[2] + 3),
+ ui8(xi[0]), ui8(xi[0] + 1), ui8(xi[0] + 2), ui8(xi[0] + 3),
+ ui8(xi[1]), ui8(xi[1] + 1), ui8(xi[1] + 2), ui8(xi[1] + 3),
+ ui8(xi[2]), ui8(xi[2] + 1), ui8(xi[2] + 2), ui8(xi[2] + 3),
ui8(xi[3]), ui8(xi[3] + 1), ui8(xi[3] + 2), ui8(xi[3] + 3)
};
- result.AsUi8x16 = vqtbl1q_u8(op1.AsUi8x16, transform);
- return result;
-}
-
+ result.AsUi8x16 = vqtbl1q_u8(op1.AsUi8x16, transform);
+ return result;
+}
+
Y_FORCE_INLINE int
-_mm_movemask_epi8(const __m128i& op) {
- uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
- uint8x16_t opmasked = vandq_u8(op.AsUi8x16, mask);
- int8x16_t byteshifter = {
- 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7};
- uint8x16_t opshifted = vshlq_u8(opmasked, byteshifter);
- int16x8_t wordshifter = {-7, -5, -3, -1, 1, 3, 5, 7};
- uint16x8_t wordshifted =
- vshlq_u16(vreinterpretq_u16_u8(opshifted), wordshifter);
- return vaddvq_u16(wordshifted);
-}
-
-template <int imm>
-struct THelper_mm_srli_si128 : TBaseWrapper<__m128i> {
+_mm_movemask_epi8(const __m128i& op) {
+ uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+ uint8x16_t opmasked = vandq_u8(op.AsUi8x16, mask);
+ int8x16_t byteshifter = {
+ 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7};
+ uint8x16_t opshifted = vshlq_u8(opmasked, byteshifter);
+ int16x8_t wordshifter = {-7, -5, -3, -1, 1, 3, 5, 7};
+ uint16x8_t wordshifted =
+ vshlq_u16(vreinterpretq_u16_u8(opshifted), wordshifter);
+ return vaddvq_u16(wordshifted);
+}
+
+template <int imm>
+struct THelper_mm_srli_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
THelper_mm_srli_si128(const __m128i a) {
const auto zero = vdupq_n_u8(0);
- TQType<uint8x16_t>::As(Value) = vextq_u8(a.AsUi8x16, zero, imm);
- }
-};
-
+ TQType<uint8x16_t>::As(Value) = vextq_u8(a.AsUi8x16, zero, imm);
+ }
+};
+
template <>
struct THelper_mm_srli_si128<16> : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
@@ -518,8 +518,8 @@ struct THelper_mm_srli_si128<16> : TBaseWrapper<__m128i> {
}
};
-#define _mm_srli_si128(a, imm) THelper_mm_srli_si128<imm>(a)
-
+#define _mm_srli_si128(a, imm) THelper_mm_srli_si128<imm>(a)
+
template<int imm>
inline uint8x16_t vextq_u8_function(uint8x16_t a, uint8x16_t b) {
return vextq_u8(a, b, imm);
@@ -531,33 +531,33 @@ inline uint8x16_t vextq_u8_function<16>(uint8x16_t /* a */, uint8x16_t b) {
}
-template <int imm>
-struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> {
+template <int imm>
+struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
THelper_mm_slli_si128(const __m128i a) {
- auto zero = vdupq_n_u8(0);
+ auto zero = vdupq_n_u8(0);
TQType<uint8x16_t>::As(Value) = vextq_u8_function<16 - imm>(zero, a.AsUi8x16);
- }
-};
-
-#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a)
-
+ }
+};
+
+#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a)
+
Y_FORCE_INLINE int _mm_cvtsi128_si32(const __m128i& op) {
- return vgetq_lane_s32(op.AsSi32x4, 0);
-}
-
-struct _mm_set_epi16 : TBaseWrapper<__m128i> {
+ return vgetq_lane_s32(op.AsSi32x4, 0);
+}
+
+struct _mm_set_epi16 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_set_epi16(const short w7, const short w6,
- const short w5, const short w4,
- const short w3, const short w2,
- const short w1, const short w0) {
- int16x4_t d0 = {w0, w1, w2, w3};
- int16x4_t d1 = {w4, w5, w6, w7};
- TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1);
- }
-};
-
+ _mm_set_epi16(const short w7, const short w6,
+ const short w5, const short w4,
+ const short w3, const short w2,
+ const short w1, const short w0) {
+ int16x4_t d0 = {w0, w1, w2, w3};
+ int16x4_t d1 = {w4, w5, w6, w7};
+ TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1);
+ }
+};
+
struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
_mm_setr_epi16(const short w7, const short w6,
@@ -570,16 +570,16 @@ struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
}
};
-struct _mm_set_epi32 : TBaseWrapper<__m128i> {
+struct _mm_set_epi32 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_set_epi32(const int x3, const int x2,
- const int x1, const int x0) {
- int32x2_t d0 = {x0, x1};
- int32x2_t d1 = {x2, x3};
- TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1);
- }
-};
-
+ _mm_set_epi32(const int x3, const int x2,
+ const int x1, const int x0) {
+ int32x2_t d0 = {x0, x1};
+ int32x2_t d1 = {x2, x3};
+ TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1);
+ }
+};
+
struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
_mm_setr_epi32(const int x3, const int x2,
@@ -590,14 +590,14 @@ struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
}
};
-struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> {
+struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- _mm_cvtsi32_si128(int op) {
- auto zero = vdupq_n_s32(0);
- TQType<int32x4_t>::As(Value) = vsetq_lane_s32(op, zero, 0);
- }
-};
-
+ _mm_cvtsi32_si128(int op) {
+ auto zero = vdupq_n_s32(0);
+ TQType<int32x4_t>::As(Value) = vsetq_lane_s32(op, zero, 0);
+ }
+};
+
struct _mm_cvtsi64_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
_mm_cvtsi64_si128(i64 op) {
@@ -606,41 +606,41 @@ struct _mm_cvtsi64_si128 : TBaseWrapper<__m128i> {
}
};
-template <typename TOpOut, typename TOpIn,
- typename TFunc, TFunc* func,
- typename TCombine, TCombine* combine>
-struct TCombineWrapper : TBaseWrapper<__m128i> {
+template <typename TOpOut, typename TOpIn,
+ typename TFunc, TFunc* func,
+ typename TCombine, TCombine* combine>
+struct TCombineWrapper : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TCombineWrapper(const __m128i op1, const __m128i op2) {
- TQType<TOpOut>::As(Value) =
- combine(func(TQType<TOpIn>::As(op1)),
- func(TQType<TOpIn>::As(op2)));
- }
-};
-
-using _mm_packs_epi16 =
- TCombineWrapper<int8x16_t, int16x8_t,
- decltype(vqmovn_s16), vqmovn_s16,
- decltype(vcombine_s8), vcombine_s8>;
-using _mm_packs_epi32 =
- TCombineWrapper<int16x8_t, int32x4_t,
- decltype(vqmovn_s32), vqmovn_s32,
- decltype(vcombine_s16), vcombine_s16>;
-using _mm_packus_epi16 =
- TCombineWrapper<uint8x16_t, int16x8_t,
- decltype(vqmovun_s16), vqmovun_s16,
- decltype(vcombine_u8), vcombine_u8>;
-
-template <typename TOpOut, typename TOpIn,
- typename TFunc, TFunc* func, typename... TParams>
-struct TScalarOutWrapper : TBaseWrapper<TOpOut> {
+ TCombineWrapper(const __m128i op1, const __m128i op2) {
+ TQType<TOpOut>::As(Value) =
+ combine(func(TQType<TOpIn>::As(op1)),
+ func(TQType<TOpIn>::As(op2)));
+ }
+};
+
+using _mm_packs_epi16 =
+ TCombineWrapper<int8x16_t, int16x8_t,
+ decltype(vqmovn_s16), vqmovn_s16,
+ decltype(vcombine_s8), vcombine_s8>;
+using _mm_packs_epi32 =
+ TCombineWrapper<int16x8_t, int32x4_t,
+ decltype(vqmovn_s32), vqmovn_s32,
+ decltype(vcombine_s16), vcombine_s16>;
+using _mm_packus_epi16 =
+ TCombineWrapper<uint8x16_t, int16x8_t,
+ decltype(vqmovun_s16), vqmovun_s16,
+ decltype(vcombine_u8), vcombine_u8>;
+
+template <typename TOpOut, typename TOpIn,
+ typename TFunc, TFunc* func, typename... TParams>
+struct TScalarOutWrapper : TBaseWrapper<TOpOut> {
Y_FORCE_INLINE
- TScalarOutWrapper(const __m128i op, TParams... params) {
- TBaseWrapper<TOpOut>::Value =
- func(TQType<TOpIn>::As(op), params...);
- }
-};
-
+ TScalarOutWrapper(const __m128i op, TParams... params) {
+ TBaseWrapper<TOpOut>::Value =
+ func(TQType<TOpIn>::As(op), params...);
+ }
+};
+
template<int imm>
int extract_epi8_arm(__m128i arg) {
return vgetq_lane_u8(arg.AsUi8x16, imm);
@@ -649,13 +649,13 @@ int extract_epi8_arm(__m128i arg) {
template<int imm>
int extract_epi16_arm(__m128i arg) {
return vgetq_lane_u16(arg.AsUi16x8, imm);
-}
-
+}
+
template<int imm>
int extract_epi32_arm(__m128i arg) {
return vgetq_lane_s32(arg.AsSi32x4, imm);
}
-
+
template<int imm>
long long extract_epi64_arm(__m128i arg) {
return vgetq_lane_s64(arg.AsSi64x2, imm);
@@ -669,49 +669,49 @@ long long extract_epi64_arm(__m128i arg) {
static Y_FORCE_INLINE
__m128i _mm_mul_epu32(__m128i op1, __m128i op2) {
- __m128i result;
- uint32x4_t r1 = vuzp1q_u32(op1.AsUi32x4, op2.AsUi32x4);
- uint32x4_t r2 = vuzp1q_u32(op2.AsUi32x4, op1.AsUi32x4);
- result.AsUi64x2 = vmull_u32(vget_low_u32(r1), vget_low_u32(r2));
- return result;
-}
-
-template <>
-struct TQType<float32x4_t> {
- static inline float32x4_t& As(__m128& value) {
- return value.AsFloat32x4;
- }
-
- static inline const float32x4_t& As(const __m128& value) {
- return value.AsFloat32x4;
- }
-
- static inline float32x4_t& As(__m128i& value) {
- return value.AsFloat32x4;
- }
-
- static inline const float32x4_t& As(const __m128i& value) {
- return value.AsFloat32x4;
- }
-};
-
-template <>
-struct TQType<float64x2_t> {
- static inline float64x2_t& As(__m128& value) {
- return value.AsFloat64x2;
- }
-
- static inline const float64x2_t& As(const __m128& value) {
- return value.AsFloat64x2;
- }
-
- static inline float64x2_t& As(__m128i& value) {
- return value.AsFloat64x2;
- }
-
- static inline const float64x2_t& As(const __m128i& value) {
- return value.AsFloat64x2;
- }
+ __m128i result;
+ uint32x4_t r1 = vuzp1q_u32(op1.AsUi32x4, op2.AsUi32x4);
+ uint32x4_t r2 = vuzp1q_u32(op2.AsUi32x4, op1.AsUi32x4);
+ result.AsUi64x2 = vmull_u32(vget_low_u32(r1), vget_low_u32(r2));
+ return result;
+}
+
+template <>
+struct TQType<float32x4_t> {
+ static inline float32x4_t& As(__m128& value) {
+ return value.AsFloat32x4;
+ }
+
+ static inline const float32x4_t& As(const __m128& value) {
+ return value.AsFloat32x4;
+ }
+
+ static inline float32x4_t& As(__m128i& value) {
+ return value.AsFloat32x4;
+ }
+
+ static inline const float32x4_t& As(const __m128i& value) {
+ return value.AsFloat32x4;
+ }
+};
+
+template <>
+struct TQType<float64x2_t> {
+ static inline float64x2_t& As(__m128& value) {
+ return value.AsFloat64x2;
+ }
+
+ static inline const float64x2_t& As(const __m128& value) {
+ return value.AsFloat64x2;
+ }
+
+ static inline float64x2_t& As(__m128i& value) {
+ return value.AsFloat64x2;
+ }
+
+ static inline const float64x2_t& As(const __m128i& value) {
+ return value.AsFloat64x2;
+ }
static inline float64x2_t& As(__m128d& value) {
return value;
@@ -720,30 +720,30 @@ struct TQType<float64x2_t> {
static inline const float64x2_t& As(const __m128d& value) {
return value;
}
-};
-
-using _mm_set1_ps = TWrapperSimpleF<float32x4_t,
- decltype(vdupq_n_f32), vdupq_n_f32, const float>;
-using _mm_set_ps1 = TWrapperSimpleF<float32x4_t,
- decltype(vdupq_n_f32), vdupq_n_f32, const float>;
-
-struct _mm_setzero_ps : TBaseWrapper<__m128> {
+};
+
+using _mm_set1_ps = TWrapperSimpleF<float32x4_t,
+ decltype(vdupq_n_f32), vdupq_n_f32, const float>;
+using _mm_set_ps1 = TWrapperSimpleF<float32x4_t,
+ decltype(vdupq_n_f32), vdupq_n_f32, const float>;
+
+struct _mm_setzero_ps : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- _mm_setzero_ps() {
- TQType<float32x4_t>::As(Value) = vdupq_n_f32(0.);
- }
-};
-
+ _mm_setzero_ps() {
+ TQType<float32x4_t>::As(Value) = vdupq_n_f32(0.);
+ }
+};
+
Y_FORCE_INLINE __m128d _mm_setzero_pd() {
return vdupq_n_f64(0.);
}
Y_FORCE_INLINE __m128 _mm_loadu_ps(const float* ptr) {
- __m128 result;
- result.AsFloat32x4 = vld1q_f32(ptr);
- return result;
-}
-
+ __m128 result;
+ result.AsFloat32x4 = vld1q_f32(ptr);
+ return result;
+}
+
Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
__m128 result;
result.AsFloat32x4 = vld1q_f32(ptr);
@@ -751,23 +751,23 @@ Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
}
Y_FORCE_INLINE void _mm_storeu_ps(float* ptr, const __m128& op) {
- vst1q_f32(ptr, op.AsFloat32x4);
-}
-
+ vst1q_f32(ptr, op.AsFloat32x4);
+}
+
Y_FORCE_INLINE void _mm_store_ps(float* ptr, const __m128& op) {
vst1q_f32(ptr, op.AsFloat32x4);
}
-struct _mm_set_ps : TBaseWrapper<__m128> {
+struct _mm_set_ps : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- _mm_set_ps(const float x3, const float x2,
- const float x1, const float x0) {
- float32x2_t d0 = {x0, x1};
- float32x2_t d1 = {x2, x3};
- TQType<float32x4_t>::As(Value) = vcombine_f32(d0, d1);
- }
-};
-
+ _mm_set_ps(const float x3, const float x2,
+ const float x1, const float x0) {
+ float32x2_t d0 = {x0, x1};
+ float32x2_t d1 = {x2, x3};
+ TQType<float32x4_t>::As(Value) = vcombine_f32(d0, d1);
+ }
+};
+
Y_FORCE_INLINE __m128d _mm_set_pd(double d1, double d0) {
const float64x1_t p0 = {d0};
const float64x1_t p1 = {d1};
@@ -788,81 +788,81 @@ Y_FORCE_INLINE void _mm_store_pd(double* res, __m128d a) {
vst1q_f64(res, a);
}
-using _mm_add_ps = TWrapperDualF<float32x4_t, decltype(vaddq_f32), vaddq_f32>;
-using _mm_sub_ps = TWrapperDualF<float32x4_t, decltype(vsubq_f32), vsubq_f32>;
-using _mm_mul_ps = TWrapperDualF<float32x4_t, decltype(vmulq_f32), vmulq_f32>;
-using _mm_div_ps = TWrapperDualF<float32x4_t, decltype(vdivq_f32), vdivq_f32>;
-using _mm_cmpeq_ps = TWrapperDualF<float32x4_t, decltype(vceqq_f32), vceqq_f32>;
-using _mm_cmpgt_ps = TWrapperDualF<float32x4_t, decltype(vcgtq_f32), vcgtq_f32>;
-using _mm_max_ps = TWrapperDualF<float32x4_t, decltype(vmaxq_f32), vmaxq_f32>;
-using _mm_min_ps = TWrapperDualF<float32x4_t, decltype(vminq_f32), vminq_f32>;
-
+using _mm_add_ps = TWrapperDualF<float32x4_t, decltype(vaddq_f32), vaddq_f32>;
+using _mm_sub_ps = TWrapperDualF<float32x4_t, decltype(vsubq_f32), vsubq_f32>;
+using _mm_mul_ps = TWrapperDualF<float32x4_t, decltype(vmulq_f32), vmulq_f32>;
+using _mm_div_ps = TWrapperDualF<float32x4_t, decltype(vdivq_f32), vdivq_f32>;
+using _mm_cmpeq_ps = TWrapperDualF<float32x4_t, decltype(vceqq_f32), vceqq_f32>;
+using _mm_cmpgt_ps = TWrapperDualF<float32x4_t, decltype(vcgtq_f32), vcgtq_f32>;
+using _mm_max_ps = TWrapperDualF<float32x4_t, decltype(vmaxq_f32), vmaxq_f32>;
+using _mm_min_ps = TWrapperDualF<float32x4_t, decltype(vminq_f32), vminq_f32>;
+
using _mm_add_pd = TWrapperDualF<float64x2_t, decltype(vaddq_f64), vaddq_f64, __m128d>;
using _mm_sub_pd = TWrapperDualF<float64x2_t, decltype(vsubq_f64), vsubq_f64, __m128d>;
using _mm_mul_pd = TWrapperDualF<float64x2_t, decltype(vmulq_f64), vmulq_f64, __m128d>;
using _mm_div_pd = TWrapperDualF<float64x2_t, decltype(vdivq_f64), vdivq_f64, __m128d>;
-struct _mm_and_ps : TBaseWrapper<__m128> {
+struct _mm_and_ps : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- _mm_and_ps(const __m128& op1, const __m128& op2) {
- TQType<uint64x2_t>::As(Value) =
- vandq_u64(TQType<uint64x2_t>::As(op1),
- TQType<uint64x2_t>::As(op2));
- }
-};
-
+ _mm_and_ps(const __m128& op1, const __m128& op2) {
+ TQType<uint64x2_t>::As(Value) =
+ vandq_u64(TQType<uint64x2_t>::As(op1),
+ TQType<uint64x2_t>::As(op2));
+ }
+};
+
Y_FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
return vandq_u64(a, b);
}
Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m128& op3) {
- float64x2_t im0 =
- (float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
- float64x2_t im1 =
- (float64x2_t)vtrn2q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
- float64x2_t im2 =
- (float64x2_t)vtrn1q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
- float64x2_t im3 =
- (float64x2_t)vtrn2q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
-
- TQType<float64x2_t>::As(op0) = vtrn1q_f64(im0, im2);
- TQType<float64x2_t>::As(op1) = vtrn1q_f64(im1, im3);
- TQType<float64x2_t>::As(op2) = vtrn2q_f64(im0, im2);
- TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3);
-};
-
+ float64x2_t im0 =
+ (float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
+ float64x2_t im1 =
+ (float64x2_t)vtrn2q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
+ float64x2_t im2 =
+ (float64x2_t)vtrn1q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
+ float64x2_t im3 =
+ (float64x2_t)vtrn2q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
+
+ TQType<float64x2_t>::As(op0) = vtrn1q_f64(im0, im2);
+ TQType<float64x2_t>::As(op1) = vtrn1q_f64(im1, im3);
+ TQType<float64x2_t>::As(op2) = vtrn2q_f64(im0, im2);
+ TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3);
+};
+
Y_FORCE_INLINE __m128 _mm_castsi128_ps(__m128i op) {
- return reinterpret_cast<__m128&>(op);
-}
-
+ return reinterpret_cast<__m128&>(op);
+}
+
Y_FORCE_INLINE __m128i _mm_castps_si128(__m128 op) {
- return reinterpret_cast<__m128i&>(op);
-}
-
-template <typename TOpOut, typename TOpIn,
- typename TFunc, TFunc* func, typename... TParams>
-struct TCvtS2FWrapperSingle : TBaseWrapper<__m128> {
+ return reinterpret_cast<__m128i&>(op);
+}
+
+template <typename TOpOut, typename TOpIn,
+ typename TFunc, TFunc* func, typename... TParams>
+struct TCvtS2FWrapperSingle : TBaseWrapper<__m128> {
Y_FORCE_INLINE
- TCvtS2FWrapperSingle(const __m128i& op, TParams... params) {
- TQType<TOpOut>::As(Value) =
- func(TQType<TOpIn>::As(op), params...);
- }
-};
-
-using _mm_cvtepi32_ps =
- TCvtS2FWrapperSingle<float32x4_t, int32x4_t,
- decltype(vcvtq_f32_s32), vcvtq_f32_s32>;
-
-template <typename TOpOut, typename TOpIn,
- typename TFunc, TFunc* func, typename... TParams>
-struct TCvtF2SWrapperSingle : TBaseWrapper<__m128i> {
+ TCvtS2FWrapperSingle(const __m128i& op, TParams... params) {
+ TQType<TOpOut>::As(Value) =
+ func(TQType<TOpIn>::As(op), params...);
+ }
+};
+
+using _mm_cvtepi32_ps =
+ TCvtS2FWrapperSingle<float32x4_t, int32x4_t,
+ decltype(vcvtq_f32_s32), vcvtq_f32_s32>;
+
+template <typename TOpOut, typename TOpIn,
+ typename TFunc, TFunc* func, typename... TParams>
+struct TCvtF2SWrapperSingle : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
- TCvtF2SWrapperSingle(const __m128& op, TParams... params) {
- TQType<TOpOut>::As(Value) =
- func(TQType<TOpIn>::As(op), params...);
- }
-};
-
+ TCvtF2SWrapperSingle(const __m128& op, TParams... params) {
+ TQType<TOpOut>::As(Value) =
+ func(TQType<TOpIn>::As(op), params...);
+ }
+};
+
inline __m128i _mm_cvtps_epi32(__m128 a) {
/// vcvtq_s32_f32 rounds to zero, but we need to round to the nearest.
static const float32x4_t half = vdupq_n_f32(0.5f);
@@ -874,26 +874,26 @@ inline __m128i _mm_cvtps_epi32(__m128 a) {
return res;
}
-using _mm_cvttps_epi32 =
- TCvtF2SWrapperSingle<int32x4_t, float32x4_t,
- decltype(vcvtq_s32_f32), vcvtq_s32_f32>;
-
+using _mm_cvttps_epi32 =
+ TCvtF2SWrapperSingle<int32x4_t, float32x4_t,
+ decltype(vcvtq_s32_f32), vcvtq_s32_f32>;
+
Y_FORCE_INLINE int
-_mm_movemask_ps(const __m128& op) {
- uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
- uint32x4_t bits = vandq_u32(op.AsUi32x4, mask);
- int32x4_t shifts = {-31, -30, -29, -28};
- bits = vshlq_u32(bits, shifts);
- return vaddvq_u32(bits);
-}
+_mm_movemask_ps(const __m128& op) {
+ uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+ uint32x4_t bits = vandq_u32(op.AsUi32x4, mask);
+ int32x4_t shifts = {-31, -30, -29, -28};
+ bits = vshlq_u32(bits, shifts);
+ return vaddvq_u32(bits);
+}
Y_FORCE_INLINE i64 _mm_cvtsi128_si64(__m128i a) {
return vgetq_lane_s64(a.AsSi64x2, 0);
}
-
-static inline void _mm_pause() {
+
+static inline void _mm_pause() {
__asm__ ("YIELD");
-}
+}
static inline __m128 _mm_rsqrt_ps(__m128 a) {
__m128 res;