diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/sse | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/sse')
-rw-r--r-- | library/cpp/sse/sse.h | 2 | ||||
-rw-r--r-- | library/cpp/sse/sse2neon.h | 100 | ||||
-rw-r--r-- | library/cpp/sse/ut/test.cpp | 80 | ||||
-rw-r--r-- | library/cpp/sse/ut/ya.make | 2 | ||||
-rw-r--r-- | library/cpp/sse/ya.make | 14 |
5 files changed, 99 insertions, 99 deletions
diff --git a/library/cpp/sse/sse.h b/library/cpp/sse/sse.h index 19bac17de0..af87a4e530 100644 --- a/library/cpp/sse/sse.h +++ b/library/cpp/sse/sse.h @@ -9,7 +9,7 @@ /* Author: Vitaliy Manushkin <agri@yandex-team.ru>, Danila Kutenin <danlark@yandex-team.ru> */ #include <util/system/platform.h> - + #if (defined(_i386_) || defined(_x86_64_)) && defined(_sse_) #include <xmmintrin.h> #include <emmintrin.h> diff --git a/library/cpp/sse/sse2neon.h b/library/cpp/sse/sse2neon.h index 695dbd3041..7e82be954b 100644 --- a/library/cpp/sse/sse2neon.h +++ b/library/cpp/sse/sse2neon.h @@ -12,7 +12,7 @@ /* Author: Vitaliy Manushkin <agri@yandex-team.ru> */ #include <util/system/platform.h> -#include <util/system/compiler.h> +#include <util/system/compiler.h> #include <util/system/types.h> #if !defined(_arm64_) @@ -200,7 +200,7 @@ inline __m128i _mm_srl_epi16(__m128i a, __m128i count) { return res; } - + inline __m128i _mm_srl_epi32(__m128i a, __m128i count) { __m128i res; res.AsUi32x4 = vshlq_u32(a.AsUi32x4, vdupq_n_s32(-count.AsUi32x4[0])); @@ -235,14 +235,14 @@ using _mm_srli_epi64 = TWrapperSingleNegDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64, decltype(vdupq_n_s64), vdupq_n_s64>; - + inline __m128i _mm_sll_epi16(__m128i a, __m128i count) { __m128i res; res.AsUi16x8 = vshlq_u16(a.AsUi16x8, vdupq_n_s16(count.AsUi16x8[0])); return res; } - + inline __m128i _mm_sll_epi32(__m128i a, __m128i count) { __m128i res; res.AsUi32x4 = vshlq_u32(a.AsUi32x4, vdupq_n_s32(count.AsUi32x4[0])); @@ -291,7 +291,7 @@ template <typename TOp, typename TFunc, TFunc* func, typename TArgument = __m128 struct TWrapperDualF : TBaseWrapper<TArgument> { Y_FORCE_INLINE TWrapperDualF(const TArgument& op1, const TArgument& op2) { - TQType<TOp>::As(TBaseWrapper<TArgument>::Value) = (TOp) func(TQType<TOp>::As(op1), TQType<TOp>::As(op2)); + TQType<TOp>::As(TBaseWrapper<TArgument>::Value) = (TOp) func(TQType<TOp>::As(op1), TQType<TOp>::As(op2)); } }; @@ -311,29 +311,29 @@ inline __m128i _mm_madd_epi16(__m128i a, __m128i b) { int32x4_t aHigh; int32x4_t bLow; int32x4_t bHigh; - #ifdef __LITTLE_ENDIAN__ - aLow[0] = a.AsSi16x8[0]; //!< I couldn't find vector instructions to do that. Feel free to fix this code. - aLow[1] = a.AsSi16x8[2]; - aLow[2] = a.AsSi16x8[4]; - aLow[3] = a.AsSi16x8[6]; - - aHigh[0] = a.AsSi16x8[1]; - aHigh[1] = a.AsSi16x8[3]; - aHigh[2] = a.AsSi16x8[5]; - aHigh[3] = a.AsSi16x8[7]; - - bLow[0] = b.AsSi16x8[0]; - bLow[1] = b.AsSi16x8[2]; - bLow[2] = b.AsSi16x8[4]; - bLow[3] = b.AsSi16x8[6]; - - bHigh[0] = b.AsSi16x8[1]; - bHigh[1] = b.AsSi16x8[3]; - bHigh[2] = b.AsSi16x8[5]; - bHigh[3] = b.AsSi16x8[7]; - #else - #error Not implemented yet. Do it yourself. - #endif + #ifdef __LITTLE_ENDIAN__ + aLow[0] = a.AsSi16x8[0]; //!< I couldn't find vector instructions to do that. Feel free to fix this code. + aLow[1] = a.AsSi16x8[2]; + aLow[2] = a.AsSi16x8[4]; + aLow[3] = a.AsSi16x8[6]; + + aHigh[0] = a.AsSi16x8[1]; + aHigh[1] = a.AsSi16x8[3]; + aHigh[2] = a.AsSi16x8[5]; + aHigh[3] = a.AsSi16x8[7]; + + bLow[0] = b.AsSi16x8[0]; + bLow[1] = b.AsSi16x8[2]; + bLow[2] = b.AsSi16x8[4]; + bLow[3] = b.AsSi16x8[6]; + + bHigh[0] = b.AsSi16x8[1]; + bHigh[1] = b.AsSi16x8[3]; + bHigh[2] = b.AsSi16x8[5]; + bHigh[3] = b.AsSi16x8[7]; + #else + #error Not implemented yet. Do it yourself. + #endif const int32x4_t lowMul = vmulq_u32(aLow, bLow); const int32x4_t highMul = vmulq_u32(aHigh, bHigh); @@ -387,13 +387,13 @@ using _mm_cmplt_epi32 = Y_FORCE_INLINE __m128i _mm_load_si128(const __m128i* ptr) { __m128i result; - result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr); + result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr); return result; } Y_FORCE_INLINE __m128i _mm_loadu_si128(const __m128i* ptr) { __m128i result; - result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr); + result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr); return result; } @@ -402,12 +402,12 @@ Y_FORCE_INLINE __m128i _mm_lddqu_si128(const __m128i* ptr) { } Y_FORCE_INLINE void _mm_storeu_si128(__m128i* ptr, const __m128i& op) { - vst1q_u64((uint64_t*)ptr, op.AsUi64x2); + vst1q_u64((uint64_t*)ptr, op.AsUi64x2); } Y_FORCE_INLINE void _mm_store_si128(__m128i* ptr, const __m128i& op) { - vst1q_u64((uint64_t*)ptr, op.AsUi64x2); + vst1q_u64((uint64_t*)ptr, op.AsUi64x2); } template <typename TOp, typename TFunc, TFunc* func, typename... TParams> @@ -443,7 +443,7 @@ struct _mm_setzero_si128 : TBaseWrapper<__m128i> { struct _mm_loadl_epi64 : TBaseWrapper<__m128i> { Y_FORCE_INLINE _mm_loadl_epi64(const __m128i* p) { - uint64x1_t im = vld1_u64((const uint64_t*)p); + uint64x1_t im = vld1_u64((const uint64_t*)p); TQType<uint64x2_t>::As(Value) = vcombine_u64(im, vdup_n_u64(0)); } }; @@ -451,7 +451,7 @@ struct _mm_loadl_epi64 : TBaseWrapper<__m128i> { struct _mm_storel_epi64 : TBaseWrapper<__m128i> { Y_FORCE_INLINE _mm_storel_epi64(__m128i* a, __m128i op) { - vst1_u64((uint64_t*)a, vget_low_u64(op.AsUi64x2)); + vst1_u64((uint64_t*)a, vget_low_u64(op.AsUi64x2)); } }; @@ -474,14 +474,14 @@ _mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) { __m128i result; const ui8 xi[4] = { ui8(op2.x[0] * 4), ui8(op2.x[1] * 4), - ui8(op2.x[2] * 4), ui8(op2.x[3] * 4) - }; + ui8(op2.x[2] * 4), ui8(op2.x[3] * 4) + }; const uint8x16_t transform = { ui8(xi[0]), ui8(xi[0] + 1), ui8(xi[0] + 2), ui8(xi[0] + 3), ui8(xi[1]), ui8(xi[1] + 1), ui8(xi[1] + 2), ui8(xi[1] + 3), ui8(xi[2]), ui8(xi[2] + 1), ui8(xi[2] + 2), ui8(xi[2] + 3), - ui8(xi[3]), ui8(xi[3] + 1), ui8(xi[3] + 2), ui8(xi[3] + 3) - }; + ui8(xi[3]), ui8(xi[3] + 1), ui8(xi[3] + 2), ui8(xi[3] + 3) + }; result.AsUi8x16 = vqtbl1q_u8(op1.AsUi8x16, transform); return result; } @@ -503,7 +503,7 @@ _mm_movemask_epi8(const __m128i& op) { template <int imm> struct THelper_mm_srli_si128 : TBaseWrapper<__m128i> { Y_FORCE_INLINE - THelper_mm_srli_si128(const __m128i a) { + THelper_mm_srli_si128(const __m128i a) { const auto zero = vdupq_n_u8(0); TQType<uint8x16_t>::As(Value) = vextq_u8(a.AsUi8x16, zero, imm); } @@ -520,21 +520,21 @@ struct THelper_mm_srli_si128<16> : TBaseWrapper<__m128i> { #define _mm_srli_si128(a, imm) THelper_mm_srli_si128<imm>(a) -template<int imm> +template<int imm> inline uint8x16_t vextq_u8_function(uint8x16_t a, uint8x16_t b) { return vextq_u8(a, b, imm); } -template<> +template<> inline uint8x16_t vextq_u8_function<16>(uint8x16_t /* a */, uint8x16_t b) { return b; } - + template <int imm> struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> { Y_FORCE_INLINE - THelper_mm_slli_si128(const __m128i a) { + THelper_mm_slli_si128(const __m128i a) { auto zero = vdupq_n_u8(0); TQType<uint8x16_t>::As(Value) = vextq_u8_function<16 - imm>(zero, a.AsUi8x16); } @@ -641,7 +641,7 @@ struct TScalarOutWrapper : TBaseWrapper<TOpOut> { } }; -template<int imm> +template<int imm> int extract_epi8_arm(__m128i arg) { return vgetq_lane_u8(arg.AsUi8x16, imm); } @@ -668,7 +668,7 @@ long long extract_epi64_arm(__m128i arg) { #define _mm_extract_ps(op, imm) _mm_extract_epi32(op, imm) static Y_FORCE_INLINE -__m128i _mm_mul_epu32(__m128i op1, __m128i op2) { +__m128i _mm_mul_epu32(__m128i op1, __m128i op2) { __m128i result; uint32x4_t r1 = vuzp1q_u32(op1.AsUi32x4, op2.AsUi32x4); uint32x4_t r2 = vuzp1q_u32(op2.AsUi32x4, op1.AsUi32x4); @@ -886,13 +886,13 @@ _mm_movemask_ps(const __m128& op) { bits = vshlq_u32(bits, shifts); return vaddvq_u32(bits); } - + Y_FORCE_INLINE i64 _mm_cvtsi128_si64(__m128i a) { - return vgetq_lane_s64(a.AsSi64x2, 0); -} + return vgetq_lane_s64(a.AsSi64x2, 0); +} static inline void _mm_pause() { - __asm__ ("YIELD"); + __asm__ ("YIELD"); } static inline __m128 _mm_rsqrt_ps(__m128 a) { @@ -909,7 +909,7 @@ inline __m128 _mm_cmpunord_ps(__m128 a, __m128 b) { __m128 res; res.AsUi32x4 = vorrq_u32( vmvnq_u32(vceqq_f32(a.AsFloat32x4, a.AsFloat32x4)), //!< 0xffffffff for all nans in a. - vmvnq_u32(vceqq_f32(b.AsFloat32x4, b.AsFloat32x4)) //!< 0xffffffff all nans in b. + vmvnq_u32(vceqq_f32(b.AsFloat32x4, b.AsFloat32x4)) //!< 0xffffffff all nans in b. ); return res; } diff --git a/library/cpp/sse/ut/test.cpp b/library/cpp/sse/ut/test.cpp index 33c999d284..cf0aa0060c 100644 --- a/library/cpp/sse/ut/test.cpp +++ b/library/cpp/sse/ut/test.cpp @@ -39,7 +39,7 @@ struct T_mm_CallWrapper { #if defined(_arm64_) #include "library/cpp/sse/sse2neon.h" -#elif defined(_i386_) || defined(_x86_64_) +#elif defined(_i386_) || defined(_x86_64_) #include <xmmintrin.h> #include <emmintrin.h> #include <smmintrin.h> @@ -333,7 +333,7 @@ public: __m128i Value[17]; }; - void Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo); + void Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo); void Test_mm_slli_epi16(); void Test_mm_slli_epi32(); @@ -624,10 +624,10 @@ void TSSEEmulTest::Test_mm_storeu_ps() { } } -template<typename C> +template<typename C> C MakeNumber(unsigned number); -template<> +template<> __m128i MakeNumber<__m128i>(unsigned number) { char data[16] = {0}; memcpy(data, &number, sizeof(number)); @@ -635,7 +635,7 @@ __m128i MakeNumber<__m128i>(unsigned number) { return _mm_loadu_si128((__m128i*)data); } -template<> +template<> unsigned MakeNumber<unsigned>(unsigned number) { return number; } @@ -666,16 +666,16 @@ void TSSEEmulTest::Test_mm_shifter_epiXX() { } } - -void TSSEEmulTest::Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo) { + +void TSSEEmulTest::Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo) { const char data[48] = { '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', - '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00' - }; + '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00' + }; const __m128i a = _mm_loadu_si128((__m128i*)(data + 16)); const TShiftRes res = foo(a); @@ -760,7 +760,7 @@ void TSSEEmulTest::Test_mm_slli_epi64() { } void TSSEEmulTest::Test_mm_slli_si128() { - Test_mm_byte_shifter(EDirection::Left, [] (__m128i a) -> TShiftRes { + Test_mm_byte_shifter(EDirection::Left, [] (__m128i a) -> TShiftRes { TShiftRes res; res.Value[0] = _mm_slli_si128(a, 0); res.Value[1] = _mm_slli_si128(a, 1); @@ -881,12 +881,12 @@ void TSSEEmulTest::Test_mm_add_pd() { void TSSEEmulTest::Test_mm_madd_epi16() { alignas(16) const char data1[16] = { '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', - '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C' - }; + '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C' + }; alignas(16) const char data2[16] = { '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', - '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF' - }; + '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF' + }; const __m128i value1 = TFuncLoad<__m128i>(&data1); const __m128i value2 = TFuncLoad<__m128i>(&data2); @@ -897,13 +897,13 @@ void TSSEEmulTest::Test_mm_madd_epi16() { for (size_t i = 0; i != 4; ++i) { const size_t dataIdx = i * 2; - const i32 etalonResult = (i32) dataw1[dataIdx] * (i32) dataw2[dataIdx] + (i32) dataw1[dataIdx + 1] * (i32) dataw2[dataIdx + 1]; + const i32 etalonResult = (i32) dataw1[dataIdx] * (i32) dataw2[dataIdx] + (i32) dataw1[dataIdx + 1] * (i32) dataw2[dataIdx + 1]; const i32 value = TQType<int32x4_t>::As(res)[i]; UNIT_ASSERT_EQUAL(value, etalonResult); } } - + template <typename TElem> struct THelperSub { static TElem Call(const TElem op1, const TElem op2) { @@ -1781,8 +1781,8 @@ void TSSEEmulTest::Test_mm_mul_epu32() { __m128i value0 = _mm_loadu_si128((__m128i*)&data0); __m128i value1 = _mm_loadu_si128((__m128i*)&data1); - ui64 mul0 = (ui64) dataw0[0] * (ui64) dataw1[0]; - ui64 mul1 = (ui64) dataw0[2] * (ui64) dataw1[2]; + ui64 mul0 = (ui64) dataw0[0] * (ui64) dataw1[0]; + ui64 mul1 = (ui64) dataw0[2] * (ui64) dataw1[2]; __m128i result = _mm_mul_epu32(value0, value1); @@ -1796,18 +1796,18 @@ void TSSEEmulTest::Test_mm_cmpunord_ps() { alignas(16) char allfs[16] = { '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', - '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff' - }; + '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff' + }; alignas(16) char allzeroes[16] = { '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', - '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00' - }; + '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00' + }; const __m128 qnan = _mm_set_ps1(std::numeric_limits<float>::quiet_NaN()); const __m128 snan = _mm_set_ps1(std::numeric_limits<float>::signaling_NaN()); - const __m128 values = _mm_loadu_ps((const float*) valuesBits); - const __m128 values2 = _mm_loadu_ps((const float*) values2Bits); + const __m128 values = _mm_loadu_ps((const float*) valuesBits); + const __m128 values2 = _mm_loadu_ps((const float*) values2Bits); const __m128 mask1 = _mm_cmpunord_ps(qnan, qnan); UNIT_ASSERT_EQUAL(::memcmp(&mask1, &allfs, sizeof(allfs)), 0); @@ -1867,21 +1867,21 @@ void TSSEEmulTest::Test_mm_storeu_pd() { void TSSEEmulTest::Test_mm_andnot_ps() { alignas(16) const char firstBits[16] = { '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff', - '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff' - }; + '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff' + }; alignas(16) const char secondBits[16] = { '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', - '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff' - }; + '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff' + }; alignas(16) const char resBits[16] = { '\x00', '\xff', '\x00', '\x00', '\x00', '\xff', '\x00', '\x00', - '\x00', '\xff', '\x00', '\x00', '\x00', '\xff', '\x00', '\x00' - }; + '\x00', '\xff', '\x00', '\x00', '\x00', '\xff', '\x00', '\x00' + }; - const __m128 value1 = _mm_loadu_ps((const float*) firstBits); - const __m128 value2 = _mm_loadu_ps((const float*) secondBits); + const __m128 value1 = _mm_loadu_ps((const float*) firstBits); + const __m128 value2 = _mm_loadu_ps((const float*) secondBits); const __m128 res = _mm_andnot_ps(value1, value2); UNIT_ASSERT_EQUAL(::memcmp(&res, resBits, sizeof(resBits)), 0); @@ -1971,21 +1971,21 @@ void TSSEEmulTest::Test_mm_loadh_pd() { void TSSEEmulTest::Test_mm_or_ps() { alignas(16) const char bytes1[16] = { '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff', - '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff' - }; + '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff' + }; alignas(16) const char bytes2[16] = { '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', - '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff' - }; + '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff' + }; alignas(16) const char etalon[16] = { '\x00', '\xff', '\xff', '\xff', '\x00', '\xff', '\xff', '\xff', - '\x00', '\xff', '\xff', '\xff', '\x00', '\xff', '\xff', '\xff' - }; + '\x00', '\xff', '\xff', '\xff', '\x00', '\xff', '\xff', '\xff' + }; - const __m128 value1 = _mm_loadu_ps((const float*) bytes1); - const __m128 value2 = _mm_loadu_ps((const float*) bytes2); + const __m128 value1 = _mm_loadu_ps((const float*) bytes1); + const __m128 value2 = _mm_loadu_ps((const float*) bytes2); const __m128 res = _mm_or_ps(value1, value2); UNIT_ASSERT_EQUAL(::memcmp(&res, etalon, sizeof(etalon)), 0); diff --git a/library/cpp/sse/ut/ya.make b/library/cpp/sse/ut/ya.make index 45e104971e..21be851e65 100644 --- a/library/cpp/sse/ut/ya.make +++ b/library/cpp/sse/ut/ya.make @@ -3,7 +3,7 @@ UNITTEST_FOR(library/cpp/sse) OWNER(danlark) SRCS( - test.cpp + test.cpp ) IF (ARCH_X86_64) diff --git a/library/cpp/sse/ya.make b/library/cpp/sse/ya.make index d2351e791d..632bdf98b7 100644 --- a/library/cpp/sse/ya.make +++ b/library/cpp/sse/ya.make @@ -1,12 +1,12 @@ -LIBRARY() - +LIBRARY() + OWNER( g:base danlark ) - -SRCS( + +SRCS( sse.cpp -) - -END() +) + +END() |