diff options
author | fixthgame <fixthgame@yandex-team.com> | 2023-11-17 10:37:07 +0300 |
---|---|---|
committer | fixthgame <fixthgame@yandex-team.com> | 2023-11-17 11:01:24 +0300 |
commit | 34eae36a4cbbe40d938ca620c2eb447fecb3d0b1 (patch) | |
tree | 21d403fd474c3f4dbecd41127794a87915fa4037 | |
parent | 60b7b80e562681da7fb3479dcb35755dba4019f9 (diff) | |
download | ydb-34eae36a4cbbe40d938ca620c2eb447fecb3d0b1.tar.gz |
Rotate + Shift
Rotate + Shift
-rw-r--r-- | ydb/library/yql/utils/simd/simd_avx2.h | 154 | ||||
-rw-r--r-- | ydb/library/yql/utils/simd/simd_sse42.h | 64 | ||||
-rw-r--r-- | ydb/library/yql/utils/simd/simd_ut.cpp | 21 |
3 files changed, 223 insertions, 16 deletions
diff --git a/ydb/library/yql/utils/simd/simd_avx2.h b/ydb/library/yql/utils/simd/simd_avx2.h index c92ced8d74..319e201cda 100644 --- a/ydb/library/yql/utils/simd/simd_avx2.h +++ b/ydb/library/yql/utils/simd/simd_avx2.h @@ -106,8 +106,8 @@ struct TSimd8 { return _mm256_load_si256(reinterpret_cast<const __m256i *>(values)); } - static inline TSimd8<T> LoadStream(T dst[32]) { - return _mm256_stream_load_si256(reinterpret_cast<__m256i *>(dst)); + static inline TSimd8<T> LoadStream(const T values[32]) { + return _mm256_stream_load_si256(reinterpret_cast<__m256i *>(values)); } inline void Store(T dst[32]) const { @@ -122,15 +122,25 @@ struct TSimd8 { return _mm256_stream_si256(reinterpret_cast<__m256i *>(dst), this->Value); } + inline void StoreMasked(void* dst, const TSimd8<T>& mask) const { + _mm_maskmoveu_si128(_mm256_castsi256_si128(this->Value), _mm256_castsi256_si128(mask.Value), dst); + _mm_maskmoveu_si128(_mm256_extracti128_si256(this->Value, 1), _mm256_extracti128_si256(mask.Value, 1), (char*) dst); + } + + template<bool CanBeNegative = true> inline TSimd8<T> Shuffle(const TSimd8<T>& other) const { - TSimd8<T> mask0(0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0); - TSimd8<T> mask1(0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, - 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70); + const TSimd8<T> mask0(0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0); + const TSimd8<T> mask1(0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70); TSimd8<T> perm = _mm256_permute4x64_epi64(this->Value, 0x4E); - TSimd8<T> tmp = Shuffle128(other + mask0) | perm.Shuffle128(other + mask1); - TSimd8<T> mask = _mm256_cmpgt_epi8(other.Value, _mm256_set1_epi8(-1)); - return tmp & mask; + if constexpr (CanBeNegative) { + TSimd8<T> tmp = Shuffle128(other + mask0) | perm.Shuffle128(other + mask1); + TSimd8<T> mask = _mm256_cmpgt_epi8(other.Value, _mm256_set1_epi8(-1)); + return tmp & mask; + } else { + return Shuffle128(other + mask0) | perm.Shuffle128(other + mask1); + } } inline TSimd8<T> Shuffle128(const TSimd8<T>& other) const { @@ -138,19 +148,139 @@ struct TSimd8 { } template<int N> - inline TSimd8<T> Blend16(const TSimd8<T>& other) { + inline TSimd8<T> Blend16(const TSimd8<T>& other) const { return _mm256_blend_epi16(this->Value, other.Value, N); } template<int N> - inline TSimd8<T> Blend32(const TSimd8<T>& other) { + inline TSimd8<T> Blend32(const TSimd8<T>& other) const { return _mm256_blend_epi32(this->Value, other.Value, N); } - inline TSimd8<T> BlendVar(const TSimd8<T>& other, const TSimd8<T>& mask) { + inline TSimd8<T> BlendVar(const TSimd8<T>& other, const TSimd8<T>& mask) const { return _mm256_blendv_epi8(this->Value, other.Value, mask.Value); } + template<int N> + inline TSimd8<T> ByteShift128() const { + if constexpr (N < 0) { + return _mm256_bsrli_epi128(this->Value, -N); + } else { + return _mm256_bslli_epi128(this->Value, N); + } + } + + template<int N> + inline TSimd8<T> ByteShift() const { + constexpr T A0 = N > 0 ? -(0 < N) : -(0 >= 32 + N); + constexpr T A1 = N > 0 ? -(1 < N) : -(1 >= 32 + N); + constexpr T A2 = N > 0 ? -(2 < N) : -(2 >= 32 + N); + constexpr T A3 = N > 0 ? -(3 < N) : -(3 >= 32 + N); + constexpr T A4 = N > 0 ? -(4 < N) : -(4 >= 32 + N); + constexpr T A5 = N > 0 ? -(5 < N) : -(5 >= 32 + N); + constexpr T A6 = N > 0 ? -(6 < N) : -(6 >= 32 + N); + constexpr T A7 = N > 0 ? -(7 < N) : -(7 >= 32 + N); + constexpr T A8 = N > 0 ? -(8 < N) : -(8 >= 32 + N); + constexpr T A9 = N > 0 ? -(9 < N) : -(9 >= 32 + N); + constexpr T A10 = N > 0 ? -(10 < N) : -(10 >= 32 + N); + constexpr T A11 = N > 0 ? -(11 < N) : -(11 >= 32 + N); + constexpr T A12 = N > 0 ? -(12 < N) : -(12 >= 32 + N); + constexpr T A13 = N > 0 ? -(13 < N) : -(13 >= 32 + N); + constexpr T A14 = N > 0 ? -(14 < N) : -(14 >= 32 + N); + constexpr T A15 = N > 0 ? -(15 < N) : -(15 >= 32 + N); + constexpr T A16 = N > 0 ? -(16 < N) : -(16 >= 32 + N); + constexpr T A17 = N > 0 ? -(17 < N) : -(17 >= 32 + N); + constexpr T A18 = N > 0 ? -(18 < N) : -(18 >= 32 + N); + constexpr T A19 = N > 0 ? -(19 < N) : -(19 >= 32 + N); + constexpr T A20 = N > 0 ? -(20 < N) : -(20 >= 32 + N); + constexpr T A21 = N > 0 ? -(21 < N) : -(21 >= 32 + N); + constexpr T A22 = N > 0 ? -(22 < N) : -(22 >= 32 + N); + constexpr T A23 = N > 0 ? -(23 < N) : -(23 >= 32 + N); + constexpr T A24 = N > 0 ? -(24 < N) : -(24 >= 32 + N); + constexpr T A25 = N > 0 ? -(25 < N) : -(25 >= 32 + N); + constexpr T A26 = N > 0 ? -(26 < N) : -(26 >= 32 + N); + constexpr T A27 = N > 0 ? -(27 < N) : -(27 >= 32 + N); + constexpr T A28 = N > 0 ? -(28 < N) : -(28 >= 32 + N); + constexpr T A29 = N > 0 ? -(29 < N) : -(29 >= 32 + N); + constexpr T A30 = N > 0 ? -(30 < N) : -(30 >= 32 + N); + constexpr T A31 = N > 0 ? -(31 < N) : -(31 >= 32 + N); + return Rotate<N>().BlendVar(TSimd8<T>(T(0)), + TSimd8<T>(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, + A16, A17, A18, A19, A20, A21, A22, A23, A24, A25, A26, A27, A28, A29, A30, A31)); + } + + template<int N> + inline TSimd8<T> ByteShiftWithCarry(const TSimd8<T>& other) const { + return Rotate<N>().BlendVar(other.Rotate<N>(), ~TSimd8<T>(T(-1)).ByteShift<N>()); + } + + template<int N> + inline TSimd8<T> Rotate128() const { + if constexpr (N % 16 == 0) { + return *this; + } else { + constexpr T A0 = (16 - N) % 16; + constexpr T A1 = (16 - N + 1) % 16; + constexpr T A2 = (16 - N + 2) % 16; + constexpr T A3 = (16 - N + 3) % 16; + constexpr T A4 = (16 - N + 4) % 16; + constexpr T A5 = (16 - N + 5) % 16; + constexpr T A6 = (16 - N + 6) % 16; + constexpr T A7 = (16 - N + 7) % 16; + constexpr T A8 = (16 - N + 8) % 16; + constexpr T A9 = (16 - N + 9) % 16; + constexpr T A10 = (16 - N + 10) % 16; + constexpr T A11 = (16 - N + 11) % 16; + constexpr T A12 = (16 - N + 12) % 16; + constexpr T A13 = (16 - N + 13) % 16; + constexpr T A14 = (16 - N + 14) % 16; + constexpr T A15 = (16 - N + 15) % 16; + return Shuffle128(Repeat16(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15)); + } + } + + template<int N> + inline TSimd8<T> Rotate() const { + if constexpr (N % 32 == 0) { + return *this; + } else { + constexpr T A0 = (32 - N) % 32; + constexpr T A1 = (32 - N + 1) % 32; + constexpr T A2 = (32 - N + 2) % 32; + constexpr T A3 = (32 - N + 3) % 32; + constexpr T A4 = (32 - N + 4) % 32; + constexpr T A5 = (32 - N + 5) % 32; + constexpr T A6 = (32 - N + 6) % 32; + constexpr T A7 = (32 - N + 7) % 32; + constexpr T A8 = (32 - N + 8) % 32; + constexpr T A9 = (32 - N + 9) % 32; + constexpr T A10 = (32 - N + 10) % 32; + constexpr T A11 = (32 - N + 11) % 32; + constexpr T A12 = (32 - N + 12) % 32; + constexpr T A13 = (32 - N + 13) % 32; + constexpr T A14 = (32 - N + 14) % 32; + constexpr T A15 = (32 - N + 15) % 32; + constexpr T A16 = (32 - N + 16) % 32; + constexpr T A17 = (32 - N + 17) % 32; + constexpr T A18 = (32 - N + 18) % 32; + constexpr T A19 = (32 - N + 19) % 32; + constexpr T A20 = (32 - N + 20) % 32; + constexpr T A21 = (32 - N + 21) % 32; + constexpr T A22 = (32 - N + 22) % 32; + constexpr T A23 = (32 - N + 23) % 32; + constexpr T A24 = (32 - N + 24) % 32; + constexpr T A25 = (32 - N + 25) % 32; + constexpr T A26 = (32 - N + 26) % 32; + constexpr T A27 = (32 - N + 27) % 32; + constexpr T A28 = (32 - N + 28) % 32; + constexpr T A29 = (32 - N + 29) % 32; + constexpr T A30 = (32 - N + 30) % 32; + constexpr T A31 = (32 - N + 31) % 32; + return Shuffle(TSimd8<T>(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, + A16, A17, A18, A19, A20, A21, A22, A23, A24, A25, A26, A27, A28, A29, A30, A31)); + } + } + static inline TSimd8<T> Repeat16( T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 diff --git a/ydb/library/yql/utils/simd/simd_sse42.h b/ydb/library/yql/utils/simd/simd_sse42.h index 696708bd2a..854f0018ca 100644 --- a/ydb/library/yql/utils/simd/simd_sse42.h +++ b/ydb/library/yql/utils/simd/simd_sse42.h @@ -99,6 +99,10 @@ struct TSimd8 { return TSimd8<T>(values); } + static inline TSimd8<T> Load128(const T values[16]) { + return Load(values); + } + static inline TSimd8<T> LoadAligned(const T values[16]) { return _mm_load_si128(reinterpret_cast<const __m128i *>(values)); } @@ -119,6 +123,11 @@ struct TSimd8 { return _mm_stream_si128(reinterpret_cast<__m128i *>(dst), this->Value); } + inline void StoreMasked(void* dst, const TSimd8<T>& mask) const { + _mm_maskmoveu_si128(this->Value, mask.Value, dst); + } + + template<bool CanBeNegative = true> inline TSimd8<T> Shuffle(const TSimd8<T>& other) const { return Shuffle128(other); } @@ -129,19 +138,68 @@ struct TSimd8 { } template<int N> - inline TSimd8<T> Blend16(const TSimd8<T>& other) { + inline TSimd8<T> Blend16(const TSimd8<T>& other) const { return _mm_blend_epi16(this->Value, other.Value, N); } template<int N> - inline TSimd8<T> Blend32(const TSimd8<T>& other) { + inline TSimd8<T> Blend32(const TSimd8<T>& other) const { return _mm_blend_epi32(this->Value, other.Value, N); } - inline TSimd8<T> BlendVar(const TSimd8<T>& other, const TSimd8<T>& mask) { + inline TSimd8<T> BlendVar(const TSimd8<T>& other, const TSimd8<T>& mask) const { return _mm_blendv_epi8(this->Value, other.Value, mask.Value); } + template<int N> + inline TSimd8<T> ByteShift128() const { + if constexpr (N < 0) { + return _mm_bsrli_si128(this->Value, -N); + } else { + return _mm_bslli_si128(this->Value, N); + } + } + + template<int N> + inline TSimd8<T> ByteShift() const { + return ByteShift128<N>(); + } + + template<int N> + inline TSimd8<T> ByteShiftWithCarry(const TSimd8<T>& other) const { + return Rotate<N>().BlendVar(other.Rotate<N>(), ~TSimd8<T>(T(-1)).ByteShift<N>()); + } + + template<int N> + inline TSimd8<T> Rotate128() const { + if constexpr (N % 16 == 0) { + return *this; + } else { + constexpr T A0 = (16 - N) % 16; + constexpr T A1 = (16 - N + 1) % 16; + constexpr T A2 = (16 - N + 2) % 16; + constexpr T A3 = (16 - N + 3) % 16; + constexpr T A4 = (16 - N + 4) % 16; + constexpr T A5 = (16 - N + 5) % 16; + constexpr T A6 = (16 - N + 6) % 16; + constexpr T A7 = (16 - N + 7) % 16; + constexpr T A8 = (16 - N + 8) % 16; + constexpr T A9 = (16 - N + 9) % 16; + constexpr T A10 = (16 - N + 10) % 16; + constexpr T A11 = (16 - N + 11) % 16; + constexpr T A12 = (16 - N + 12) % 16; + constexpr T A13 = (16 - N + 13) % 16; + constexpr T A14 = (16 - N + 14) % 16; + constexpr T A15 = (16 - N + 15) % 16; + return Shuffle128(TSimd8<T>(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15)); + } + } + + template<int N> + inline TSimd8<T> Rotate() const { + return Rotate128<N>(); + } + static inline TSimd8<T> Repeat16( T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 diff --git a/ydb/library/yql/utils/simd/simd_ut.cpp b/ydb/library/yql/utils/simd/simd_ut.cpp index 9c135e6f50..45c337f3c5 100644 --- a/ydb/library/yql/utils/simd/simd_ut.cpp +++ b/ydb/library/yql/utils/simd/simd_ut.cpp @@ -162,13 +162,21 @@ Y_UNIT_TEST_SUITE(TSimdAVX2) { 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); UNIT_ASSERT_EQUAL((Shift5Right == arr.Shuffle(mask0)).All(), true); UNIT_ASSERT_EQUAL((Shift5 == arr.Shuffle(mask1)).All(), true); + + UNIT_ASSERT_EQUAL((Shift5Right == arr.template ByteShift<5>()).All(), true); + UNIT_ASSERT_EQUAL((Shift5 == arr.template ByteShift<-5>()).All(), true); + + + TSimd8<i8> Rotate5( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4); + UNIT_ASSERT_EQUAL((Rotate5 == arr.template Rotate<-5>()).All(), true); } Y_UNIT_TEST(UnpackMask) { TSimd8<i8> unpackMask = NSimd::CreateUnpackMask<NSimd::TSimdAVX2Traits>(2, 1, false); unpackMask.Log<i8>(Cerr); UNIT_ASSERT_EQUAL((unpackMask == TSimd8<i8>(0, 1, -1, 2, 3, -1, 4, 5, -1, 6, 7, -1, 8, 9, -1, 10, 11, - -1, 12, 13, -1, 14, 15, -1, 16, 17, -1, 18, 19, -1, 20, 21)).All(), true); + -1, 12, 13, -1, 14, 15, -1, 16, 17, -1, 18, 19, -1, 20, 21)).All(), true); } Y_UNIT_TEST(CRC) { @@ -315,6 +323,17 @@ Y_UNIT_TEST_SUITE(TSimdSSE42) { TSimd8<i8> Shift5Right(0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); UNIT_ASSERT_EQUAL((Shift5Right == arr.Shuffle(mask0)).All(), true); UNIT_ASSERT_EQUAL((Shift5 == arr.Shuffle(mask1)).All(), true); + + UNIT_ASSERT_EQUAL((Shift5Right == arr.template ByteShift<5>()).All(), true); + UNIT_ASSERT_EQUAL((Shift5 == arr.template ByteShift<-5>()).All(), true); + + TSimd8<i8> Rotate5(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4); + UNIT_ASSERT_EQUAL((Rotate5 == arr.template Rotate<-5>()).All(), true); + + TSimd8<i8> a(i8(0)); + TSimd8<i8> b(i8(1)); + TSimd8<i8> c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + UNIT_ASSERT_EQUAL((a.ByteShiftWithCarry<3>(b) == c).All(), true); } |