aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/snappy/snappy-internal.h
diff options
context:
space:
mode:
authorrobot-contrib <robot-contrib@yandex-team.com>2023-10-23 23:42:01 +0300
committerrobot-contrib <robot-contrib@yandex-team.com>2023-10-24 00:04:36 +0300
commitcc78b961ed847896c5d94fc564556e2db812df70 (patch)
treee3232d9376329099f75d97812a012e535ae695cb /contrib/libs/snappy/snappy-internal.h
parent34017dfc732950000ee19e1e6e48a0175dd2109f (diff)
downloadydb-cc78b961ed847896c5d94fc564556e2db812df70.tar.gz
Update contrib/libs/snappy to 1.1.10
Diffstat (limited to 'contrib/libs/snappy/snappy-internal.h')
-rw-r--r--contrib/libs/snappy/snappy-internal.h98
1 files changed, 88 insertions, 10 deletions
diff --git a/contrib/libs/snappy/snappy-internal.h b/contrib/libs/snappy/snappy-internal.h
index 720ccd8282..0923f399a3 100644
--- a/contrib/libs/snappy/snappy-internal.h
+++ b/contrib/libs/snappy/snappy-internal.h
@@ -33,9 +33,84 @@
#include "snappy-stubs-internal.h"
+#if SNAPPY_HAVE_SSSE3
+// Please do not replace with <x86intrin.h> or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#endif
+
+#if SNAPPY_HAVE_NEON
+#include <arm_neon.h>
+#endif
+
+#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
+#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
+#else
+#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
+#endif
+
namespace snappy {
namespace internal {
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+#if SNAPPY_HAVE_SSSE3
+using V128 = __m128i;
+#elif SNAPPY_HAVE_NEON
+using V128 = uint8x16_t;
+#endif
+
+// Load 128 bits of integer data. `src` must be 16-byte aligned.
+inline V128 V128_Load(const V128* src);
+
+// Load 128 bits of integer data. `src` does not need to be aligned.
+inline V128 V128_LoadU(const V128* src);
+
+// Store 128 bits of integer data. `dst` does not need to be aligned.
+inline void V128_StoreU(V128* dst, V128 val);
+
+// Shuffle packed 8-bit integers using a shuffle mask.
+// Each packed integer in the shuffle mask must be in [0,16).
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);
+
+// Constructs V128 with 16 chars |c|.
+inline V128 V128_DupChar(char c);
+
+#if SNAPPY_HAVE_SSSE3
+inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
+
+inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+
+inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+ return _mm_shuffle_epi8(input, shuffle_mask);
+}
+
+inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); }
+
+#elif SNAPPY_HAVE_NEON
+inline V128 V128_Load(const V128* src) {
+ return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline V128 V128_LoadU(const V128* src) {
+ return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline void V128_StoreU(V128* dst, V128 val) {
+ vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
+}
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+ assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
+ return vqtbl1q_u8(input, shuffle_mask);
+}
+
+inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
+#endif
+#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+
// Working memory performs a single allocation to hold all scratch space
// required for compression.
class WorkingMemory {
@@ -95,8 +170,9 @@ char* CompressFragment(const char* input,
// loading from s2 + n.
//
// Separate implementation for 64-bit, little-endian cpus.
-#if !defined(SNAPPY_IS_BIG_ENDIAN) && \
- (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || defined(ARCH_ARM))
+#if !SNAPPY_IS_BIG_ENDIAN && \
+ (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
+ defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
const char* s2,
const char* s2_limit,
@@ -154,8 +230,9 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
uint64_t xorval = a1 ^ a2;
int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
+ uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
- *data = UNALIGNED_LOAD64(s2 + matched_bytes);
+ a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
// Ideally this would just be
//
@@ -166,13 +243,13 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
// use a conditional move (it's tuned to cut data dependencies). In this
// case there is a longer parallel chain anyway AND this will be fairly
// unpredictable.
- uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
asm("testl %k2, %k2\n\t"
"cmovzq %1, %0\n\t"
: "+r"(a2)
- : "r"(a3), "r"(xorval));
- *data = a2 >> (shift & (3 * 8));
+ : "r"(a3), "r"(xorval)
+ : "cc");
#endif
+ *data = a2 >> (shift & (3 * 8));
return std::pair<size_t, bool>(matched_bytes, true);
} else {
matched = 8;
@@ -194,16 +271,17 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
uint64_t xorval = a1 ^ a2;
int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
+ uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
- *data = UNALIGNED_LOAD64(s2 + matched_bytes);
+ a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
- uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
asm("testl %k2, %k2\n\t"
"cmovzq %1, %0\n\t"
: "+r"(a2)
- : "r"(a3), "r"(xorval));
- *data = a2 >> (shift & (3 * 8));
+ : "r"(a3), "r"(xorval)
+ : "cc");
#endif
+ *data = a2 >> (shift & (3 * 8));
matched += matched_bytes;
assert(matched >= 8);
return std::pair<size_t, bool>(matched, false);