author     thegeorg <thegeorg@yandex-team.com>  2023-03-25 20:23:17 +0300
committer  thegeorg <thegeorg@yandex-team.com>  2023-03-25 20:23:17 +0300
commit     a50a4399c2600b05a086acdca3ba56c957d62196 (patch)
tree       2cf3f6cc37ccc6bd19c33a928e07dd6c083cea72 /contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal
parent     76f3ccf647d9cff0e38a7989dc89480854107b78 (diff)
Update contrib/restricted/abseil-cpp-tstring to 20230125.1
Diffstat (limited to 'contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal')
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/cpu_detect.cc                    | 256
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/cpu_detect.h                     |  57
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc.cc                           | 468
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc.h                            |  91
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32_x86_arm_combined_simd.h    | 269
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32c.h                         |  39
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32c_inline.h                  |  72
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_cord_state.cc                | 130
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_cord_state.h                 | 159
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_internal.h                   | 179
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy.h                     | 119
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy_fallback.cc           |  75
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy_x86_64.cc             | 434
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_non_temporal_memcpy.cc       |  93
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_x86_arm_combined.cc          | 725
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/non_temporal_arm_intrinsics.h    |  79
-rw-r--r--  contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/non_temporal_memcpy.h            | 180
17 files changed, 3425 insertions, 0 deletions
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/cpu_detect.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/cpu_detect.cc
new file mode 100644
index 0000000000..8c8f8d5580
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/cpu_detect.cc
@@ -0,0 +1,256 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "y_absl/crc/internal/cpu_detect.h"
+
+#include <cstdint>
+#include <util/generic/string.h>
+
+#include "y_absl/base/config.h"
+
+#if defined(__aarch64__) && defined(__linux__)
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#endif
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <intrin.h>
+#endif
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+#if defined(__x86_64__) || defined(_M_X64)
+
+namespace {
+
+#if !defined(_WIN32) && !defined(_WIN64)
+// MSVC defines this function for us.
+// https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
+static void __cpuid(int cpu_info[4], int info_type) {
+ __asm__ volatile("cpuid \n\t"
+ : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
+ "=d"(cpu_info[3])
+ : "a"(info_type), "c"(0));
+}
+#endif // !defined(_WIN32) && !defined(_WIN64)
+
+enum class Vendor {
+ kUnknown,
+ kIntel,
+ kAmd,
+};
+
+Vendor GetVendor() {
+ // Get the vendor string (issue CPUID with eax = 0).
+ int cpu_info[4];
+ __cpuid(cpu_info, 0);
+
+ TString vendor;
+ vendor.append(reinterpret_cast<char*>(&cpu_info[1]), 4);
+ vendor.append(reinterpret_cast<char*>(&cpu_info[3]), 4);
+ vendor.append(reinterpret_cast<char*>(&cpu_info[2]), 4);
+ if (vendor == "GenuineIntel") {
+ return Vendor::kIntel;
+ } else if (vendor == "AuthenticAMD") {
+ return Vendor::kAmd;
+ } else {
+ return Vendor::kUnknown;
+ }
+}
+
+CpuType GetIntelCpuType() {
+ // To get general information and extended features we send eax = 1 and
+ // ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
+ // (See Intel 64 and IA-32 Architectures Software Developer's Manual
+ // Volume 2A: Instruction Set Reference, A-M CPUID).
+ // https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2a-manual.html
+ // https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
+ int cpu_info[4];
+ __cpuid(cpu_info, 1);
+
+ // Response in eax bits as follows:
+ // 0-3 (stepping id)
+ // 4-7 (model number),
+ // 8-11 (family code),
+ // 12-13 (processor type),
+ // 16-19 (extended model)
+ // 20-27 (extended family)
+
+ int family = (cpu_info[0] >> 8) & 0x0f;
+ int model_num = (cpu_info[0] >> 4) & 0x0f;
+ int ext_family = (cpu_info[0] >> 20) & 0xff;
+ int ext_model_num = (cpu_info[0] >> 16) & 0x0f;
+
+ int brand_id = cpu_info[1] & 0xff;
+
+ // Process the extended family and model info if necessary
+ if (family == 0x0f) {
+ family += ext_family;
+ }
+
+ if (family == 0x0f || family == 0x6) {
+ model_num += (ext_model_num << 4);
+ }
+
+ switch (brand_id) {
+ case 0: // no brand ID, so parse CPU family/model
+ switch (family) {
+ case 6: // Most PentiumIII processors are in this category
+ switch (model_num) {
+ case 0x2c: // Westmere: Gulftown
+ return CpuType::kIntelWestmere;
+ case 0x2d: // Sandybridge
+ return CpuType::kIntelSandybridge;
+ case 0x3e: // Ivybridge
+ return CpuType::kIntelIvybridge;
+ case 0x3c: // Haswell (client)
+ case 0x3f: // Haswell
+ return CpuType::kIntelHaswell;
+ case 0x4f: // Broadwell
+ case 0x56: // BroadwellDE
+ return CpuType::kIntelBroadwell;
+ case 0x55: // Skylake Xeon
+ if ((cpu_info[0] & 0x0f) < 5) { // stepping < 5 is skylake
+ return CpuType::kIntelSkylakeXeon;
+ } else { // stepping >= 5 is cascadelake
+ return CpuType::kIntelCascadelakeXeon;
+ }
+ case 0x5e: // Skylake (client)
+ return CpuType::kIntelSkylake;
+ default:
+ return CpuType::kUnknown;
+ }
+ default:
+ return CpuType::kUnknown;
+ }
+ default:
+ return CpuType::kUnknown;
+ }
+}
+
+CpuType GetAmdCpuType() {
+ // To get general information and extended features we send eax = 1 and
+ // ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
+ // (See Intel 64 and IA-32 Architectures Software Developer's Manual
+ // Volume 2A: Instruction Set Reference, A-M CPUID).
+ // https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
+ int cpu_info[4];
+ __cpuid(cpu_info, 1);
+
+ // Response in eax bits as follows:
+ // 0-3 (stepping id)
+ // 4-7 (model number),
+ // 8-11 (family code),
+ // 12-13 (processor type),
+ // 16-19 (extended model)
+ // 20-27 (extended family)
+
+ int family = (cpu_info[0] >> 8) & 0x0f;
+ int model_num = (cpu_info[0] >> 4) & 0x0f;
+ int ext_family = (cpu_info[0] >> 20) & 0xff;
+ int ext_model_num = (cpu_info[0] >> 16) & 0x0f;
+
+ if (family == 0x0f) {
+ family += ext_family;
+ model_num += (ext_model_num << 4);
+ }
+
+ switch (family) {
+ case 0x17:
+ switch (model_num) {
+ case 0x0: // Stepping Ax
+ case 0x1: // Stepping Bx
+ return CpuType::kAmdNaples;
+ case 0x30: // Stepping Ax
+ case 0x31: // Stepping Bx
+ return CpuType::kAmdRome;
+ default:
+ return CpuType::kUnknown;
+ }
+ break;
+ case 0x19:
+ switch (model_num) {
+ case 0x1: // Stepping B0
+ return CpuType::kAmdMilan;
+ default:
+ return CpuType::kUnknown;
+ }
+ break;
+ default:
+ return CpuType::kUnknown;
+ }
+}
+
+} // namespace
+
+CpuType GetCpuType() {
+ switch (GetVendor()) {
+ case Vendor::kIntel:
+ return GetIntelCpuType();
+ case Vendor::kAmd:
+ return GetAmdCpuType();
+ default:
+ return CpuType::kUnknown;
+ }
+}
+
+bool SupportsArmCRC32PMULL() { return false; }
+
+#elif defined(__aarch64__) && defined(__linux__)
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif
+
+#define Y_ABSL_INTERNAL_AARCH64_ID_REG_READ(id, val) \
+ asm("mrs %0, " #id : "=r"(val))
+
+CpuType GetCpuType() {
+  // MIDR_EL1 is not visible to EL0, but the access will be emulated by
+  // Linux if AT_HWCAP has HWCAP_CPUID set.
+  //
+  // This method is unreliable on heterogeneous computing systems (e.g.,
+  // big.LITTLE), since the value of MIDR_EL1 changes based on the calling
+  // thread.
+ uint64_t hwcaps = getauxval(AT_HWCAP);
+ if (hwcaps & HWCAP_CPUID) {
+ uint64_t midr = 0;
+ Y_ABSL_INTERNAL_AARCH64_ID_REG_READ(MIDR_EL1, midr);
+ uint32_t implementer = (midr >> 24) & 0xff;
+ uint32_t part_number = (midr >> 4) & 0xfff;
+ if (implementer == 0x41 && part_number == 0xd0c) {
+ return CpuType::kArmNeoverseN1;
+ }
+ }
+ return CpuType::kUnknown;
+}
+
+bool SupportsArmCRC32PMULL() {
+ uint64_t hwcaps = getauxval(AT_HWCAP);
+ return (hwcaps & HWCAP_CRC32) && (hwcaps & HWCAP_PMULL);
+}
+
+#else
+
+CpuType GetCpuType() { return CpuType::kUnknown; }
+
+bool SupportsArmCRC32PMULL() { return false; }
+
+#endif
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
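
A worked example of the eax decoding above (a self-contained sketch for illustration only; the sample leaf-1 value 0x000506e3 is an assumed Skylake-client reading, not something this diff contains):

    #include <cstdint>

    // Mirrors the family/model extraction in GetIntelCpuType().
    int DecodedModelNum(uint32_t eax) {
      int family = (eax >> 8) & 0x0f;          // bits 8-11
      int model_num = (eax >> 4) & 0x0f;       // bits 4-7
      int ext_model_num = (eax >> 16) & 0x0f;  // bits 16-19
      if (family == 0x0f || family == 0x6) {
        model_num += (ext_model_num << 4);  // extended model widens the field
      }
      return model_num;
    }

    // DecodedModelNum(0x000506e3) == 0x5e, which the switch above maps to
    // CpuType::kIntelSkylake.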
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/cpu_detect.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/cpu_detect.h
new file mode 100644
index 0000000000..55539d9a52
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/cpu_detect.h
@@ -0,0 +1,57 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_CPU_DETECT_H_
+#define Y_ABSL_CRC_INTERNAL_CPU_DETECT_H_
+
+#include "y_absl/base/config.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+// Enumeration of architectures that we have special-case tuning parameters for.
+// This set may change over time.
+enum class CpuType {
+ kUnknown,
+ kIntelHaswell,
+ kAmdRome,
+ kAmdNaples,
+ kAmdMilan,
+ kIntelCascadelakeXeon,
+ kIntelSkylakeXeon,
+ kIntelBroadwell,
+ kIntelSkylake,
+ kIntelIvybridge,
+ kIntelSandybridge,
+ kIntelWestmere,
+ kArmNeoverseN1,
+};
+
+// Returns the type of host CPU this code is running on. Returns kUnknown if
+// the host CPU is of unknown type, or if detection otherwise fails.
+CpuType GetCpuType();
+
+// Returns whether the host CPU supports the CPU features needed for our
+// accelerated implementations. The CpuTypes enumerated above, apart from
+// kUnknown, support the required features. On unknown CPUs, we can use
+// this to see if it's safe to use hardware acceleration, though without any
+// tuning.
+bool SupportsArmCRC32PMULL();
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_CRC_INTERNAL_CPU_DETECT_H_
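
A minimal caller sketch for this header (assumed usage, not part of this diff): a recognized CpuType implies the needed features, and SupportsArmCRC32PMULL() covers unknown ARM parts.

    #include "y_absl/crc/internal/cpu_detect.h"

    bool MayUseAcceleratedCrc() {
      using y_absl::crc_internal::CpuType;
      // A recognized CPU implies the needed features; otherwise probe
      // explicitly (the probe only returns true on Linux/aarch64 builds).
      return y_absl::crc_internal::GetCpuType() != CpuType::kUnknown ||
             y_absl::crc_internal::SupportsArmCRC32PMULL();
    }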
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc.cc
new file mode 100644
index 0000000000..a5b04de0bb
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc.cc
@@ -0,0 +1,468 @@
+// Copyright 2022 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Implementation of CRCs (aka Rabin Fingerprints).
+// Treats the input as a polynomial with coefficients in Z(2),
+// and finds the remainder when divided by an irreducible polynomial
+// of the appropriate length.
+// It handles all CRC sizes from 8 to 128 bits.
+// It's somewhat complicated by having separate implementations optimized for
+// CRCs <= 32 bits, <= 64 bits, and <= 128 bits.
+// The input string is prefixed with a "1" bit, and has "degree" "0" bits
+// appended to it before the remainder is found. This ensures that
+// short strings are scrambled somewhat and that strings consisting
+// of all nulls have a non-zero CRC.
+//
+// Uses the "interleaved word-by-word" method from
+// "Everything we know about CRC but afraid to forget" by Andrew Kadatch
+// and Bob Jenkins,
+// http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
+//
+// The idea is to compute kStride CRCs simultaneously, allowing the
+// processor to more effectively use multiple execution units. Each of
+// the CRCs is calculated on one word of data followed by kStride - 1
+// words of zeroes; the CRC starting points are staggered by one word.
+// Assuming a stride of 4 with data words "ABCDABCDABCD", the first
+// CRC is over A000A000A, the second over 0B000B000B, and so on.
+// The CRC of the whole data is then calculated by properly aligning the
+// CRCs by appending zeroes until the data lengths agree then XORing
+// the CRCs.
+
+#include "y_absl/crc/internal/crc.h"
+
+#include <cstdint>
+
+#include "y_absl/base/internal/endian.h"
+#include "y_absl/base/internal/prefetch.h"
+#include "y_absl/base/internal/raw_logging.h"
+#include "y_absl/crc/internal/crc_internal.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+namespace {
+
+// Constants
+#if defined(__i386__) || defined(__x86_64__)
+constexpr bool kNeedAlignedLoads = false;
+#else
+constexpr bool kNeedAlignedLoads = true;
+#endif
+
+// We express the number of zeroes as a number in base ZEROES_BASE. By
+// pre-computing the zero extensions for all possible components of such an
+// expression (numbers of the form a*ZEROES_BASE**b), we can calculate the
+// resulting extension by multiplying the extensions for individual components
+// using log_{ZEROES_BASE}(num_zeroes) polynomial multiplications. The tables
+// of zero extensions contain (ZEROES_BASE - 1) * (log_{ZEROES_BASE}(2**64))
+// entries.
+constexpr int ZEROES_BASE_LG = 4; // log_2(ZEROES_BASE)
+constexpr int ZEROES_BASE = (1 << ZEROES_BASE_LG); // must be a power of 2
+
+constexpr uint32_t kCrc32cPoly = 0x82f63b78;
+
+uint32_t ReverseBits(uint32_t bits) {
+ bits = (bits & 0xaaaaaaaau) >> 1 | (bits & 0x55555555u) << 1;
+ bits = (bits & 0xccccccccu) >> 2 | (bits & 0x33333333u) << 2;
+ bits = (bits & 0xf0f0f0f0u) >> 4 | (bits & 0x0f0f0f0fu) << 4;
+ return y_absl::gbswap_32(bits);
+}
+
+// Polynomial long multiplication mod the polynomial of degree 32.
+void PolyMultiply(uint32_t* val, uint32_t m, uint32_t poly) {
+ uint32_t l = *val;
+ uint32_t result = 0;
+ auto onebit = uint32_t{0x80000000u};
+ for (uint32_t one = onebit; one != 0; one >>= 1) {
+ if ((l & one) != 0) {
+ result ^= m;
+ }
+ if (m & 1) {
+ m = (m >> 1) ^ poly;
+ } else {
+ m >>= 1;
+ }
+ }
+ *val = result;
+}
+} // namespace
+
+void CRCImpl::FillWordTable(uint32_t poly, uint32_t last, int word_size,
+ Uint32By256* t) {
+ for (int j = 0; j != word_size; j++) { // for each byte of extension....
+ t[j][0] = 0; // a zero has no effect
+ for (int i = 128; i != 0; i >>= 1) { // fill in entries for powers of 2
+ if (j == 0 && i == 128) {
+ t[j][i] = last; // top bit in last byte is given
+ } else {
+ // each successive power of two is derived from the previous
+ // one, either in this table, or the last table
+ uint32_t pred;
+ if (i == 128) {
+ pred = t[j - 1][1];
+ } else {
+ pred = t[j][i << 1];
+ }
+ // Advance the CRC by one bit (multiply by X, and take remainder
+ // through one step of polynomial long division)
+ if (pred & 1) {
+ t[j][i] = (pred >> 1) ^ poly;
+ } else {
+ t[j][i] = pred >> 1;
+ }
+ }
+ }
+ // CRCs have the property that CRC(a xor b) == CRC(a) xor CRC(b)
+ // so we can make all the tables for non-powers of two by
+ // xoring previously created entries.
+ for (int i = 2; i != 256; i <<= 1) {
+ for (int k = i + 1; k != (i << 1); k++) {
+ t[j][k] = t[j][i] ^ t[j][k - i];
+ }
+ }
+ }
+}
+
+int CRCImpl::FillZeroesTable(uint32_t poly, Uint32By256* t) {
+ uint32_t inc = 1;
+ inc <<= 31;
+
+ // Extend by one zero bit. We know degree > 1 so (inc & 1) == 0.
+ inc >>= 1;
+
+ // Now extend by 2, 4, and 8 bits, so now `inc` is extended by one zero byte.
+ for (int i = 0; i < 3; ++i) {
+ PolyMultiply(&inc, inc, poly);
+ }
+
+ int j = 0;
+ for (uint64_t inc_len = 1; inc_len != 0; inc_len <<= ZEROES_BASE_LG) {
+ // Every entry in the table adds an additional inc_len zeroes.
+ uint32_t v = inc;
+ for (int a = 1; a != ZEROES_BASE; a++) {
+ t[0][j] = v;
+ PolyMultiply(&v, inc, poly);
+ j++;
+ }
+ inc = v;
+ }
+ Y_ABSL_RAW_CHECK(j <= 256, "");
+ return j;
+}
+
+// Internal version of the "constructor".
+CRCImpl* CRCImpl::NewInternal() {
+  // Find an accelerated implementation first.
+  CRCImpl* result = TryNewCRC32AcceleratedX86ARMCombined();
+
+  // Fall back to generic implementations if no acceleration is available.
+ if (result == nullptr) {
+ result = new CRC32();
+ }
+
+ result->InitTables();
+
+ return result;
+}
+
+// The CRC of the empty string is always the CRC polynomial itself.
+void CRCImpl::Empty(uint32_t* crc) const { *crc = kCrc32cPoly; }
+
+// The 32-bit implementation
+
+void CRC32::InitTables() {
+ // Compute the table for extending a CRC by one byte.
+ Uint32By256* t = new Uint32By256[4];
+ FillWordTable(kCrc32cPoly, kCrc32cPoly, 1, t);
+ for (int i = 0; i != 256; i++) {
+ this->table0_[i] = t[0][i];
+ }
+
+  // Construct a table for updating the CRC by 4 bytes of data followed by
+  // 12 bytes of zeroes.
+ //
+ // Note: the data word size could be larger than the CRC size; it might
+ // be slightly faster to use a 64-bit data word, but doing so doubles the
+ // table size.
+ uint32_t last = kCrc32cPoly;
+ const size_t size = 12;
+ for (size_t i = 0; i < size; ++i) {
+ last = (last >> 8) ^ this->table0_[last & 0xff];
+ }
+ FillWordTable(kCrc32cPoly, last, 4, t);
+ for (size_t b = 0; b < 4; ++b) {
+ for (int i = 0; i < 256; ++i) {
+ this->table_[b][i] = t[b][i];
+ }
+ }
+
+ int j = FillZeroesTable(kCrc32cPoly, t);
+ Y_ABSL_RAW_CHECK(j <= static_cast<int>(Y_ABSL_ARRAYSIZE(this->zeroes_)), "");
+ for (int i = 0; i < j; i++) {
+ this->zeroes_[i] = t[0][i];
+ }
+
+ delete[] t;
+
+ // Build up tables for _reversing_ the operation of doing CRC operations on
+ // zero bytes.
+
+ // In C++, extending `crc` by a single zero bit is done by the following:
+ // (A) bool low_bit_set = (crc & 1);
+ // crc >>= 1;
+ // if (low_bit_set) crc ^= kCrc32cPoly;
+ //
+ // In particular note that the high bit of `crc` after this operation will be
+ // set if and only if the low bit of `crc` was set before it. This means that
+ // no information is lost, and the operation can be reversed, as follows:
+ // (B) bool high_bit_set = (crc & 0x80000000u);
+ // if (high_bit_set) crc ^= kCrc32cPoly;
+ // crc <<= 1;
+ // if (high_bit_set) crc ^= 1;
+ //
+ // Or, equivalently:
+ // (C) bool high_bit_set = (crc & 0x80000000u);
+ // crc <<= 1;
+ // if (high_bit_set) crc ^= ((kCrc32cPoly << 1) ^ 1);
+ //
+ // The last observation is, if we store our checksums in variable `rcrc`,
+ // with order of the bits reversed, the inverse operation becomes:
+ // (D) bool low_bit_set = (rcrc & 1);
+ // rcrc >>= 1;
+ // if (low_bit_set) rcrc ^= ReverseBits((kCrc32cPoly << 1) ^ 1)
+ //
+ // This is the same algorithm (A) that we started with, only with a different
+ // polynomial bit pattern. This means that by building up our tables with
+ // this alternate polynomial, we can apply the CRC algorithms to a
+ // bit-reversed CRC checksum to perform inverse zero-extension.
+
+ const uint32_t kCrc32cUnextendPoly =
+ ReverseBits(static_cast<uint32_t>((kCrc32cPoly << 1) ^ 1));
+ FillWordTable(kCrc32cUnextendPoly, kCrc32cUnextendPoly, 1, &reverse_table0_);
+
+ j = FillZeroesTable(kCrc32cUnextendPoly, &reverse_zeroes_);
+ Y_ABSL_RAW_CHECK(j <= static_cast<int>(Y_ABSL_ARRAYSIZE(this->reverse_zeroes_)),
+ "");
+}
+
+void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const {
+ const uint8_t* p = static_cast<const uint8_t*>(bytes);
+ const uint8_t* e = p + length;
+ uint32_t l = *crc;
+
+ auto step_one_byte = [this, &p, &l] () {
+ int c = (l & 0xff) ^ *p++;
+ l = this->table0_[c] ^ (l >> 8);
+ };
+
+ if (kNeedAlignedLoads) {
+    // Point x at the first 4-byte aligned byte in the string. This might be
+    // past the end of the string.
+ const uint8_t* x = RoundUp<4>(p);
+ if (x <= e) {
+ // Process bytes until finished or p is 4-byte aligned
+ while (p != x) {
+ step_one_byte();
+ }
+ }
+ }
+
+ const size_t kSwathSize = 16;
+ if (static_cast<size_t>(e - p) >= kSwathSize) {
+ // Load one swath of data into the operating buffers.
+ uint32_t buf0 = y_absl::little_endian::Load32(p) ^ l;
+ uint32_t buf1 = y_absl::little_endian::Load32(p + 4);
+ uint32_t buf2 = y_absl::little_endian::Load32(p + 8);
+ uint32_t buf3 = y_absl::little_endian::Load32(p + 12);
+ p += kSwathSize;
+
+ // Increment a CRC value by a "swath"; this combines the four bytes
+ // starting at `ptr` and twelve zero bytes, so that four CRCs can be
+ // built incrementally and combined at the end.
+ const auto step_swath = [this](uint32_t crc_in, const std::uint8_t* ptr) {
+ return y_absl::little_endian::Load32(ptr) ^
+ this->table_[3][crc_in & 0xff] ^
+ this->table_[2][(crc_in >> 8) & 0xff] ^
+ this->table_[1][(crc_in >> 16) & 0xff] ^
+ this->table_[0][crc_in >> 24];
+ };
+
+ // Run one CRC calculation step over all swaths in one 16-byte stride
+ const auto step_stride = [&]() {
+ buf0 = step_swath(buf0, p);
+ buf1 = step_swath(buf1, p + 4);
+ buf2 = step_swath(buf2, p + 8);
+ buf3 = step_swath(buf3, p + 12);
+ p += 16;
+ };
+
+ // Process kStride interleaved swaths through the data in parallel.
+ while ((e - p) > kPrefetchHorizon) {
+ base_internal::PrefetchNta(
+ reinterpret_cast<const void*>(p + kPrefetchHorizon));
+ // Process 64 bytes at a time
+ step_stride();
+ step_stride();
+ step_stride();
+ step_stride();
+ }
+ while (static_cast<size_t>(e - p) >= kSwathSize) {
+ step_stride();
+ }
+
+ // Now advance one word at a time as far as possible. This isn't worth
+ // doing if we have word-advance tables.
+ while (static_cast<size_t>(e - p) >= 4) {
+ buf0 = step_swath(buf0, p);
+ uint32_t tmp = buf0;
+ buf0 = buf1;
+ buf1 = buf2;
+ buf2 = buf3;
+ buf3 = tmp;
+ p += 4;
+ }
+
+ // Combine the results from the different swaths. This is just a CRC
+ // on the data values in the bufX words.
+ auto combine_one_word = [this](uint32_t crc_in, uint32_t w) {
+ w ^= crc_in;
+ for (size_t i = 0; i < 4; ++i) {
+ w = (w >> 8) ^ this->table0_[w & 0xff];
+ }
+ return w;
+ };
+
+ l = combine_one_word(0, buf0);
+ l = combine_one_word(l, buf1);
+ l = combine_one_word(l, buf2);
+ l = combine_one_word(l, buf3);
+ }
+
+ // Process the last few bytes
+ while (p != e) {
+ step_one_byte();
+ }
+
+ *crc = l;
+}
+
+void CRC32::ExtendByZeroesImpl(uint32_t* crc, size_t length,
+ const uint32_t zeroes_table[256],
+ const uint32_t poly_table[256]) const {
+ if (length != 0) {
+ uint32_t l = *crc;
+ // For each ZEROES_BASE_LG bits in length
+ // (after the low-order bits have been removed)
+ // we lookup the appropriate polynomial in the zeroes_ array
+ // and do a polynomial long multiplication (mod the CRC polynomial)
+ // to extend the CRC by the appropriate number of bits.
+ for (int i = 0; length != 0;
+ i += ZEROES_BASE - 1, length >>= ZEROES_BASE_LG) {
+ int c = length & (ZEROES_BASE - 1); // pick next ZEROES_BASE_LG bits
+ if (c != 0) { // if they are not zero,
+ // multiply by entry in table
+ // Build a table to aid in multiplying 2 bits at a time.
+ // It takes too long to build tables for more bits.
+ uint64_t m = zeroes_table[c + i - 1];
+ m <<= 1;
+ uint64_t m2 = m << 1;
+ uint64_t mtab[4] = {0, m, m2, m2 ^ m};
+
+ // Do the multiply one byte at a time.
+ uint64_t result = 0;
+ for (int x = 0; x < 32; x += 8) {
+ // The carry-less multiply.
+ result ^= mtab[l & 3] ^ (mtab[(l >> 2) & 3] << 2) ^
+ (mtab[(l >> 4) & 3] << 4) ^ (mtab[(l >> 6) & 3] << 6);
+ l >>= 8;
+
+ // Reduce modulo the polynomial
+ result = (result >> 8) ^ poly_table[result & 0xff];
+ }
+ l = static_cast<uint32_t>(result);
+ }
+ }
+ *crc = l;
+ }
+}
+
+void CRC32::ExtendByZeroes(uint32_t* crc, size_t length) const {
+ return CRC32::ExtendByZeroesImpl(crc, length, zeroes_, table0_);
+}
+
+void CRC32::UnextendByZeroes(uint32_t* crc, size_t length) const {
+ // See the comment in CRC32::InitTables() for an explanation of the algorithm
+ // below.
+ *crc = ReverseBits(*crc);
+ ExtendByZeroesImpl(crc, length, reverse_zeroes_, reverse_table0_);
+ *crc = ReverseBits(*crc);
+}
+
+void CRC32::Scramble(uint32_t* crc) const {
+ // Rotate by near half the word size plus 1. See the scramble comment in
+ // crc_internal.h for an explanation.
+ constexpr int scramble_rotate = (32 / 2) + 1;
+ *crc = RotateRight<uint32_t>(static_cast<unsigned int>(*crc + kScrambleLo),
+ 32, scramble_rotate) &
+ MaskOfLength<uint32_t>(32);
+}
+
+void CRC32::Unscramble(uint32_t* crc) const {
+ constexpr int scramble_rotate = (32 / 2) + 1;
+ uint64_t rotated = RotateRight<uint32_t>(static_cast<unsigned int>(*crc), 32,
+ 32 - scramble_rotate);
+ *crc = (rotated - kScrambleLo) & MaskOfLength<uint32_t>(32);
+}
+
+// Constructor and destructor for base class CRC.
+CRC::~CRC() {}
+CRC::CRC() {}
+
+// The "constructor" for a CRC32C with a standard polynomial.
+CRC* CRC::Crc32c() {
+ static CRC* singleton = CRCImpl::NewInternal();
+ return singleton;
+}
+
+// This Concat implementation works for arbitrary polynomials.
+void CRC::Concat(uint32_t* px, uint32_t y, size_t ylen) {
+ // https://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks
+  // The CRC of a message M is the remainder of polynomial division modulo G,
+ // where the coefficient arithmetic is performed modulo 2 (so +/- are XOR):
+ // R(x) = M(x) x**n (mod G)
+ // (n is the degree of G)
+ // In practice, we use an initial value A and a bitmask B to get
+ // R = (A ^ B)x**|M| ^ Mx**n ^ B (mod G)
+ // If M is the concatenation of two strings S and T, and Z is the string of
+ // len(T) 0s, then the remainder CRC(ST) can be expressed as:
+ // R = (A ^ B)x**|ST| ^ STx**n ^ B
+ // = (A ^ B)x**|SZ| ^ SZx**n ^ B ^ Tx**n
+ // = CRC(SZ) ^ Tx**n
+ // CRC(Z) = (A ^ B)x**|T| ^ B
+ // CRC(T) = (A ^ B)x**|T| ^ Tx**n ^ B
+ // So R = CRC(SZ) ^ CRC(Z) ^ CRC(T)
+ //
+ // And further, since CRC(SZ) = Extend(CRC(S), Z),
+ // CRC(SZ) ^ CRC(Z) = Extend(CRC(S) ^ CRC(''), Z).
+ uint32_t z;
+ uint32_t t;
+ Empty(&z);
+ t = *px ^ z;
+ ExtendByZeroes(&t, ylen);
+ *px = t ^ y;
+}
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
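
A hedged test sketch (an assumed harness, not part of this diff) of the Concat() identity derived above: the CRC of "helloworld" can be produced from the CRCs of "hello" and "world" without touching the bytes again.

    #include <cassert>
    #include <cstdint>

    #include "y_absl/crc/internal/crc.h"

    void ConcatMatchesExtend() {
      using y_absl::crc_internal::CRC;
      CRC* crc = CRC::Crc32c();

      uint32_t s, t, st;
      crc->Empty(&s);
      crc->Extend(&s, "hello", 5);
      crc->Empty(&t);
      crc->Extend(&t, "world", 5);
      crc->Empty(&st);
      crc->Extend(&st, "helloworld", 10);

      crc->Concat(&s, t, 5);  // s is now the CRC of "hello" followed by "world"
      assert(s == st);
    }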
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc.h
new file mode 100644
index 0000000000..051015a5f3
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc.h
@@ -0,0 +1,91 @@
+// Copyright 2022 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_CRC_H_
+#define Y_ABSL_CRC_INTERNAL_CRC_H_
+
+#include <cstdint>
+
+#include "y_absl/base/config.h"
+
+// This class implements CRCs (aka Rabin Fingerprints).
+// Treats the input as a polynomial with coefficients in Z(2),
+// and finds the remainder when divided by a primitive polynomial
+// of the appropriate length.
+
+// A polynomial is represented by the bit pattern formed by its coefficients,
+// but with the highest order bit not stored.
+// The highest degree coefficient is stored in the lowest numbered bit
+// in the lowest addressed byte. Thus, in what follows, the highest degree
+// coefficient that is stored is in the low order bit of "lo" or "*lo".
+
+// Hardware acceleration is used when available.
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+class CRC {
+ public:
+ virtual ~CRC();
+
+ // Place the CRC of the empty string in "*crc"
+ virtual void Empty(uint32_t* crc) const = 0;
+
+ // If "*crc" is the CRC of bytestring A, place the CRC of
+ // the bytestring formed from the concatenation of A and the "length"
+ // bytes at "bytes" into "*crc".
+ virtual void Extend(uint32_t* crc, const void* bytes,
+ size_t length) const = 0;
+
+ // Equivalent to Extend(crc, bytes, length) where "bytes"
+ // points to an array of "length" zero bytes.
+ virtual void ExtendByZeroes(uint32_t* crc, size_t length) const = 0;
+
+  // Inverse operation of ExtendByZeroes. If `crc` is the CRC value of a string
+ // ending in `length` zero bytes, this returns a CRC value of that string
+ // with those zero bytes removed.
+ virtual void UnextendByZeroes(uint32_t* crc, size_t length) const = 0;
+
+ // If *px is the CRC (as defined by *crc) of some string X,
+ // and y is the CRC of some string Y that is ylen bytes long, set
+ // *px to the CRC of the concatenation of X followed by Y.
+ virtual void Concat(uint32_t* px, uint32_t y, size_t ylen);
+
+ // Apply a non-linear transformation to "*crc" so that
+ // it is safe to CRC the result with the same polynomial without
+ // any reduction of error-detection ability in the outer CRC.
+ // Unscramble() performs the inverse transformation.
+ // It is strongly recommended that CRCs be scrambled before storage or
+  // transmission, and unscrambled at the other end before further manipulation.
+ virtual void Scramble(uint32_t* crc) const = 0;
+ virtual void Unscramble(uint32_t* crc) const = 0;
+
+ // Crc32c() returns the singleton implementation of CRC for the CRC32C
+ // polynomial. Returns a handle that MUST NOT be destroyed with delete.
+ static CRC* Crc32c();
+
+ protected:
+ CRC(); // Clients may not call constructor; use Crc32c() instead.
+
+ private:
+ CRC(const CRC&) = delete;
+ CRC& operator=(const CRC&) = delete;
+};
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_CRC_INTERNAL_CRC_H_
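
A short sketch of the intended Scramble() discipline (an assumed caller, not from this diff): scramble a checksum before storing it and unscramble after loading it, so an outer CRC over the stored bytes keeps its full error-detection power.

    #include <cstddef>
    #include <cstdint>

    #include "y_absl/crc/internal/crc.h"

    uint32_t ChecksumForStorage(const void* data, size_t n) {
      using y_absl::crc_internal::CRC;
      CRC* crc = CRC::Crc32c();
      uint32_t c;
      crc->Empty(&c);
      crc->Extend(&c, data, n);
      crc->Scramble(&c);  // the reader calls Unscramble() before using the value
      return c;
    }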
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32_x86_arm_combined_simd.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32_x86_arm_combined_simd.h
new file mode 100644
index 0000000000..3e49100c8d
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -0,0 +1,269 @@
+// Copyright 2022 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
+#define Y_ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
+
+#include <cstdint>
+
+#include "y_absl/base/config.h"
+
+// -------------------------------------------------------------------------
+// Many x86 and ARM machines have CRC acceleration hardware.
+// We can do a faster version of Extend() on such machines.
+// We define a translation layer for both x86 and ARM for ease of use and to
+// retain most of the performance gains.
+
+// This implementation requires 64-bit CRC instructions (part of SSE 4.2) and
+// PCLMULQDQ instructions. 32-bit builds with SSE 4.2 do exist, so the
+// __x86_64__ condition is necessary.
+#if defined(__x86_64__) && defined(__SSE4_2__) && defined(__PCLMUL__)
+
+#include <x86intrin.h>
+#define Y_ABSL_CRC_INTERNAL_HAVE_X86_SIMD
+
+#elif defined(_MSC_VER) && defined(__AVX__)
+
+// MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
+#include <intrin.h>
+#define Y_ABSL_CRC_INTERNAL_HAVE_X86_SIMD
+
+#elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
+ defined(__ARM_FEATURE_CRC32) && defined(Y_ABSL_INTERNAL_HAVE_ARM_NEON) && \
+ defined(__ARM_FEATURE_CRYPTO)
+
+#include <arm_acle.h>
+#include <arm_neon.h>
+#define Y_ABSL_CRC_INTERNAL_HAVE_ARM_SIMD
+
+#endif
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+#if defined(Y_ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
+ defined(Y_ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
+
+#if defined(Y_ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+using V128 = uint64x2_t;
+#else
+using V128 = __m128i;
+#endif
+
+// Starting with the initial value in |crc|, accumulates a CRC32 value for
+// unsigned integers of different sizes.
+uint32_t CRC32_u8(uint32_t crc, uint8_t v);
+
+uint32_t CRC32_u16(uint32_t crc, uint16_t v);
+
+uint32_t CRC32_u32(uint32_t crc, uint32_t v);
+
+uint32_t CRC32_u64(uint32_t crc, uint64_t v);
+
+// Loads 128 bits of integer data. |src| must be 16-byte aligned.
+V128 V128_Load(const V128* src);
+
+// Loads 128 bits of integer data. |src| does not need to be aligned.
+V128 V128_LoadU(const V128* src);
+
+// Polynomially multiplies the high 64 bits of |l| and |r|.
+V128 V128_PMulHi(const V128 l, const V128 r);
+
+// Polynomially multiplies the low 64 bits of |l| and |r|.
+V128 V128_PMulLow(const V128 l, const V128 r);
+
+// Polynomially multiplies the low 64 bits of |r| and high 64 bits of |l|.
+V128 V128_PMul01(const V128 l, const V128 r);
+
+// Polynomially multiplies the low 64 bits of |l| and high 64 bits of |r|.
+V128 V128_PMul10(const V128 l, const V128 r);
+
+// Produces a XOR operation of |l| and |r|.
+V128 V128_Xor(const V128 l, const V128 r);
+
+// Produces an AND operation of |l| and |r|.
+V128 V128_And(const V128 l, const V128 r);
+
+// Combines two 64-bit integers into one 128-bit vector, in reverse order:
+// dst[63:0] := |r|
+// dst[127:64] := |l|
+V128 V128_From2x64(const uint64_t l, const uint64_t r);
+
+// Shifts |l| right by |imm| bytes while shifting in zeros.
+template <int imm>
+V128 V128_ShiftRight(const V128 l);
+
+// Extracts a 32-bit integer from |l|, selected with |imm|.
+template <int imm>
+int V128_Extract32(const V128 l);
+
+// Extracts the low 64 bits from V128.
+int64_t V128_Low64(const V128 l);
+
+// Left-shifts packed 64-bit integers in l by r.
+V128 V128_ShiftLeft64(const V128 l, const V128 r);
+
+#endif
+
+#if defined(Y_ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
+
+inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) {
+ return _mm_crc32_u8(crc, v);
+}
+
+inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
+ return _mm_crc32_u16(crc, v);
+}
+
+inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
+ return _mm_crc32_u32(crc, v);
+}
+
+inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
+ return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
+}
+
+inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
+
+inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+
+inline V128 V128_PMulHi(const V128 l, const V128 r) {
+ return _mm_clmulepi64_si128(l, r, 0x11);
+}
+
+inline V128 V128_PMulLow(const V128 l, const V128 r) {
+ return _mm_clmulepi64_si128(l, r, 0x00);
+}
+
+inline V128 V128_PMul01(const V128 l, const V128 r) {
+ return _mm_clmulepi64_si128(l, r, 0x01);
+}
+
+inline V128 V128_PMul10(const V128 l, const V128 r) {
+ return _mm_clmulepi64_si128(l, r, 0x10);
+}
+
+inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
+
+inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
+
+inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
+ return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r));
+}
+
+template <int imm>
+inline V128 V128_ShiftRight(const V128 l) {
+ return _mm_srli_si128(l, imm);
+}
+
+template <int imm>
+inline int V128_Extract32(const V128 l) {
+ return _mm_extract_epi32(l, imm);
+}
+
+inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
+
+inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
+ return _mm_sll_epi64(l, r);
+}
+
+#elif defined(Y_ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+
+inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); }
+
+inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
+ return __crc32ch(crc, v);
+}
+
+inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
+ return __crc32cw(crc, v);
+}
+
+inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
+ return __crc32cd(crc, v);
+}
+
+inline V128 V128_Load(const V128* src) {
+ return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
+}
+
+inline V128 V128_LoadU(const V128* src) {
+ return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
+}
+
+// Using inline assembly as clang does not generate the pmull2 instruction and
+// performance drops by 15-20%.
+// TODO(b/193678732): Investigate why the compiler decides not to generate
+// such instructions and why it becomes so much worse.
+inline V128 V128_PMulHi(const V128 l, const V128 r) {
+ uint64x2_t res;
+ __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
+ : "=w"(res)
+ : "w"(l), "w"(r));
+ return res;
+}
+
+inline V128 V128_PMulLow(const V128 l, const V128 r) {
+ return reinterpret_cast<V128>(vmull_p64(
+ reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
+ reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
+}
+
+inline V128 V128_PMul01(const V128 l, const V128 r) {
+ return reinterpret_cast<V128>(vmull_p64(
+ reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))),
+ reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
+}
+
+inline V128 V128_PMul10(const V128 l, const V128 r) {
+ return reinterpret_cast<V128>(vmull_p64(
+ reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
+ reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r)))));
+}
+
+inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
+
+inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }
+
+inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
+ return vcombine_u64(vcreate_u64(r), vcreate_u64(l));
+}
+
+template <int imm>
+inline V128 V128_ShiftRight(const V128 l) {
+ return vreinterpretq_u64_s8(
+ vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
+}
+
+template <int imm>
+inline int V128_Extract32(const V128 l) {
+ return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
+}
+
+inline int64_t V128_Low64(const V128 l) {
+ return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
+}
+
+inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
+ return vshlq_u64(l, vreinterpretq_s64_u64(r));
+}
+
+#endif
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
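
For reference, a scalar model of the carry-less multiplication that V128_PMulLow() maps to hardware (a hedged sketch, not part of this diff): shifted copies of one operand are XORed rather than added, i.e. multiplication of polynomials over GF(2). The PCLMULQDQ and PMULL instructions compute this in a few cycles.

    #include <cstdint>
    #include <utility>

    // Returns the 128-bit carry-less product of a and b as a (high, low) pair.
    std::pair<uint64_t, uint64_t> ClmulScalar(uint64_t a, uint64_t b) {
      uint64_t lo = 0, hi = 0;
      for (int i = 0; i < 64; ++i) {
        if ((b >> i) & 1) {
          lo ^= a << i;                     // low half of the shifted copy
          if (i != 0) hi ^= a >> (64 - i);  // bits carried into the high half
        }
      }
      return {hi, lo};
    }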
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32c.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32c.h
new file mode 100644
index 0000000000..9778c85fb9
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32c.h
@@ -0,0 +1,39 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_CRC32C_H_
+#define Y_ABSL_CRC_INTERNAL_CRC32C_H_
+
+#include "y_absl/base/config.h"
+#include "y_absl/crc/crc32c.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+// Modifies a CRC32 value by removing `length` bytes with a value of 0 from
+// the end of the string.
+//
+// This is the inverse operation of ExtendCrc32cByZeroes().
+//
+// This operation has a runtime cost of O(log(`length`))
+//
+// Internal implementation detail, exposed for testing only.
+crc32c_t UnextendCrc32cByZeroes(crc32c_t initial_crc, size_t length);
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_CRC_INTERNAL_CRC32C_H_
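
A hedged round-trip sketch (an assumed test, not from this diff): UnextendCrc32cByZeroes() inverts the public y_absl::ExtendCrc32cByZeroes().

    #include <cassert>

    #include "y_absl/crc/crc32c.h"
    #include "y_absl/crc/internal/crc32c.h"

    void UnextendRoundTrip() {
      y_absl::crc32c_t crc = y_absl::ComputeCrc32c("hello");
      y_absl::crc32c_t extended = y_absl::ExtendCrc32cByZeroes(crc, 100);
      assert(y_absl::crc_internal::UnextendCrc32cByZeroes(extended, 100) == crc);
    }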
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32c_inline.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32c_inline.h
new file mode 100644
index 0000000000..96f7f7892e
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc32c_inline.h
@@ -0,0 +1,72 @@
+// Copyright 2022 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_CRC32C_INLINE_H_
+#define Y_ABSL_CRC_INTERNAL_CRC32C_INLINE_H_
+
+#include <cstdint>
+
+#include "y_absl/base/config.h"
+#include "y_absl/base/internal/endian.h"
+#include "y_absl/crc/internal/crc32_x86_arm_combined_simd.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+// CRC32C implementation optimized for small inputs.
+// Either computes the CRC and returns true, or does nothing and returns
+// false if there is no hardware support.
+inline bool ExtendCrc32cInline(uint32_t* crc, const char* p, size_t n) {
+#if defined(Y_ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
+ defined(Y_ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
+ constexpr uint32_t kCrc32Xor = 0xffffffffU;
+ *crc ^= kCrc32Xor;
+ if (n & 1) {
+ *crc = CRC32_u8(*crc, static_cast<uint8_t>(*p));
+ n--;
+ p++;
+ }
+ if (n & 2) {
+ *crc = CRC32_u16(*crc, y_absl::little_endian::Load16(p));
+ n -= 2;
+ p += 2;
+ }
+ if (n & 4) {
+ *crc = CRC32_u32(*crc, y_absl::little_endian::Load32(p));
+ n -= 4;
+ p += 4;
+ }
+ while (n) {
+ *crc = CRC32_u64(*crc, y_absl::little_endian::Load64(p));
+ n -= 8;
+ p += 8;
+ }
+ *crc ^= kCrc32Xor;
+ return true;
+#else
+ // No hardware support, signal the need to fallback.
+ static_cast<void>(crc);
+ static_cast<void>(p);
+ static_cast<void>(n);
+ return false;
+#endif // defined(Y_ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) ||
+ // defined(Y_ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
+}
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_CRC_INTERNAL_CRC32C_INLINE_H_
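
A minimal fallback pattern for the function above (an assumed caller; SlowExtend is a hypothetical stand-in for a table-driven implementation, not something this diff defines):

    #include <cstddef>
    #include <cstdint>

    #include "y_absl/crc/internal/crc32c_inline.h"

    uint32_t SlowExtend(uint32_t crc, const char* p, size_t n);  // hypothetical

    uint32_t ExtendSmall(uint32_t crc, const char* p, size_t n) {
      if (y_absl::crc_internal::ExtendCrc32cInline(&crc, p, n)) {
        return crc;  // hardware CRC instructions handled it
      }
      return SlowExtend(crc, p, n);  // built without SSE4.2/ARMv8-CRC support
    }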
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_cord_state.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_cord_state.cc
new file mode 100644
index 0000000000..4e8e88bce7
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_cord_state.cc
@@ -0,0 +1,130 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "y_absl/crc/internal/crc_cord_state.h"
+
+#include <cassert>
+
+#include "y_absl/base/config.h"
+#include "y_absl/numeric/bits.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+CrcCordState::RefcountedRep* CrcCordState::RefSharedEmptyRep() {
+ static CrcCordState::RefcountedRep* empty = new CrcCordState::RefcountedRep;
+
+ assert(empty->count.load(std::memory_order_relaxed) >= 1);
+ assert(empty->rep.removed_prefix.length == 0);
+ assert(empty->rep.prefix_crc.empty());
+
+ Ref(empty);
+ return empty;
+}
+
+CrcCordState::CrcCordState() : refcounted_rep_(new RefcountedRep) {}
+
+CrcCordState::CrcCordState(const CrcCordState& other)
+ : refcounted_rep_(other.refcounted_rep_) {
+ Ref(refcounted_rep_);
+}
+
+CrcCordState::CrcCordState(CrcCordState&& other)
+ : refcounted_rep_(other.refcounted_rep_) {
+ // Make `other` valid for use after move.
+ other.refcounted_rep_ = RefSharedEmptyRep();
+}
+
+CrcCordState& CrcCordState::operator=(const CrcCordState& other) {
+ if (this != &other) {
+ Unref(refcounted_rep_);
+ refcounted_rep_ = other.refcounted_rep_;
+ Ref(refcounted_rep_);
+ }
+ return *this;
+}
+
+CrcCordState& CrcCordState::operator=(CrcCordState&& other) {
+ if (this != &other) {
+ Unref(refcounted_rep_);
+ refcounted_rep_ = other.refcounted_rep_;
+ // Make `other` valid for use after move.
+ other.refcounted_rep_ = RefSharedEmptyRep();
+ }
+ return *this;
+}
+
+CrcCordState::~CrcCordState() {
+ Unref(refcounted_rep_);
+}
+
+crc32c_t CrcCordState::Checksum() const {
+ if (rep().prefix_crc.empty()) {
+ return y_absl::crc32c_t{0};
+ }
+ if (IsNormalized()) {
+ return rep().prefix_crc.back().crc;
+ }
+ return y_absl::RemoveCrc32cPrefix(
+ rep().removed_prefix.crc, rep().prefix_crc.back().crc,
+ rep().prefix_crc.back().length - rep().removed_prefix.length);
+}
+
+CrcCordState::PrefixCrc CrcCordState::NormalizedPrefixCrcAtNthChunk(
+ size_t n) const {
+ assert(n < NumChunks());
+ if (IsNormalized()) {
+ return rep().prefix_crc[n];
+ }
+ size_t length = rep().prefix_crc[n].length - rep().removed_prefix.length;
+ return PrefixCrc(length,
+ y_absl::RemoveCrc32cPrefix(rep().removed_prefix.crc,
+ rep().prefix_crc[n].crc, length));
+}
+
+void CrcCordState::Normalize() {
+ if (IsNormalized() || rep().prefix_crc.empty()) {
+ return;
+ }
+
+ Rep* r = mutable_rep();
+ for (auto& prefix_crc : r->prefix_crc) {
+ size_t remaining = prefix_crc.length - r->removed_prefix.length;
+ prefix_crc.crc = y_absl::RemoveCrc32cPrefix(r->removed_prefix.crc,
+ prefix_crc.crc, remaining);
+ prefix_crc.length = remaining;
+ }
+ r->removed_prefix = PrefixCrc();
+}
+
+void CrcCordState::Poison() {
+ Rep* rep = mutable_rep();
+ if (NumChunks() > 0) {
+ for (auto& prefix_crc : rep->prefix_crc) {
+ // This is basically CRC32::Scramble().
+ uint32_t crc = static_cast<uint32_t>(prefix_crc.crc);
+ crc += 0x2e76e41b;
+ crc = y_absl::rotr(crc, 17);
+ prefix_crc.crc = crc32c_t{crc};
+ }
+ } else {
+ // Add a fake corrupt chunk.
+ rep->prefix_crc.push_back(PrefixCrc(0, crc32c_t{1}));
+ }
+}
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
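
A numeric sketch of the removed_prefix arithmetic in Checksum() above (an assumed test, not part of this diff): with chunks for "hello" (length 5) and "helloworld" (length 10), marking "hello" as removed makes the state describe "world".

    #include <cassert>

    #include "y_absl/crc/crc32c.h"
    #include "y_absl/crc/internal/crc_cord_state.h"

    void RemovedPrefixChecksum() {
      using y_absl::crc_internal::CrcCordState;
      CrcCordState state;
      CrcCordState::Rep* rep = state.mutable_rep();
      rep->prefix_crc.emplace_back(5, y_absl::ComputeCrc32c("hello"));
      rep->prefix_crc.emplace_back(10, y_absl::ComputeCrc32c("helloworld"));
      assert(state.Checksum() == y_absl::ComputeCrc32c("helloworld"));

      rep->removed_prefix =
          CrcCordState::PrefixCrc(5, y_absl::ComputeCrc32c("hello"));
      assert(state.Checksum() == y_absl::ComputeCrc32c("world"));
      state.Normalize();  // folds the removed prefix back into the chunks
    }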
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_cord_state.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_cord_state.h
new file mode 100644
index 0000000000..610b0fbbc6
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_cord_state.h
@@ -0,0 +1,159 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_CRC_CORD_STATE_H_
+#define Y_ABSL_CRC_INTERNAL_CRC_CORD_STATE_H_
+
+#include <atomic>
+#include <cstddef>
+#include <deque>
+
+#include "y_absl/base/config.h"
+#include "y_absl/crc/crc32c.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+// CrcCordState is a copy-on-write class that holds the chunked CRC32C data
+// that allows CrcCord to perform efficient substring operations. CrcCordState
+// is used as a member variable in CrcCord. When a CrcCord is converted to a
+// Cord, the CrcCordState is shallow-copied into the root node of the Cord. If
+// the converted Cord is modified outside of CrcCord, the CrcCordState is
+// discarded from the Cord. If the Cord is converted back to a CrcCord, and the
+// Cord is still carrying the CrcCordState in its root node, the CrcCord can
+// re-use the CrcCordState, making the construction of the CrcCord cheap.
+//
+// CrcCordState does not try to encapsulate the CRC32C state (CrcCord requires
+// knowledge of how CrcCordState represents the CRC32C state). It does
+// encapsulate the copy-on-write nature of the state.
+class CrcCordState {
+ public:
+ // Constructors.
+ CrcCordState();
+ CrcCordState(const CrcCordState&);
+ CrcCordState(CrcCordState&&);
+
+ // Destructor. Atomically unreferences the data.
+ ~CrcCordState();
+
+ // Copy and move operators.
+ CrcCordState& operator=(const CrcCordState&);
+ CrcCordState& operator=(CrcCordState&&);
+
+ // A (length, crc) pair.
+ struct PrefixCrc {
+ PrefixCrc() = default;
+ PrefixCrc(size_t length_arg, y_absl::crc32c_t crc_arg)
+ : length(length_arg), crc(crc_arg) {}
+
+ size_t length = 0;
+
+ // TODO(y_absl-team): Memory stomping often zeros out memory. If this struct
+ // gets overwritten, we could end up with {0, 0}, which is the correct CRC
+ // for a string of length 0. Consider storing a scrambled value and
+ // unscrambling it before verifying it.
+ y_absl::crc32c_t crc = y_absl::crc32c_t{0};
+ };
+
+ // The representation of the chunked CRC32C data.
+ struct Rep {
+ // `removed_prefix` is the crc and length of any prefix that has been
+ // removed from the Cord (for example, by calling
+    // `CrcCord::RemovePrefix()`). To get the checksum of any prefix of the cord,
+ // this value must be subtracted from `prefix_crc`. See `Checksum()` for an
+ // example.
+ //
+ // CrcCordState is said to be "normalized" if removed_prefix.length == 0.
+ PrefixCrc removed_prefix;
+
+ // A deque of (length, crc) pairs, representing length and crc of a prefix
+ // of the Cord, before removed_prefix has been subtracted. The lengths of
+ // the prefixes are stored in increasing order. If the Cord is not empty,
+    // the last value in the deque contains the CRC32C of the entire Cord
+    // once removed_prefix is subtracted from it.
+ std::deque<PrefixCrc> prefix_crc;
+ };
+
+ // Returns a reference to the representation of the chunked CRC32C data.
+ const Rep& rep() const { return refcounted_rep_->rep; }
+
+ // Returns a mutable reference to the representation of the chunked CRC32C
+ // data. Calling this function will copy the data if another instance also
+ // holds a reference to the data, so it is important to call rep() instead if
+ // the data may not be mutated.
+ Rep* mutable_rep() {
+ if (refcounted_rep_->count.load(std::memory_order_acquire) != 1) {
+ RefcountedRep* copy = new RefcountedRep;
+ copy->rep = refcounted_rep_->rep;
+ Unref(refcounted_rep_);
+ refcounted_rep_ = copy;
+ }
+ return &refcounted_rep_->rep;
+ }
+
+ // Returns the CRC32C of the entire Cord.
+ y_absl::crc32c_t Checksum() const;
+
+ // Returns true if the chunked CRC32C cached is normalized.
+ bool IsNormalized() const { return rep().removed_prefix.length == 0; }
+
+  // Normalizes the chunked CRC32C checksum cache by subtracting any removed
+ // prefix from the chunks.
+ void Normalize();
+
+ // Returns the number of cached chunks.
+ size_t NumChunks() const { return rep().prefix_crc.size(); }
+
+  // Helper that returns the (length, crc) of the `n`-th cached chunk.
+ PrefixCrc NormalizedPrefixCrcAtNthChunk(size_t n) const;
+
+  // Poisons all chunks so that Checksum() will very likely be incorrect.
+ void Poison();
+
+ private:
+ struct RefcountedRep {
+ std::atomic<int32_t> count{1};
+ Rep rep;
+ };
+
+ // Adds a reference to the shared global empty `RefcountedRep`, and returns a
+ // pointer to the `RefcountedRep`. This is an optimization to avoid unneeded
+ // allocations when the allocation is unlikely to ever be used. The returned
+ // pointer can be `Unref()`ed when it is no longer needed. Since the returned
+ // instance will always have a reference counter greater than 1, attempts to
+ // modify it (by calling `mutable_rep()`) will create a new unshared copy.
+ static RefcountedRep* RefSharedEmptyRep();
+
+ static void Ref(RefcountedRep* r) {
+ assert(r != nullptr);
+ r->count.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ static void Unref(RefcountedRep* r) {
+ assert(r != nullptr);
+ if (r->count.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+ delete r;
+ }
+ }
+
+ RefcountedRep* refcounted_rep_;
+};
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_CRC_INTERNAL_CRC_CORD_STATE_H_
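
A copy-on-write sketch for this class (an assumed caller, not from this diff): copies share one RefcountedRep until mutable_rep() detaches.

    #include <cassert>

    #include "y_absl/crc/internal/crc_cord_state.h"

    void CopyOnWrite() {
      using y_absl::crc_internal::CrcCordState;
      CrcCordState a;
      a.mutable_rep()->prefix_crc.emplace_back(1, y_absl::crc32c_t{0x12345678});

      CrcCordState b = a;                   // shares a's rep; refcount becomes 2
      b.mutable_rep()->prefix_crc.clear();  // refcount > 1, so b copies first

      assert(a.NumChunks() == 1);  // a is unaffected by the mutation of b
      assert(b.NumChunks() == 0);
    }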
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_internal.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_internal.h
new file mode 100644
index 0000000000..591b5200ee
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_internal.h
@@ -0,0 +1,179 @@
+// Copyright 2022 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_CRC_INTERNAL_H_
+#define Y_ABSL_CRC_INTERNAL_CRC_INTERNAL_H_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "y_absl/base/internal/raw_logging.h"
+#include "y_absl/crc/internal/crc.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+
+namespace crc_internal {
+
+// Prefetch constants used in some Extend() implementations
+constexpr int kPrefetchHorizon = Y_ABSL_CACHELINE_SIZE * 4; // Prefetch this far
+// Shorter prefetch distance for smaller buffers
+constexpr int kPrefetchHorizonMedium = Y_ABSL_CACHELINE_SIZE * 1;
+static_assert(kPrefetchHorizon >= 64, "CRCPrefetchHorizon less than loop len");
+
+// We require the Scramble() function:
+// - to be reversible (Unscramble() must exist)
+// - to be non-linear in the polynomial's Galois field (so the CRC of a
+// scrambled CRC is not linearly affected by the scrambled CRC, even if
+// using the same polynomial)
+// - not to be its own inverse. Preferably, if X=Scramble^N(X) and N!=0, then
+// N is large.
+// - to be fast.
+// - not to change once defined.
+// We introduce non-linearity in two ways:
+// Addition of a constant.
+// - The carries introduce non-linearity; we use bits of an irrational
+// (phi) to make it unlikely that we introduce no carries.
+// Rotate by a constant number of bits.
+// - We use floor(degree/2)+1, which does not divide the degree, and
+// splits the bits nearly evenly, which makes it less likely the
+// halves will be the same or one will be all zeroes.
+// We do both things to improve the chances of non-linearity in the face of
+// bit patterns with low numbers of bits set, while still being fast.
+// Below is the constant that we add. The bits are the first 128 bits of the
+// fractional part of phi, with a 1 ored into the bottom bit to maximize the
+// cycle length of repeated adds.
+constexpr uint64_t kScrambleHi = (static_cast<uint64_t>(0x4f1bbcdcU) << 32) |
+ static_cast<uint64_t>(0xbfa53e0aU);
+constexpr uint64_t kScrambleLo = (static_cast<uint64_t>(0xf9ce6030U) << 32) |
+ static_cast<uint64_t>(0x2e76e41bU);
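+
+// For illustration only, a minimal sketch (not the production Scramble()
+// implementation) of how a 32-bit scramble could combine the two steps above,
+// using the RotateRight() helper declared later in this header and the low
+// word of the constant:
+//
+//   uint32_t ScrambleSketch(uint32_t crc) {
+//     constexpr int kRotate = (32 / 2) + 1;  // floor(degree/2) + 1
+//     // Adding the phi-derived constant introduces carries (non-linearity);
+//     // the rotation then splits the bits nearly evenly.
+//     return RotateRight<uint32_t>(
+//         crc + static_cast<uint32_t>(kScrambleLo), 32, kRotate);
+//   }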
+
+class CRCImpl : public CRC {  // Implementation of the abstract class CRC
+ public:
+ using Uint32By256 = uint32_t[256];
+
+ CRCImpl() {}
+ ~CRCImpl() override = default;
+
+ // The internal version of CRC::New().
+ static CRCImpl* NewInternal();
+
+ void Empty(uint32_t* crc) const override;
+
+  // Fill in a table for updating a CRC by one word of 'word_size' bytes.
+ // [last_lo, last_hi] contains the answer if the last bit in the word
+ // is set.
+ static void FillWordTable(uint32_t poly, uint32_t last, int word_size,
+ Uint32By256* t);
+
+ // Build the table for extending by zeroes, returning the number of entries.
+ // For a in {1, 2, ..., ZEROES_BASE-1}, b in {0, 1, 2, 3, ...},
+ // entry j=a-1+(ZEROES_BASE-1)*b
+ // contains a polynomial Pi such that multiplying
+ // a CRC by Pi mod P, where P is the CRC polynomial, is equivalent to
+ // appending a*2**(ZEROES_BASE_LG*b) zero bytes to the original string.
+ static int FillZeroesTable(uint32_t poly, Uint32By256* t);
+
+ virtual void InitTables() = 0;
+
+ private:
+ CRCImpl(const CRCImpl&) = delete;
+ CRCImpl& operator=(const CRCImpl&) = delete;
+};
+
+// This is the 32-bit implementation. It handles all sizes from 8 to 32.
+class CRC32 : public CRCImpl {
+ public:
+ CRC32() {}
+ ~CRC32() override {}
+
+ void Extend(uint32_t* crc, const void* bytes, size_t length) const override;
+ void ExtendByZeroes(uint32_t* crc, size_t length) const override;
+ void Scramble(uint32_t* crc) const override;
+ void Unscramble(uint32_t* crc) const override;
+ void UnextendByZeroes(uint32_t* crc, size_t length) const override;
+
+ void InitTables() override;
+
+ private:
+ // Common implementation guts for ExtendByZeroes and UnextendByZeroes().
+ //
+ // zeroes_table is a table as returned by FillZeroesTable(), containing
+  // polynomials representing CRCs of strings-of-zeros of various lengths,
+ // and which can be combined by polynomial multiplication. poly_table is
+ // a table of CRC byte extension values. These tables are determined by
+ // the generator polynomial.
+ //
+ // These will be set to reverse_zeroes_ and reverse_table0_ for Unextend, and
+ // CRC32::zeroes_ and CRC32::table0_ for Extend.
+ void ExtendByZeroesImpl(uint32_t* crc, size_t length,
+ const uint32_t zeroes_table[256],
+ const uint32_t poly_table[256]) const;
+
+ uint32_t table0_[256]; // table of byte extensions
+ uint32_t zeroes_[256]; // table of zero extensions
+
+ // table of 4-byte extensions shifted by 12 bytes of zeroes
+ uint32_t table_[4][256];
+
+ // Reverse lookup tables, using the alternate polynomial used by
+ // UnextendByZeroes().
+ uint32_t reverse_table0_[256]; // table of reverse byte extensions
+ uint32_t reverse_zeroes_[256]; // table of reverse zero extensions
+
+ CRC32(const CRC32&) = delete;
+ CRC32& operator=(const CRC32&) = delete;
+};
+
+// Helpers
+
+// Return a bit mask containing len 1-bits.
+// Requires 0 < len <= 8 * sizeof(T) (the bit width of T).
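+// e.g. MaskOfLength<uint32_t>(3) == 0b111, and MaskOfLength<uint32_t>(32)
+// yields all 32 bits set without performing an undefined full-width shift.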
+template <typename T>
+T MaskOfLength(int len) {
+ // shift 2 by len-1 rather than 1 by len because shifts of wordsize
+ // are undefined.
+ return (T(2) << (len - 1)) - 1;
+}
+
+// Rotate low-order "width" bits of "in" right by "r" bits,
+// setting other bits in word to arbitrary values.
+template <typename T>
+T RotateRight(T in, int width, int r) {
+ return (in << (width - r)) | ((in >> r) & MaskOfLength<T>(width - r));
+}
+
+// RoundUp<N>(p) returns the lowest address >= p aligned to an N-byte
+// boundary. Requires that N is a power of 2.
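+// e.g. RoundUp<16> maps an address ending in 0x03 up to the next multiple of
+// 16, and leaves already-aligned addresses unchanged.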
+template <int alignment>
+const uint8_t* RoundUp(const uint8_t* p) {
+ static_assert((alignment & (alignment - 1)) == 0, "alignment is not 2^n");
+ constexpr uintptr_t mask = alignment - 1;
+ const uintptr_t as_uintptr = reinterpret_cast<uintptr_t>(p);
+ return reinterpret_cast<const uint8_t*>((as_uintptr + mask) & ~mask);
+}
+
+// Return a newly created CRC32AcceleratedX86ARMCombined if we can use Intel's
+// or ARM's CRC acceleration for a given polynomial. Return nullptr otherwise.
+CRCImpl* TryNewCRC32AcceleratedX86ARMCombined();
+
+// Return all possible hardware accelerated implementations. For testing only.
+std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll();
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_CRC_INTERNAL_CRC_INTERNAL_H_
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy.h
new file mode 100644
index 0000000000..520a49c879
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy.h
@@ -0,0 +1,119 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_CRC_MEMCPY_H_
+#define Y_ABSL_CRC_INTERNAL_CRC_MEMCPY_H_
+
+#include <cstddef>
+#include <memory>
+
+#include "y_absl/base/config.h"
+#include "y_absl/crc/crc32c.h"
+
+// Defined if the class AcceleratedCrcMemcpyEngine exists.
+#if defined(__x86_64__) && defined(__SSE4_2__)
+#define Y_ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
+#elif defined(_MSC_VER) && defined(__AVX__)
+#define Y_ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
+#endif
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+class CrcMemcpyEngine {
+ public:
+ virtual ~CrcMemcpyEngine() = default;
+
+ virtual crc32c_t Compute(void* __restrict dst, const void* __restrict src,
+ std::size_t length, crc32c_t initial_crc) const = 0;
+
+ protected:
+ CrcMemcpyEngine() = default;
+};
+
+class CrcMemcpy {
+ public:
+ static crc32c_t CrcAndCopy(void* __restrict dst, const void* __restrict src,
+ std::size_t length,
+ crc32c_t initial_crc = crc32c_t{0},
+ bool non_temporal = false) {
+ static const ArchSpecificEngines engines = GetArchSpecificEngines();
+ auto* engine = non_temporal ? engines.non_temporal : engines.temporal;
+ return engine->Compute(dst, src, length, initial_crc);
+ }
+
+ // For testing only: get an architecture-specific engine for tests.
+ static std::unique_ptr<CrcMemcpyEngine> GetTestEngine(int vector,
+ int integer);
+
+ private:
+ struct ArchSpecificEngines {
+ CrcMemcpyEngine* temporal;
+ CrcMemcpyEngine* non_temporal;
+ };
+
+ static ArchSpecificEngines GetArchSpecificEngines();
+};
+
+// Fallback CRC-memcpy engine.
+class FallbackCrcMemcpyEngine : public CrcMemcpyEngine {
+ public:
+ FallbackCrcMemcpyEngine() = default;
+ FallbackCrcMemcpyEngine(const FallbackCrcMemcpyEngine&) = delete;
+ FallbackCrcMemcpyEngine operator=(const FallbackCrcMemcpyEngine&) = delete;
+
+ crc32c_t Compute(void* __restrict dst, const void* __restrict src,
+ std::size_t length, crc32c_t initial_crc) const override;
+};
+
+// CRC Non-Temporal-Memcpy engine.
+class CrcNonTemporalMemcpyEngine : public CrcMemcpyEngine {
+ public:
+ CrcNonTemporalMemcpyEngine() = default;
+ CrcNonTemporalMemcpyEngine(const CrcNonTemporalMemcpyEngine&) = delete;
+ CrcNonTemporalMemcpyEngine operator=(const CrcNonTemporalMemcpyEngine&) =
+ delete;
+
+ crc32c_t Compute(void* __restrict dst, const void* __restrict src,
+ std::size_t length, crc32c_t initial_crc) const override;
+};
+
+// CRC Non-Temporal-Memcpy AVX engine.
+class CrcNonTemporalMemcpyAVXEngine : public CrcMemcpyEngine {
+ public:
+ CrcNonTemporalMemcpyAVXEngine() = default;
+ CrcNonTemporalMemcpyAVXEngine(const CrcNonTemporalMemcpyAVXEngine&) = delete;
+ CrcNonTemporalMemcpyAVXEngine operator=(
+ const CrcNonTemporalMemcpyAVXEngine&) = delete;
+
+ crc32c_t Compute(void* __restrict dst, const void* __restrict src,
+ std::size_t length, crc32c_t initial_crc) const override;
+};
+
+// Copy source to destination and return the CRC32C of the data copied. If an
+// accelerated version is available, use it; otherwise use the generic
+// fallback version.
+inline crc32c_t Crc32CAndCopy(void* __restrict dst, const void* __restrict src,
+ std::size_t length,
+ crc32c_t initial_crc = crc32c_t{0},
+ bool non_temporal = false) {
+ return CrcMemcpy::CrcAndCopy(dst, src, length, initial_crc, non_temporal);
+}
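+
+// A usage sketch for illustration (hypothetical buffers, not part of the
+// API): copy `src` into `dst` and obtain the CRC32C of the copied bytes in a
+// single pass.
+//
+//   char src[1024] = {};
+//   char dst[1024];
+//   crc32c_t crc = Crc32CAndCopy(dst, src, sizeof(src));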
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_CRC_INTERNAL_CRC_MEMCPY_H_
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy_fallback.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy_fallback.cc
new file mode 100644
index 0000000000..58d921d316
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy_fallback.cc
@@ -0,0 +1,75 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <memory>
+
+#include "y_absl/base/config.h"
+#include "y_absl/crc/crc32c.h"
+#include "y_absl/crc/internal/crc_memcpy.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+y_absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
+ const void* __restrict src,
+ std::size_t length,
+ crc32c_t initial_crc) const {
+ constexpr size_t kBlockSize = 8192;
+ y_absl::crc32c_t crc = initial_crc;
+
+ const char* src_bytes = reinterpret_cast<const char*>(src);
+ char* dst_bytes = reinterpret_cast<char*>(dst);
+
+ // Copy + CRC loop - run 8k chunks until we are out of full chunks. CRC
+ // then copy was found to be slightly more efficient in our test cases.
+ std::size_t offset = 0;
+ for (; offset + kBlockSize < length; offset += kBlockSize) {
+ crc = y_absl::ExtendCrc32c(crc,
+ y_absl::string_view(src_bytes + offset, kBlockSize));
+ memcpy(dst_bytes + offset, src_bytes + offset, kBlockSize);
+ }
+
+ // Save some work if length is 0.
+ if (offset < length) {
+ std::size_t final_copy_size = length - offset;
+ crc = y_absl::ExtendCrc32c(
+ crc, y_absl::string_view(src_bytes + offset, final_copy_size));
+ memcpy(dst_bytes + offset, src_bytes + offset, final_copy_size);
+ }
+
+ return crc;
+}
+
+// Compile the following only if we don't have an accelerated x86-64 CRC
+// memcpy engine available.
+#ifndef Y_ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+
+CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
+ CrcMemcpy::ArchSpecificEngines engines;
+ engines.temporal = new FallbackCrcMemcpyEngine();
+ engines.non_temporal = new FallbackCrcMemcpyEngine();
+ return engines;
+}
+
+std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
+ int /*integer*/) {
+ return std::make_unique<FallbackCrcMemcpyEngine>();
+}
+
+#endif // Y_ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy_x86_64.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy_x86_64.cc
new file mode 100644
index 0000000000..9cef4b5454
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_memcpy_x86_64.cc
@@ -0,0 +1,434 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
+// XMM registers do not support the CRC instruction (yet). While copying,
+// compute the running CRC of the data being copied.
+//
+// It is assumed that any CPU running this code has SSE4.2 instructions
+// available (for CRC32C). This file will do nothing if that is not true.
+//
+// The CRC instruction has a 3-cycle latency, and we are stressing the ALU ports
+// here (unlike a traditional memcopy, which has almost no ALU use), so we will
+// need to copy in such a way that the CRC unit is used efficiently. We have two
+// regimes in this code:
+// 1. For operations of size < kCrcSmallSize, do the CRC then the memcpy
+// 2. For operations of size > kCrcSmallSize:
+// a) compute an initial CRC + copy on a small amount of data to align the
+// destination pointer on a 16-byte boundary.
+// b) Split the data into 3 main regions and a tail (smaller than 48 bytes)
+// c) Do the copy and CRC of the 3 main regions, interleaving (start with
+// full cache line copies for each region, then move to single 16 byte
+// pieces per region).
+// d) Combine the CRCs with CRC32C::Concat.
+// e) Copy the tail and extend the CRC with the CRC of the tail.
+// This method is not ideal for op sizes between ~1k and ~8k because
+// CRC32C::Concat takes a significant amount of time. A medium-sized approach
+// could be added
+// using 3 CRCs over fixed-size blocks where the zero-extensions required for
+// CRC32C::Concat can be precomputed.
+
+#ifdef __SSE4_2__
+#include <immintrin.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "y_absl/base/dynamic_annotations.h"
+#include "y_absl/base/internal/prefetch.h"
+#include "y_absl/base/optimization.h"
+#include "y_absl/crc/crc32c.h"
+#include "y_absl/crc/internal/cpu_detect.h"
+#include "y_absl/crc/internal/crc_memcpy.h"
+#include "y_absl/strings/string_view.h"
+
+#ifdef Y_ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+namespace {
+
+inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
+ crc32c_t crc) {
+  // Small copy: just go 1 byte at a time; being nice to the branch predictor
+  // is more important here than anything else.
+ uint32_t crc_uint32 = static_cast<uint32_t>(crc);
+ for (std::size_t i = 0; i < length; i++) {
+ uint8_t data = *reinterpret_cast<const uint8_t*>(src);
+ crc_uint32 = _mm_crc32_u8(crc_uint32, data);
+ *reinterpret_cast<uint8_t*>(dst) = data;
+ ++src;
+ ++dst;
+ }
+ return crc32c_t{crc_uint32};
+}
+
+constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
+
+// Common function for copying the tails of multiple large regions.
+template <size_t vec_regions, size_t int_regions>
+inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
+ size_t region_size, size_t copy_rounds) {
+ std::array<__m128i, vec_regions> data;
+ std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
+
+ while (copy_rounds > 0) {
+ for (size_t i = 0; i < vec_regions; i++) {
+ size_t region = i;
+
+ auto* vsrc =
+ reinterpret_cast<const __m128i*>(*src + region_size * region);
+ auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
+
+ // Load the blocks, unaligned
+ data[i] = _mm_loadu_si128(vsrc);
+
+ // Store the blocks, aligned
+ _mm_store_si128(vdst, data[i]);
+
+ // Compute the running CRC
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
+ }
+
+ for (size_t i = 0; i < int_regions; i++) {
+ size_t region = vec_regions + i;
+
+ auto* usrc =
+ reinterpret_cast<const uint64_t*>(*src + region_size * region);
+ auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
+
+ for (size_t j = 0; j < kIntLoadsPerVec; j++) {
+ size_t data_index = i * kIntLoadsPerVec + j;
+
+ int_data[data_index] = *(usrc + j);
+ crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
+
+ *(udst + j) = int_data[data_index];
+ }
+ }
+
+ // Increment pointers
+ *src += sizeof(__m128i);
+ *dst += sizeof(__m128i);
+ --copy_rounds;
+ }
+}
+
+} // namespace
+
+template <size_t vec_regions, size_t int_regions>
+class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
+ public:
+ AcceleratedCrcMemcpyEngine() = default;
+ AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
+ AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) =
+ delete;
+
+ crc32c_t Compute(void* __restrict dst, const void* __restrict src,
+ std::size_t length, crc32c_t initial_crc) const override;
+};
+
+template <size_t vec_regions, size_t int_regions>
+crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
+ void* __restrict dst, const void* __restrict src, std::size_t length,
+ crc32c_t initial_crc) const {
+ constexpr std::size_t kRegions = vec_regions + int_regions;
+ constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
+ constexpr std::size_t kBlockSize = sizeof(__m128i);
+ constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
+
+ // Number of blocks per cacheline.
+ constexpr std::size_t kBlocksPerCacheLine = Y_ABSL_CACHELINE_SIZE / kBlockSize;
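+  // For example, with a 64-byte cache line and 16-byte __m128i blocks,
+  // kBlocksPerCacheLine == 4, so the main loop below advances each region by
+  // one full cache line per iteration.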
+
+ char* dst_bytes = static_cast<char*>(dst);
+ const char* src_bytes = static_cast<const char*>(src);
+
+ // Make sure that one prefetch per big block is enough to cover the whole
+ // dataset, and we don't prefetch too much.
+ static_assert(Y_ABSL_CACHELINE_SIZE % kBlockSize == 0,
+ "Cache lines are not divided evenly into blocks, may have "
+ "unintended behavior!");
+
+ // Experimentally-determined boundary between a small and large copy.
+ // Below this number, spin-up and concatenation of CRCs takes enough time that
+ // it kills the throughput gains of using 3 regions and wide vectors.
+ constexpr size_t kCrcSmallSize = 256;
+
+ // Experimentally-determined prefetch distance. Main loop copies will
+  // prefetch data 2 cache lines ahead.
+ constexpr std::size_t kPrefetchAhead = 2 * Y_ABSL_CACHELINE_SIZE;
+
+  // Small-size CRC-memcpy: just do CRC + memcpy.
+ if (length < kCrcSmallSize) {
+ crc32c_t crc =
+ ExtendCrc32c(initial_crc, y_absl::string_view(src_bytes, length));
+ memcpy(dst, src, length);
+ return crc;
+ }
+
+ // Start work on the CRC: undo the XOR from the previous calculation or set up
+ // the initial value of the CRC.
+ // initial_crc ^= kCrcDataXor;
+ initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};
+
+ // Do an initial alignment copy, so we can use aligned store instructions to
+ // the destination pointer. We align the destination pointer because the
+ // penalty for an unaligned load is small compared to the penalty of an
+ // unaligned store on modern CPUs.
+ std::size_t bytes_from_last_aligned =
+ reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
+ if (bytes_from_last_aligned != 0) {
+ std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;
+
+ // Do the short-sized copy and CRC.
+ initial_crc =
+ ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
+ src_bytes += bytes_for_alignment;
+ dst_bytes += bytes_for_alignment;
+ length -= bytes_for_alignment;
+ }
+
+ // We are going to do the copy and CRC in kRegions regions to make sure that
+ // we can saturate the CRC unit. The CRCs will be combined at the end of the
+ // run. Copying will use the SSE registers, and we will extract words from
+ // the SSE registers to add to the CRC. Initially, we run the loop one full
+ // cache line per region at a time, in order to insert prefetches.
+
+ // Initialize CRCs for kRegions regions.
+ crc32c_t crcs[kRegions];
+ crcs[0] = initial_crc;
+ for (size_t i = 1; i < kRegions; i++) {
+ crcs[i] = crc32c_t{kCrcDataXor};
+ }
+
+ // Find the number of rounds to copy and the region size. Also compute the
+ // tail size here.
+ size_t copy_rounds = length / kCopyRoundSize;
+
+ // Find the size of each region and the size of the tail.
+ const std::size_t region_size = copy_rounds * kBlockSize;
+ const std::size_t tail_size = length - (kRegions * region_size);
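+  // Worked example (hypothetical numbers): for length == 1000 with
+  // kRegions == 3 and kBlockSize == 16, kCopyRoundSize == 48, so
+  // copy_rounds == 20, region_size == 320, and tail_size == 1000 - 960 == 40.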
+
+ // Holding registers for data in each region.
+ std::array<__m128i, vec_regions> vec_data;
+ std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
+
+ // Main loop.
+ while (copy_rounds > kBlocksPerCacheLine) {
+ // Prefetch kPrefetchAhead bytes ahead of each pointer.
+ for (size_t i = 0; i < kRegions; i++) {
+ y_absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
+ region_size * i);
+ y_absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
+ region_size * i);
+ }
+
+ // Load and store data, computing CRC on the way.
+ for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
+ // Copy and CRC the data for the CRC regions.
+ for (size_t j = 0; j < vec_regions; j++) {
+ // Cycle which regions get vector load/store and integer load/store, to
+ // engage prefetching logic around vector load/stores and save issue
+ // slots by using the integer registers.
+ size_t region = (j + i) % kRegions;
+
+ auto* vsrc =
+ reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
+ auto* vdst =
+ reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
+
+ // Load and CRC data.
+ vec_data[j] = _mm_loadu_si128(vsrc + i);
+ crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
+
+ // Store the data.
+ _mm_store_si128(vdst + i, vec_data[j]);
+ }
+
+      // Copy and CRC the data for the integer regions.
+ for (size_t j = 0; j < int_regions; j++) {
+ // Cycle which regions get vector load/store and integer load/store, to
+ // engage prefetching logic around vector load/stores and save issue
+ // slots by using the integer registers.
+ size_t region = (j + vec_regions + i) % kRegions;
+
+ auto* usrc =
+ reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
+ auto* udst =
+ reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
+
+ for (size_t k = 0; k < kIntLoadsPerVec; k++) {
+ size_t data_index = j * kIntLoadsPerVec + k;
+
+ // Load and CRC the data.
+ int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
+ crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
+
+ // Store the data.
+ *(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
+ }
+ }
+ }
+
+ // Increment pointers
+ src_bytes += kBlockSize * kBlocksPerCacheLine;
+ dst_bytes += kBlockSize * kBlocksPerCacheLine;
+ copy_rounds -= kBlocksPerCacheLine;
+ }
+
+ // Copy and CRC the tails of each region.
+ LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
+ region_size, copy_rounds);
+
+ // Move the source and destination pointers to the end of the region
+ src_bytes += region_size * (kRegions - 1);
+ dst_bytes += region_size * (kRegions - 1);
+
+ // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
+ // XOR done before doing block copy + CRCs.
+ for (size_t i = 0; i + 1 < kRegions; i++) {
+ crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
+ }
+
+ // Build a CRC of the first kRegions - 1 regions.
+ crc32c_t full_crc = crcs[0];
+ for (size_t i = 1; i + 1 < kRegions; i++) {
+ full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
+ }
+
+ // Copy and CRC the tail through the XMM registers.
+ std::size_t tail_blocks = tail_size / kBlockSize;
+ LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
+ tail_blocks);
+
+ // Final tail copy for under 16 bytes.
+ crcs[kRegions - 1] =
+ ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
+ crcs[kRegions - 1]);
+
+ // Finalize and concatenate the final CRC, then return.
+ crcs[kRegions - 1] =
+ crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
+ return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
+}
+
+CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
+#ifdef UNDEFINED_BEHAVIOR_SANITIZER
+ // UBSAN does not play nicely with unaligned loads (which we use a lot).
+ // Get the underlying architecture.
+ CpuType cpu_type = GetCpuType();
+ switch (cpu_type) {
+ case CpuType::kUnknown:
+ case CpuType::kAmdRome:
+ case CpuType::kAmdNaples:
+ case CpuType::kIntelCascadelakeXeon:
+ case CpuType::kIntelSkylakeXeon:
+ case CpuType::kIntelSkylake:
+ case CpuType::kIntelBroadwell:
+ case CpuType::kIntelHaswell:
+ case CpuType::kIntelIvybridge:
+ return {
+ .temporal = new FallbackCrcMemcpyEngine(),
+ .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
+ };
+ // INTEL_SANDYBRIDGE performs better with SSE than AVX.
+ case CpuType::kIntelSandybridge:
+ return {
+ .temporal = new FallbackCrcMemcpyEngine(),
+ .non_temporal = new CrcNonTemporalMemcpyEngine(),
+ };
+ default:
+ return {.temporal = new FallbackCrcMemcpyEngine(),
+ .non_temporal = new FallbackCrcMemcpyEngine()};
+ }
+#else
+ // Get the underlying architecture.
+ CpuType cpu_type = GetCpuType();
+ switch (cpu_type) {
+    // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store
+    // port, which causes data movement from the vector registers to the
+    // integer registers (where CRC32C happens) to crowd the same units as
+    // vector stores. As a result, using that path exclusively causes
+    // bottlenecking on this port.
+ // We can avoid this bottleneck by using the integer side of the CPU for
+ // most operations rather than the vector side. We keep a vector region to
+ // engage some of the prefetching logic in the cache hierarchy which seems
+ // to give vector instructions special treatment. These prefetch units see
+ // strided access to each region, and do the right thing.
+ case CpuType::kAmdRome:
+ case CpuType::kAmdNaples:
+ return {
+ .temporal = new AcceleratedCrcMemcpyEngine<1, 2>(),
+ .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
+ };
+ // PCLMULQDQ is slow and we don't have wide enough issue width to take
+ // advantage of it. For an unknown architecture, don't risk using CLMULs.
+ case CpuType::kIntelCascadelakeXeon:
+ case CpuType::kIntelSkylakeXeon:
+ case CpuType::kIntelSkylake:
+ case CpuType::kIntelBroadwell:
+ case CpuType::kIntelHaswell:
+ case CpuType::kIntelIvybridge:
+ return {
+ .temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
+ .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
+ };
+ // INTEL_SANDYBRIDGE performs better with SSE than AVX.
+ case CpuType::kIntelSandybridge:
+ return {
+ .temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
+ .non_temporal = new CrcNonTemporalMemcpyEngine(),
+ };
+ default:
+ return {.temporal = new FallbackCrcMemcpyEngine(),
+ .non_temporal = new FallbackCrcMemcpyEngine()};
+ }
+#endif // UNDEFINED_BEHAVIOR_SANITIZER
+}
+
+// For testing, allow the user to specify which engine they want.
+std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
+ int integer) {
+ if (vector == 3 && integer == 0) {
+ return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
+ } else if (vector == 1 && integer == 2) {
+ return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
+ }
+ return nullptr;
+}
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_non_temporal_memcpy.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_non_temporal_memcpy.cc
new file mode 100644
index 0000000000..e73e6487cf
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_non_temporal_memcpy.cc
@@ -0,0 +1,93 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+
+#include "y_absl/base/config.h"
+#include "y_absl/crc/crc32c.h"
+#include "y_absl/crc/internal/crc_memcpy.h"
+#include "y_absl/crc/internal/non_temporal_memcpy.h"
+#include "y_absl/strings/string_view.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+crc32c_t CrcNonTemporalMemcpyEngine::Compute(void* __restrict dst,
+ const void* __restrict src,
+ std::size_t length,
+ crc32c_t initial_crc) const {
+ constexpr size_t kBlockSize = 8192;
+ crc32c_t crc = initial_crc;
+
+ const char* src_bytes = reinterpret_cast<const char*>(src);
+ char* dst_bytes = reinterpret_cast<char*>(dst);
+
+ // Copy + CRC loop - run 8k chunks until we are out of full chunks.
+ std::size_t offset = 0;
+ for (; offset + kBlockSize < length; offset += kBlockSize) {
+ crc = y_absl::ExtendCrc32c(crc,
+ y_absl::string_view(src_bytes + offset, kBlockSize));
+ non_temporal_store_memcpy(dst_bytes + offset, src_bytes + offset,
+ kBlockSize);
+ }
+
+ // Save some work if length is 0.
+ if (offset < length) {
+ std::size_t final_copy_size = length - offset;
+ crc = ExtendCrc32c(crc,
+ y_absl::string_view(src_bytes + offset, final_copy_size));
+
+ non_temporal_store_memcpy(dst_bytes + offset, src_bytes + offset,
+ final_copy_size);
+ }
+
+ return crc;
+}
+
+crc32c_t CrcNonTemporalMemcpyAVXEngine::Compute(void* __restrict dst,
+ const void* __restrict src,
+ std::size_t length,
+ crc32c_t initial_crc) const {
+ constexpr size_t kBlockSize = 8192;
+ crc32c_t crc = initial_crc;
+
+ const char* src_bytes = reinterpret_cast<const char*>(src);
+ char* dst_bytes = reinterpret_cast<char*>(dst);
+
+ // Copy + CRC loop - run 8k chunks until we are out of full chunks.
+ std::size_t offset = 0;
+ for (; offset + kBlockSize < length; offset += kBlockSize) {
+ crc = ExtendCrc32c(crc, y_absl::string_view(src_bytes + offset, kBlockSize));
+
+ non_temporal_store_memcpy_avx(dst_bytes + offset, src_bytes + offset,
+ kBlockSize);
+ }
+
+ // Save some work if length is 0.
+ if (offset < length) {
+ std::size_t final_copy_size = length - offset;
+ crc = ExtendCrc32c(crc,
+ y_absl::string_view(src_bytes + offset, final_copy_size));
+
+ non_temporal_store_memcpy_avx(dst_bytes + offset, src_bytes + offset,
+ final_copy_size);
+ }
+
+ return crc;
+}
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_x86_arm_combined.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_x86_arm_combined.cc
new file mode 100644
index 0000000000..96e33d8f56
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/crc_x86_arm_combined.cc
@@ -0,0 +1,725 @@
+// Copyright 2022 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Hardware accelerated CRC32 computation on Intel and ARM architecture.
+
+#include <cstddef>
+#include <cstdint>
+
+#include "y_absl/base/attributes.h"
+#include "y_absl/base/config.h"
+#include "y_absl/base/dynamic_annotations.h"
+#include "y_absl/base/internal/endian.h"
+#include "y_absl/base/internal/prefetch.h"
+#include "y_absl/crc/internal/cpu_detect.h"
+#include "y_absl/crc/internal/crc.h"
+#include "y_absl/crc/internal/crc32_x86_arm_combined_simd.h"
+#include "y_absl/crc/internal/crc_internal.h"
+#include "y_absl/memory/memory.h"
+#include "y_absl/numeric/bits.h"
+
+#if defined(Y_ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
+ defined(Y_ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
+#define Y_ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
+#endif
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+#if defined(Y_ABSL_INTERNAL_CAN_USE_SIMD_CRC32C)
+
+// Implementation details not exported outside of file
+namespace {
+
+// Some machines have CRC acceleration hardware.
+// We can do a faster version of Extend() on such machines.
+class CRC32AcceleratedX86ARMCombined : public CRC32 {
+ public:
+ CRC32AcceleratedX86ARMCombined() {}
+ ~CRC32AcceleratedX86ARMCombined() override {}
+ void ExtendByZeroes(uint32_t* crc, size_t length) const override;
+ uint32_t ComputeZeroConstant(size_t length) const;
+
+ private:
+ CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
+ delete;
+ CRC32AcceleratedX86ARMCombined& operator=(
+ const CRC32AcceleratedX86ARMCombined&) = delete;
+};
+
+// Constants for switching between algorithms.
+// Chosen by comparing speed at different powers of 2.
+constexpr size_t kSmallCutoff = 256;
+constexpr size_t kMediumCutoff = 2048;
+
+#define Y_ABSL_INTERNAL_STEP1(crc) \
+ do { \
+ crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
+ } while (0)
+#define Y_ABSL_INTERNAL_STEP2(crc) \
+ do { \
+ crc = \
+ CRC32_u16(static_cast<uint32_t>(crc), y_absl::little_endian::Load16(p)); \
+ p += 2; \
+ } while (0)
+#define Y_ABSL_INTERNAL_STEP4(crc) \
+ do { \
+ crc = \
+ CRC32_u32(static_cast<uint32_t>(crc), y_absl::little_endian::Load32(p)); \
+ p += 4; \
+ } while (0)
+#define Y_ABSL_INTERNAL_STEP8(crc, data) \
+ do { \
+ crc = CRC32_u64(static_cast<uint32_t>(crc), \
+ y_absl::little_endian::Load64(data)); \
+ data += 8; \
+ } while (0)
+#define Y_ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
+ do { \
+ Y_ABSL_INTERNAL_STEP8(crc0, p0); \
+ Y_ABSL_INTERNAL_STEP8(crc1, p1); \
+ } while (0)
+#define Y_ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
+ do { \
+ Y_ABSL_INTERNAL_STEP8(crc0, p0); \
+ Y_ABSL_INTERNAL_STEP8(crc1, p1); \
+ Y_ABSL_INTERNAL_STEP8(crc2, p2); \
+ } while (0)
+
+namespace {
+
+uint32_t multiply(uint32_t a, uint32_t b) {
+ V128 shifts = V128_From2x64(0, 1);
+ V128 power = V128_From2x64(0, a);
+ V128 crc = V128_From2x64(0, b);
+ V128 res = V128_PMulLow(power, crc);
+
+ // Combine crc values
+ res = V128_ShiftLeft64(res, shifts);
+ return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
+ CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
+}
+
+// Powers of the crc32c polynomial, for faster ExtendByZeroes.
+// Verified against folly:
+// folly/hash/detail/Crc32CombineDetail.cpp
+constexpr uint32_t kCRC32CPowers[] = {
+ 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
+ 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
+ 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
+ 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
+ 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
+ 0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
+ 0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
+ 0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
+ 0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
+ 0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
+ 0x00800000, 0x00008000,
+};
+
+} // namespace
+
+// Compute a magic constant, so that multiplying by it is the same as
+// extending crc by length zeros.
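+// e.g. for length == 20 bytes, length >> 2 == 5 == 0b101, so the constant is
+// multiply(kCRC32CPowers[0], kCRC32CPowers[2]), one factor per set bit.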
+uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
+ size_t length) const {
+  // The lowest 2 bits are handled separately in ExtendByZeroes.
+ length >>= 2;
+
+ int index = y_absl::countr_zero(length);
+ uint32_t prev = kCRC32CPowers[index];
+ length &= length - 1;
+
+ while (length) {
+ // For each bit of length, extend by 2**n zeros.
+ index = y_absl::countr_zero(length);
+ prev = multiply(prev, kCRC32CPowers[index]);
+ length &= length - 1;
+ }
+ return prev;
+}
+
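+// e.g. ExtendByZeroes(&crc, 7) handles 3 bytes directly below (one CRC32_u8
+// and one CRC32_u16 step) and the remaining 4 bytes via the multiply.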
+void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
+ size_t length) const {
+ uint32_t val = *crc;
+ // Don't bother with multiplication for small length.
+ switch (length & 3) {
+ case 0:
+ break;
+ case 1:
+ val = CRC32_u8(val, 0);
+ break;
+ case 2:
+ val = CRC32_u16(val, 0);
+ break;
+ case 3:
+ val = CRC32_u8(val, 0);
+ val = CRC32_u16(val, 0);
+ break;
+ }
+ if (length > 3) {
+ val = multiply(val, ComputeZeroConstant(length));
+ }
+ *crc = val;
+}
+
+// Taken from Intel paper "Fast CRC Computation for iSCSI Polynomial Using CRC32
+// Instruction"
+// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+// We only need every 4th value, because we unroll the loop by 4.
+constexpr uint64_t kClmulConstants[] = {
+ 0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
+ 0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
+ 0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
+ 0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
+ 0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
+ 0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
+ 0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
+ 0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
+ 0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
+ 0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
+ 0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
+ 0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
+ 0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
+};
+
+enum class CutoffStrategy {
+ // Use 3 CRC streams to fold into 1.
+ Fold3,
+ // Unroll CRC instructions for 64 bytes.
+ Unroll64CRC,
+};
+
+// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
+// methods and data that don't need the template arguments.
+class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
+ : public CRC32AcceleratedX86ARMCombined {
+ protected:
+ // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
+ // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
+ // high latency, so we run 4 128-bit partial checksums that can be reduced to
+ // a single value by FinalizePclmulStream later. Computing crc for arbitrary
+  // polynomials with PCLMULQDQ is described in the Intel paper "Fast CRC
+ // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ // We are applying it to CRC32C polynomial.
+ Y_ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
+ const uint8_t* p, V128* partialCRC) const {
+ V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
+
+ V128 partialCRC1 = partialCRC[0];
+ V128 partialCRC2 = partialCRC[1];
+ V128 partialCRC3 = partialCRC[2];
+ V128 partialCRC4 = partialCRC[3];
+
+ V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
+ V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
+ V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
+ V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
+ V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
+ V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
+ V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
+ V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
+ partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
+ partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
+ partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
+ partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
+ partialCRC1 = V128_Xor(tmp1, partialCRC1);
+ partialCRC2 = V128_Xor(tmp2, partialCRC2);
+ partialCRC3 = V128_Xor(tmp3, partialCRC3);
+ partialCRC4 = V128_Xor(tmp4, partialCRC4);
+ partialCRC1 = V128_Xor(partialCRC1, data1);
+ partialCRC2 = V128_Xor(partialCRC2, data2);
+ partialCRC3 = V128_Xor(partialCRC3, data3);
+ partialCRC4 = V128_Xor(partialCRC4, data4);
+ partialCRC[0] = partialCRC1;
+ partialCRC[1] = partialCRC2;
+ partialCRC[2] = partialCRC3;
+ partialCRC[3] = partialCRC4;
+ }
+
+  // Reduce the partialCRC produced by Process64BytesPclmul into a single
+  // value that represents the crc checksum of all the processed bytes.
+ Y_ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+ FinalizePclmulStream(V128* partialCRC) const {
+ V128 partialCRC1 = partialCRC[0];
+ V128 partialCRC2 = partialCRC[1];
+ V128 partialCRC3 = partialCRC[2];
+ V128 partialCRC4 = partialCRC[3];
+
+ // Combine 4 vectors of partial crc into a single vector.
+ V128 reductionMultiplicands =
+ V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+ V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+ V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+
+ partialCRC1 = V128_Xor(low, high);
+ partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
+
+ low = V128_PMulLow(reductionMultiplicands, partialCRC3);
+ high = V128_PMulHi(reductionMultiplicands, partialCRC3);
+
+ partialCRC3 = V128_Xor(low, high);
+ partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
+
+ low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+ high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+ V128 fullCRC = V128_Xor(low, high);
+ fullCRC = V128_Xor(fullCRC, partialCRC3);
+
+ // Reduce fullCRC into scalar value.
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+ V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
+
+ V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
+ fullCRC = V128_ShiftRight<8>(fullCRC);
+ fullCRC = V128_Xor(fullCRC, tmp);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
+
+ tmp = V128_ShiftRight<4>(fullCRC);
+ fullCRC = V128_And(fullCRC, mask);
+ fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
+ fullCRC = V128_Xor(tmp, fullCRC);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
+
+ tmp = V128_And(fullCRC, mask);
+ tmp = V128_PMul01(reductionMultiplicands, tmp);
+ tmp = V128_And(tmp, mask);
+ tmp = V128_PMulLow(reductionMultiplicands, tmp);
+
+ fullCRC = V128_Xor(tmp, fullCRC);
+
+ return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
+ }
+
+ // Update crc with 64 bytes of data from p.
+ Y_ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
+ uint64_t crc) const {
+ for (int i = 0; i < 8; i++) {
+ crc =
+ CRC32_u64(static_cast<uint32_t>(crc), y_absl::little_endian::Load64(p));
+ p += 8;
+ }
+ return crc;
+ }
+
+ // Generated by crc32c_x86_test --crc32c_generate_constants=true
+ // and verified against constants in linux kernel for S390:
+ // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
+ alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
+ alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
+ alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
+ alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
+ alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
+ alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
+
+  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
+  // same size. Each group is CRCed in parallel, then the groups are combined
+  // at the end of the block.
+ static constexpr size_t kGroupsSmall = 3;
+ // For large runs we use up to kMaxStreams blocks computed with CRC
+ // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
+ // are combined in the end.
+ static constexpr size_t kMaxStreams = 3;
+};
+
+#ifdef Y_ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
+alignas(16) constexpr uint32_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
+constexpr size_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
+constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
+#endif // Y_ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+
+template <size_t num_crc_streams, size_t num_pclmul_streams,
+ CutoffStrategy strategy>
+class CRC32AcceleratedX86ARMCombinedMultipleStreams
+ : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
+ Y_ABSL_ATTRIBUTE_HOT
+ void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
+ static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
+ "Invalid number of crc streams");
+ static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
+ "Invalid number of pclmul streams");
+ const uint8_t* p = static_cast<const uint8_t*>(bytes);
+ const uint8_t* e = p + length;
+ uint32_t l = *crc;
+ uint64_t l64;
+
+    // We have dedicated instructions for 1, 2, 4 and 8 bytes.
+ if (length & 8) {
+ Y_ABSL_INTERNAL_STEP8(l, p);
+ length &= ~size_t{8};
+ }
+ if (length & 4) {
+ Y_ABSL_INTERNAL_STEP4(l);
+ length &= ~size_t{4};
+ }
+ if (length & 2) {
+ Y_ABSL_INTERNAL_STEP2(l);
+ length &= ~size_t{2};
+ }
+ if (length & 1) {
+ Y_ABSL_INTERNAL_STEP1(l);
+ length &= ~size_t{1};
+ }
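+    // e.g. length == 13 == 0b1101 consumes 8 + 4 + 1 bytes in the steps
+    // above, leaving length == 0.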
+ if (length == 0) {
+ *crc = l;
+ return;
+ }
+    // length is now a multiple of 16.
+
+    // For small blocks just run a simple loop, because the cost of combining
+    // multiple streams is significant.
+ if (strategy != CutoffStrategy::Unroll64CRC) {
+ if (length < kSmallCutoff) {
+ while (length >= 16) {
+ Y_ABSL_INTERNAL_STEP8(l, p);
+ Y_ABSL_INTERNAL_STEP8(l, p);
+ length -= 16;
+ }
+ *crc = l;
+ return;
+ }
+ }
+
+    // For medium blocks we run 3 crc streams and combine them as described in
+    // the Intel paper above. Running a 4th stream doesn't help, because the
+    // crc instruction has a latency of 3 and a throughput of 1.
+ if (length < kMediumCutoff) {
+ l64 = l;
+ if (strategy == CutoffStrategy::Fold3) {
+ uint64_t l641 = 0;
+ uint64_t l642 = 0;
+ const size_t blockSize = 32;
+ size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
+ const uint8_t* p1 = p + bs * blockSize;
+ const uint8_t* p2 = p1 + bs * blockSize;
+
+ for (size_t i = 0; i + 1 < bs; ++i) {
+ Y_ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
+ Y_ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
+ Y_ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
+ Y_ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
+ base_internal::PrefetchT0(
+ reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
+ base_internal::PrefetchT0(
+ reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
+ base_internal::PrefetchT0(
+ reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
+ }
+        // Don't run crc on the last 8 bytes.
+ Y_ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
+ Y_ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
+ Y_ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
+ Y_ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);
+
+ V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);
+
+ V128 tmp = V128_From2x64(0, l64);
+
+ V128 res1 = V128_PMulLow(tmp, magic);
+
+ tmp = V128_From2x64(0, l641);
+
+ V128 res2 = V128_PMul10(tmp, magic);
+ V128 x = V128_Xor(res1, res2);
+ l64 = static_cast<uint64_t>(V128_Low64(x)) ^
+ y_absl::little_endian::Load64(p2);
+ l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);
+
+ p = p2 + 8;
+ } else if (strategy == CutoffStrategy::Unroll64CRC) {
+ while ((e - p) >= 64) {
+ l64 = Process64BytesCRC(p, l64);
+ p += 64;
+ }
+ }
+ } else {
+      // There is a lot of data, so we can ignore combine costs and run all
+      // requested streams (num_crc_streams + num_pclmul_streams), using
+      // prefetch. CRC and PCLMULQDQ use different CPU execution units, so on
+      // some CPUs it makes sense to execute both of them for different
+      // streams.
+
+ // Point x at first 8-byte aligned byte in string.
+ const uint8_t* x = RoundUp<8>(p);
+ // Process bytes until p is 8-byte aligned, if that isn't past the end.
+ while (p != x) {
+ Y_ABSL_INTERNAL_STEP1(l);
+ }
+
+ size_t bs = static_cast<size_t>(e - p) /
+ (num_crc_streams + num_pclmul_streams) / 64;
+ const uint8_t* crc_streams[kMaxStreams];
+ const uint8_t* pclmul_streams[kMaxStreams];
+ // We are guaranteed to have at least one crc stream.
+ crc_streams[0] = p;
+ for (size_t i = 1; i < num_crc_streams; i++) {
+ crc_streams[i] = crc_streams[i - 1] + bs * 64;
+ }
+ pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
+ for (size_t i = 1; i < num_pclmul_streams; i++) {
+ pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
+ }
+
+ // Per stream crc sums.
+ uint64_t l64_crc[kMaxStreams] = {l};
+ uint64_t l64_pclmul[kMaxStreams] = {0};
+
+      // Peel the first iteration, because the PCLMULQDQ stream needs setup.
+ for (size_t i = 0; i < num_crc_streams; i++) {
+ l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
+ crc_streams[i] += 16 * 4;
+ }
+
+ V128 partialCRC[kMaxStreams][4];
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
+ partialCRC[i][0] = V128_LoadU(
+ reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
+ partialCRC[i][1] = V128_LoadU(
+ reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
+ partialCRC[i][2] = V128_LoadU(
+ reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
+ partialCRC[i][3] = V128_LoadU(
+ reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
+ pclmul_streams[i] += 16 * 4;
+ }
+
+ for (size_t i = 1; i < bs; i++) {
+        // Prefetch data for the next iterations.
+ for (size_t j = 0; j < num_crc_streams; j++) {
+ base_internal::PrefetchT0(
+ reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
+ }
+ for (size_t j = 0; j < num_pclmul_streams; j++) {
+ base_internal::PrefetchT0(reinterpret_cast<const char*>(
+ pclmul_streams[j] + kPrefetchHorizon));
+ }
+
+ // We process each stream in 64 byte blocks. This can be written as
+ // for (int i = 0; i < num_pclmul_streams; i++) {
+ // Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
+ // pclmul_streams[i] += 16 * 4;
+ // }
+ // for (int i = 0; i < num_crc_streams; i++) {
+ // l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
+ // crc_streams[i] += 16*4;
+ // }
+ // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
+ // gives ~2% performance boost.
+ l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
+ crc_streams[0] += 16 * 4;
+ if (num_pclmul_streams > 0) {
+ Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
+ pclmul_streams[0] += 16 * 4;
+ }
+ if (num_crc_streams > 1) {
+ l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
+ crc_streams[1] += 16 * 4;
+ }
+ if (num_pclmul_streams > 1) {
+ Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
+ pclmul_streams[1] += 16 * 4;
+ }
+ if (num_crc_streams > 2) {
+ l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
+ crc_streams[2] += 16 * 4;
+ }
+ if (num_pclmul_streams > 2) {
+ Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
+ pclmul_streams[2] += 16 * 4;
+ }
+ }
+
+      // PCLMULQDQ-based streams require a special final step;
+      // CRC-based streams don't.
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
+ l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
+ }
+
+      // Combine all streams into a single result.
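+      // Each stream covered bs * 64 bytes, so each multiply by `magic` below
+      // extends the running result by bs * 64 zero bytes before XORing in the
+      // next stream's CRC.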
+ uint32_t magic = ComputeZeroConstant(bs * 64);
+ l64 = l64_crc[0];
+ for (size_t i = 1; i < num_crc_streams; i++) {
+ l64 = multiply(static_cast<uint32_t>(l64), magic);
+ l64 ^= l64_crc[i];
+ }
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
+ l64 = multiply(static_cast<uint32_t>(l64), magic);
+ l64 ^= l64_pclmul[i];
+ }
+
+ // Update p.
+ if (num_pclmul_streams > 0) {
+ p = pclmul_streams[num_pclmul_streams - 1];
+ } else {
+ p = crc_streams[num_crc_streams - 1];
+ }
+ }
+ l = static_cast<uint32_t>(l64);
+
+ while ((e - p) >= 16) {
+ Y_ABSL_INTERNAL_STEP8(l, p);
+ Y_ABSL_INTERNAL_STEP8(l, p);
+ }
+ // Process the last few bytes
+ while (p != e) {
+ Y_ABSL_INTERNAL_STEP1(l);
+ }
+
+#undef Y_ABSL_INTERNAL_STEP8BY3
+#undef Y_ABSL_INTERNAL_STEP8BY2
+#undef Y_ABSL_INTERNAL_STEP8
+#undef Y_ABSL_INTERNAL_STEP4
+#undef Y_ABSL_INTERNAL_STEP2
+#undef Y_ABSL_INTERNAL_STEP1
+
+ *crc = l;
+ }
+};
+
+} // namespace
+
+// Intel processors with SSE4.2 have an instruction for one particular
+// 32-bit CRC polynomial: crc32c.
+CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
+ CpuType type = GetCpuType();
+ switch (type) {
+ case CpuType::kIntelHaswell:
+ case CpuType::kAmdRome:
+ case CpuType::kAmdNaples:
+ case CpuType::kAmdMilan:
+ return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 1, CutoffStrategy::Fold3>();
+ // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
+ case CpuType::kIntelCascadelakeXeon:
+ case CpuType::kIntelSkylakeXeon:
+ case CpuType::kIntelBroadwell:
+ case CpuType::kIntelSkylake:
+ return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 2, CutoffStrategy::Fold3>();
+ // PCLMULQDQ is slow, don't use it.
+ case CpuType::kIntelIvybridge:
+ case CpuType::kIntelSandybridge:
+ case CpuType::kIntelWestmere:
+ return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 0, CutoffStrategy::Fold3>();
+ case CpuType::kArmNeoverseN1:
+ return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 1, CutoffStrategy::Unroll64CRC>();
+#if defined(__aarch64__)
+ default:
+ // Not all ARM processors support the needed instructions, so check here
+ // before trying to use an accelerated implementation.
+ if (SupportsArmCRC32PMULL()) {
+ return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 1, CutoffStrategy::Unroll64CRC>();
+ } else {
+ return nullptr;
+ }
+#else
+ default:
+ // Something else, play it safe and assume slow PCLMULQDQ.
+ return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 0, CutoffStrategy::Fold3>();
+#endif
+ }
+}
+
+std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
+ auto ret = std::vector<std::unique_ptr<CRCImpl>>();
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 0, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 1, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 2, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 3, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 2, 0, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 2, 1, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 2, 2, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 2, 3, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 0, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 1, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 2, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 3, CutoffStrategy::Fold3>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 0, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 1, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 2, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 3, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 2, 0, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 2, 1, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 2, 2, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 2, 3, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 0, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 1, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 2, CutoffStrategy::Unroll64CRC>>());
+ ret.push_back(y_absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 3, CutoffStrategy::Unroll64CRC>>());
+
+ return ret;
+}
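+// The 24 instantiations above cover the full template space: {1, 2, 3} CRC
+// streams x {0, 1, 2, 3} PCLMULQDQ streams x {Fold3, Unroll64CRC} cutoff
+// strategies, presumably so tests and benchmarks can exercise every variant.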
+
+#else // !Y_ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
+
+std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
+ return std::vector<std::unique_ptr<CRCImpl>>();
+}
+
+// No hardware acceleration is available.
+CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }
+
+#endif
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/non_temporal_arm_intrinsics.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/non_temporal_arm_intrinsics.h
new file mode 100644
index 0000000000..7a1dbf9a83
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/non_temporal_arm_intrinsics.h
@@ -0,0 +1,79 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_
+#define Y_ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_
+
+#include "y_absl/base/config.h"
+
+#ifdef __aarch64__
+#include <arm_neon.h>
+
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+// Guarantees that every preceding store is globally visible before any
+// subsequent store.
+// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+static inline __attribute__((always_inline)) void _mm_sfence(void) {
+ __sync_synchronize();
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+// Loads a 128-bit value.
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(
+ const __m128i *p) {
+ return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
+}
+
+// Stores the data in a to the address p without polluting the caches. If the
+// cache line containing address p is already in the cache, the cache will be
+// updated.
+// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+static inline __attribute__((always_inline)) void _mm_stream_si128(__m128i *p,
+ __m128i a) {
+#if Y_ABSL_HAVE_BUILTIN(__builtin_nontemporal_store)
+ __builtin_nontemporal_store(a, p);
+#else
+ vst1q_s64((int64_t *)p, vreinterpretq_s64_m128i(a));
+#endif
+}
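+// (When __builtin_nontemporal_store is unavailable, the vst1q_s64 fallback
+// above is an ordinary store with no non-temporal hint.)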
+
+// Sets the 16 signed 8-bit integer values.
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
+static inline __attribute__((always_inline)) __m128i _mm_set_epi8(
+ signed char b15, signed char b14, signed char b13, signed char b12,
+ signed char b11, signed char b10, signed char b9, signed char b8,
+ signed char b7, signed char b6, signed char b5, signed char b4,
+ signed char b3, signed char b2, signed char b1, signed char b0) {
+ int8_t __attribute__((aligned(16)))
+ data[16] = {(int8_t)b0, (int8_t)b1, (int8_t)b2, (int8_t)b3,
+ (int8_t)b4, (int8_t)b5, (int8_t)b6, (int8_t)b7,
+ (int8_t)b8, (int8_t)b9, (int8_t)b10, (int8_t)b11,
+ (int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15};
+ return (__m128i)vld1q_s8(data);
+}
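+// (Matching Intel's _mm_set_epi8 semantics, arguments are listed from the
+// most significant byte (b15) down to the least significant (b0).)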
+#endif // __aarch64__
+
+#endif // Y_ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/non_temporal_memcpy.h b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/non_temporal_memcpy.h
new file mode 100644
index 0000000000..fce0007046
--- /dev/null
+++ b/contrib/restricted/abseil-cpp-tstring/y_absl/crc/internal/non_temporal_memcpy.h
@@ -0,0 +1,180 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Y_ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
+#define Y_ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+#ifdef __aarch64__
+#include "y_absl/crc/internal/non_temporal_arm_intrinsics.h"
+#endif
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "y_absl/base/config.h"
+#include "y_absl/base/optimization.h"
+
+namespace y_absl {
+Y_ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+// This non-temporal memcpy copies data using regular loads and non-temporal
+// stores. It works with both 16-byte-aligned and unaligned addresses. If the
+// data at the destination is not accessed immediately afterwards, a
+// non-temporal memcpy can save one DRAM load of the destination cache line.
+constexpr size_t kCacheLineSize = Y_ABSL_CACHELINE_SIZE;
+
+// If the objects overlap, the behavior is undefined.
+inline void *non_temporal_store_memcpy(void *__restrict dst,
+ const void *__restrict src, size_t len) {
+#if defined(__SSE3__) || defined(__aarch64__) || \
+ (defined(_MSC_VER) && defined(__AVX__))
+ // This implementation requires SSE3.
+ // MSVC cannot target SSE3 directly, but when MSVC targets AVX,
+ // SSE3 support is implied.
+ uint8_t *d = reinterpret_cast<uint8_t *>(dst);
+ const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
+
+ // memcpy() the misaligned header. At the end of this if block, <d> is
+ // aligned to a 64-byte cacheline boundary or <len> == 0.
+ if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
+ uintptr_t bytes_before_alignment_boundary =
+ kCacheLineSize -
+ (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
+ size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
+ assert(bytes_before_alignment_boundary < kCacheLineSize);
+ memcpy(d, s, header_len);
+ d += header_len;
+ s += header_len;
+ len -= header_len;
+ }
+
+ if (len >= kCacheLineSize) {
+ _mm_sfence();
+ __m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
+ const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
+ constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
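+ // With the typical 64-byte cache line and 16-byte vectors this is 4, so
+ // each loop iteration below streams exactly one cache line.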
+ size_t loops = len / kCacheLineSize;
+
+ while (len >= kCacheLineSize) {
+ __m128i temp1, temp2, temp3, temp4;
+ temp1 = _mm_lddqu_si128(src_cacheline + 0);
+ temp2 = _mm_lddqu_si128(src_cacheline + 1);
+ temp3 = _mm_lddqu_si128(src_cacheline + 2);
+ temp4 = _mm_lddqu_si128(src_cacheline + 3);
+ _mm_stream_si128(dst_cacheline + 0, temp1);
+ _mm_stream_si128(dst_cacheline + 1, temp2);
+ _mm_stream_si128(dst_cacheline + 2, temp3);
+ _mm_stream_si128(dst_cacheline + 3, temp4);
+ src_cacheline += kOpsPerCacheLine;
+ dst_cacheline += kOpsPerCacheLine;
+ len -= kCacheLineSize;
+ }
+ d += loops * kCacheLineSize;
+ s += loops * kCacheLineSize;
+ _mm_sfence();
+ }
+
+ // memcpy the tail.
+ if (len) {
+ memcpy(d, s, len);
+ }
+ return dst;
+#else
+ // Fall back to regular memcpy.
+ return memcpy(dst, src, len);
+#endif // __SSE3__ || __aarch64__ || (_MSC_VER && __AVX__)
+}
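+// Illustrative use (hypothetical buffer names): when the destination will
+// not be read soon, e.g. data staged for another thread or for I/O:
+//
+//   non_temporal_store_memcpy(dst_buf, src_buf, buf_len);
+//
+// For destinations that are read back immediately, plain memcpy is usually
+// preferable, since streaming stores bypass the cache.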
+
+inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
+ const void *__restrict src,
+ size_t len) {
+#ifdef __AVX__
+ uint8_t *d = reinterpret_cast<uint8_t *>(dst);
+ const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
+
+ // memcpy() the misaligned header. At the end of this if block, <d> is
+ // aligned to a 64-byte cacheline boundary or <len> == 0.
+ if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
+ uintptr_t bytes_before_alignment_boundary =
+ kCacheLineSize -
+ (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
+ size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
+ assert(bytes_before_alignment_boundary < kCacheLineSize);
+ memcpy(d, s, header_len);
+ d += header_len;
+ s += header_len;
+ len -= header_len;
+ }
+
+ if (len >= kCacheLineSize) {
+ _mm_sfence();
+ __m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
+ const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
+ constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
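+ // With the typical 64-byte cache line and 32-byte vectors this is 2, so
+ // each loop iteration below streams exactly one cache line.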
+ size_t loops = len / kCacheLineSize;
+
+ while (len >= kCacheLineSize) {
+ __m256i temp1, temp2;
+ temp1 = _mm256_lddqu_si256(src_cacheline + 0);
+ temp2 = _mm256_lddqu_si256(src_cacheline + 1);
+ _mm256_stream_si256(dst_cacheline + 0, temp1);
+ _mm256_stream_si256(dst_cacheline + 1, temp2);
+ src_cacheline += kOpsPerCacheLine;
+ dst_cacheline += kOpsPerCacheLine;
+ len -= kCacheLineSize;
+ }
+ d += loops * kCacheLineSize;
+ s += loops * kCacheLineSize;
+ _mm_sfence();
+ }
+
+ // memcpy the tail.
+ if (len) {
+ memcpy(d, s, len);
+ }
+ return dst;
+#else
+ // Fall back to regular memcpy when AVX is not available.
+ return memcpy(dst, src, len);
+#endif // __AVX__
+}
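+// (Design note: the AVX variant moves each cache line with two 32-byte
+// streaming stores instead of four 16-byte ones; the choice between the two
+// copies is presumably made by the caller at runtime, since the __AVX__
+// guard here only reflects compile-time support.)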
+
+} // namespace crc_internal
+Y_ABSL_NAMESPACE_END
+} // namespace y_absl
+
+#endif // Y_ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_