author     thegeorg <thegeorg@yandex-team.com>          2024-01-25 20:29:07 +0300
committer  Alexander Smirnov <alex@ydb.tech>            2024-01-26 20:49:19 +0300
commit     22817e4eef3e09482237a2dfdaed9319241a11bb (patch)
tree       858f4013320fd3f5c413c1b909abdb5c2c22bb3d /contrib/restricted/abseil-cpp/absl/crc/internal
parent     45ce8d4f5fd282b3221a19fe7fe71459a820dfca (diff)
download   ydb-22817e4eef3e09482237a2dfdaed9319241a11bb.tar.gz
Update contrib/restricted/abseil-cpp to 20240116.0
Diffstat (limited to 'contrib/restricted/abseil-cpp/absl/crc/internal')
-rw-r--r--  contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc                  |  28
-rw-r--r--  contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h                   |   6
-rw-r--r--  contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h  |  39
-rw-r--r--  contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h                   |   9
-rw-r--r--  contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc         |   6
-rw-r--r--  contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_arm_combined.cc (renamed from contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_64.cc) | 112
-rw-r--r--  contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc        |  12
7 files changed, 151 insertions(+), 61 deletions(-)
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc b/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc
index 838380854f..d7eedd1ca4 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc
@@ -189,8 +189,14 @@ CpuType GetAmdCpuType() {
break;
case 0x19:
switch (model_num) {
+ case 0x0: // Stepping Ax
case 0x1: // Stepping B0
return CpuType::kAmdMilan;
+ case 0x10: // Stepping A0
+ case 0x11: // Stepping B0
+ return CpuType::kAmdGenoa;
+ case 0x44: // Stepping A0
+ return CpuType::kAmdRyzenV3000;
default:
return CpuType::kUnknown;
}
@@ -237,8 +243,26 @@ CpuType GetCpuType() {
ABSL_INTERNAL_AARCH64_ID_REG_READ(MIDR_EL1, midr);
uint32_t implementer = (midr >> 24) & 0xff;
uint32_t part_number = (midr >> 4) & 0xfff;
- if (implementer == 0x41 && part_number == 0xd0c) {
- return CpuType::kArmNeoverseN1;
+ switch (implementer) {
+ case 0x41:
+ switch (part_number) {
+ case 0xd0c: return CpuType::kArmNeoverseN1;
+ case 0xd40: return CpuType::kArmNeoverseV1;
+ case 0xd49: return CpuType::kArmNeoverseN2;
+ case 0xd4f: return CpuType::kArmNeoverseV2;
+ default:
+ return CpuType::kUnknown;
+ }
+ break;
+ case 0xc0:
+ switch (part_number) {
+ case 0xac3: return CpuType::kAmpereSiryn;
+ default:
+ return CpuType::kUnknown;
+ }
+ break;
+ default:
+ return CpuType::kUnknown;
}
}
return CpuType::kUnknown;
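
The new AArch64 branch above dispatches on two MIDR_EL1 fields: the implementer code in bits [31:24] (0x41 is Arm Ltd., 0xc0 is Ampere) and the part number in bits [15:4]. A minimal sketch of that decoding, using a hypothetical helper that is not part of this patch:

  #include <cstdint>

  struct MidrFields {
    uint32_t implementer;  // MIDR_EL1 bits [31:24]
    uint32_t part_number;  // MIDR_EL1 bits [15:4]
  };

  // Splits a raw MIDR_EL1 value into the fields the switch above keys on;
  // for example, {0x41, 0xd49} maps to CpuType::kArmNeoverseN2.
  inline MidrFields DecodeMidr(uint64_t midr) {
    return MidrFields{static_cast<uint32_t>((midr >> 24) & 0xff),
                      static_cast<uint32_t>((midr >> 4) & 0xfff)};
  }
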
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h b/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h
index 6054f6960d..01e19590ca 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h
@@ -29,6 +29,8 @@ enum class CpuType {
kAmdRome,
kAmdNaples,
kAmdMilan,
+ kAmdGenoa,
+ kAmdRyzenV3000,
kIntelCascadelakeXeon,
kIntelSkylakeXeon,
kIntelBroadwell,
@@ -37,6 +39,10 @@ enum class CpuType {
kIntelSandybridge,
kIntelWestmere,
kArmNeoverseN1,
+ kArmNeoverseV1,
+ kAmpereSiryn,
+ kArmNeoverseN2,
+ kArmNeoverseV2
};
// Returns the type of host CPU this code is running on. Returns kUnknown if
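
The header gains enumerators for the CPUs recognized above. GetCpuType() performs the hardware queries each time it is called, so a caller would normally detect once and reuse the result; a hedged sketch with a hypothetical caching wrapper (not part of this patch):

  #include "absl/crc/internal/cpu_detect.h"

  // Detect the host CPU once; later calls return the cached value.
  absl::crc_internal::CpuType HostCpuType() {
    static const absl::crc_internal::CpuType type =
        absl::crc_internal::GetCpuType();
    return type;
  }
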
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h b/contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 39e53dd08e..59995ae3e2 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -59,6 +59,8 @@ namespace crc_internal {
#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t;
#else
+// Note: Do not use __m128i_u, it is not portable.
+// Use V128_LoadU() to perform an unaligned load from __m128i*.
using V128 = __m128i;
#endif
@@ -78,6 +80,9 @@ V128 V128_Load(const V128* src);
// Load 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128* src);
+// Store 128 bits of integer data. |dst| must be 16-byte aligned.
+void V128_Store(V128* dst, V128 data);
+
// Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r);
@@ -109,6 +114,10 @@ V128 V128_ShiftRight(const V128 l);
template <int imm>
int V128_Extract32(const V128 l);
+// Extracts a 64-bit integer from |l|, selected with |imm|.
+template <int imm>
+uint64_t V128_Extract64(const V128 l);
+
// Extracts the low 64 bits from V128.
int64_t V128_Low64(const V128 l);
@@ -139,6 +148,8 @@ inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
+
inline V128 V128_PMulHi(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x11);
}
@@ -173,6 +184,11 @@ inline int V128_Extract32(const V128 l) {
return _mm_extract_epi32(l, imm);
}
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+ return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
+}
+
inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
@@ -203,10 +219,14 @@ inline V128 V128_LoadU(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}
+inline void V128_Store(V128* dst, V128 data) {
+ vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
+}
+
// Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%.
-// TODO(b/193678732): Investigate why the compiler decides not to generate
-// such instructions and why it becomes so much worse.
+// TODO(b/193678732): Investigate why there is a slight performance hit when
+// using intrinsics instead of inline assembly.
inline V128 V128_PMulHi(const V128 l, const V128 r) {
uint64x2_t res;
__asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
@@ -215,10 +235,14 @@ inline V128 V128_PMulHi(const V128 l, const V128 r) {
return res;
}
+// TODO(b/193678732): Investigate why the compiler decides to move the constant
+// loop multiplicands from GPR to Neon registers every loop iteration.
inline V128 V128_PMulLow(const V128 l, const V128 r) {
- return reinterpret_cast<V128>(vmull_p64(
- reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
- reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
+ uint64x2_t res;
+ __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
+ : "=w"(res)
+ : "w"(l), "w"(r));
+ return res;
}
inline V128 V128_PMul01(const V128 l, const V128 r) {
@@ -252,6 +276,11 @@ inline int V128_Extract32(const V128 l) {
return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
}
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+ return vgetq_lane_u64(l, imm);
+}
+
inline int64_t V128_Low64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
}
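
The new V128_Store() and V128_Extract64<imm>() wrappers map to _mm_store_si128 / _mm_extract_epi64 on x86 and vst1q_u64 / vgetq_lane_u64 on NEON, giving the memcpy engine one spelling for its inner step on both backends. A rough sketch of that step, simplified from the engine below rather than copied verbatim:

  #include <cstdint>

  #include "absl/crc/internal/crc32_x86_arm_combined_simd.h"

  // One 16-byte block: unaligned load from src, aligned store to dst (which
  // must be 16-byte aligned), then fold both 64-bit lanes into the running
  // CRC32C state.
  inline uint32_t CopyBlockAndCrc(char* dst, const char* src, uint32_t crc) {
    using absl::crc_internal::V128;
    V128 block =
        absl::crc_internal::V128_LoadU(reinterpret_cast<const V128*>(src));
    absl::crc_internal::V128_Store(reinterpret_cast<V128*>(dst), block);
    crc = absl::crc_internal::CRC32_u64(
        crc, absl::crc_internal::V128_Extract64<0>(block));
    crc = absl::crc_internal::CRC32_u64(
        crc, absl::crc_internal::V128_Extract64<1>(block));
    return crc;
  }
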
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h
index 4909d43366..a0fed65afa 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h
@@ -20,12 +20,15 @@
#include "absl/base/config.h"
#include "absl/crc/crc32c.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
// Defined if the class AcceleratedCrcMemcpyEngine exists.
-#if defined(__x86_64__) && defined(__SSE4_2__)
-#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
-#elif defined(_MSC_VER) && defined(__AVX__)
+// TODO(b/299127771): Consider relaxing the pclmul requirement once the other
+// intrinsics are conditionally compiled without it.
+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
+#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1
#endif
namespace absl {
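
With this change the engine-availability macro is derived from the shared SIMD feature macros rather than raw compiler defines, so one header now gates both ISAs. A hedged illustration of the resulting three-way split (commentary only, not code from the patch):

  #if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE)
    // x86-64 with the SSE4.2/PCLMUL feature set: the accelerated engine builds.
  #elif defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
    // AArch64 with the corresponding CRC/crypto features: the same engine builds.
  #else
    // Neither: only FallbackCrcMemcpyEngine (crc_memcpy_fallback.cc) is compiled.
  #endif
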
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc
index 15b4b05594..07795504e3 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc
@@ -54,7 +54,8 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
}
// Compile the following only if we don't have an accelerated CRC memcpy engine.
-#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \
+ !defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
CrcMemcpy::ArchSpecificEngines engines;
@@ -68,7 +69,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
return std::make_unique<FallbackCrcMemcpyEngine>();
}
-#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE &&
+ // !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
} // namespace crc_internal
ABSL_NAMESPACE_END
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_64.cc b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
index d42b08dc9f..968e9ae359 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_64.cc
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
-// XMM registers do not support the CRC instruction (yet). While copying,
-// compute the running CRC of the data being copied.
+// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
+// registers because XMM registers do not support the CRC instruction (yet).
+// While copying, compute the running CRC of the data being copied.
//
// It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C). This file will do nothing if that is not true.
@@ -49,17 +49,20 @@
#include <array>
#include <cstddef>
#include <cstdint>
-#include <type_traits>
+#include <cstring>
+#include <memory>
-#include "absl/base/dynamic_annotations.h"
+#include "absl/base/config.h"
#include "absl/base/optimization.h"
#include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"
-#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
+ defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
namespace absl {
ABSL_NAMESPACE_BEGIN
@@ -74,7 +77,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
uint32_t crc_uint32 = static_cast<uint32_t>(crc);
for (std::size_t i = 0; i < length; i++) {
uint8_t data = *reinterpret_cast<const uint8_t*>(src);
- crc_uint32 = _mm_crc32_u8(crc_uint32, data);
+ crc_uint32 = CRC32_u8(crc_uint32, data);
*reinterpret_cast<uint8_t*>(dst) = data;
++src;
++dst;
@@ -82,36 +85,35 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
return crc32c_t{crc_uint32};
}
-constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
+constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
// Common function for copying the tails of multiple large regions.
template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) {
- std::array<__m128i, vec_regions> data;
+ std::array<V128, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
while (copy_rounds > 0) {
for (size_t i = 0; i < vec_regions; i++) {
size_t region = i;
- auto* vsrc =
- reinterpret_cast<const __m128i*>(*src + region_size * region);
- auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
+ auto* vsrc = reinterpret_cast<const V128*>(*src + region_size * region);
+ auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
// Load the blocks, unaligned
- data[i] = _mm_loadu_si128(vsrc);
+ data[i] = V128_LoadU(vsrc);
// Store the blocks, aligned
- _mm_store_si128(vdst, data[i]);
+ V128_Store(vdst, data[i]);
// Compute the running CRC
crcs[region] = crc32c_t{static_cast<uint32_t>(
- _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
- static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
+ CRC32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
- _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
- static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
+ CRC32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
}
for (size_t i = 0; i < int_regions; i++) {
@@ -125,7 +127,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j);
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
*(udst + j) = int_data[data_index];
@@ -133,8 +135,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
}
// Increment pointers
- *src += sizeof(__m128i);
- *dst += sizeof(__m128i);
+ *src += sizeof(V128);
+ *dst += sizeof(V128);
--copy_rounds;
}
}
@@ -158,8 +160,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
void* __restrict dst, const void* __restrict src, std::size_t length,
crc32c_t initial_crc) const {
constexpr std::size_t kRegions = vec_regions + int_regions;
+ static_assert(kRegions > 0, "Must specify at least one region.");
constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
- constexpr std::size_t kBlockSize = sizeof(__m128i);
+ constexpr std::size_t kBlockSize = sizeof(V128);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
// Number of blocks per cacheline.
@@ -235,7 +238,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
const std::size_t tail_size = length - (kRegions * region_size);
// Holding registers for data in each region.
- std::array<__m128i, vec_regions> vec_data;
+ std::array<V128, vec_regions> vec_data;
std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
// Main loop.
@@ -243,7 +246,10 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Prefetch kPrefetchAhead bytes ahead of each pointer.
for (size_t i = 0; i < kRegions; i++) {
absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
+#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+ // TODO(b/297082454): investigate dropping prefetch on x86.
absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
+#endif
}
// Load and store data, computing CRC on the way.
@@ -256,21 +262,20 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
size_t region = (j + i) % kRegions;
auto* vsrc =
- reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
- auto* vdst =
- reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
+ reinterpret_cast<const V128*>(src_bytes + region_size * region);
+ auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
// Load and CRC data.
- vec_data[j] = _mm_loadu_si128(vsrc + i);
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
- static_cast<uint32_t>(crcs[region]),
- static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
- static_cast<uint32_t>(crcs[region]),
- static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
+ vec_data[j] = V128_LoadU(vsrc + i);
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ CRC32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ CRC32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
// Store the data.
- _mm_store_si128(vdst + i, vec_data[j]);
+ V128_Store(vdst + i, vec_data[j]);
}
// Preload the partial CRCs for the CLMUL subregions.
@@ -290,7 +295,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
// Store the data.
@@ -313,6 +318,21 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
src_bytes += region_size * (kRegions - 1);
dst_bytes += region_size * (kRegions - 1);
+ // Copy and CRC the tail through the XMM registers.
+ std::size_t tail_blocks = tail_size / kBlockSize;
+ LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
+ tail_blocks);
+
+ // Final tail copy for under 16 bytes.
+ crcs[kRegions - 1] =
+ ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
+ crcs[kRegions - 1]);
+
+ if (kRegions == 1) {
+ // If there is only one region, finalize and return its CRC.
+ return crc32c_t{static_cast<uint32_t>(crcs[0]) ^ kCrcDataXor};
+ }
+
// Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
// XOR done before doing block copy + CRCs.
for (size_t i = 0; i + 1 < kRegions; i++) {
@@ -325,16 +345,6 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
}
- // Copy and CRC the tail through the XMM registers.
- std::size_t tail_blocks = tail_size / kBlockSize;
- LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
- tail_blocks);
-
- // Final tail copy for under 16 bytes.
- crcs[kRegions - 1] =
- ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
- crcs[kRegions - 1]);
-
// Finalize and concatenate the final CRC, then return.
crcs[kRegions - 1] =
crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
@@ -347,9 +357,11 @@ CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
// Get the underlying architecture.
CpuType cpu_type = GetCpuType();
switch (cpu_type) {
- case CpuType::kUnknown:
case CpuType::kAmdRome:
case CpuType::kAmdNaples:
+ case CpuType::kAmdMilan:
+ case CpuType::kAmdGenoa:
+ case CpuType::kAmdRyzenV3000:
case CpuType::kIntelCascadelakeXeon:
case CpuType::kIntelSkylakeXeon:
case CpuType::kIntelSkylake:
@@ -385,6 +397,9 @@ CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
// strided access to each region, and do the right thing.
case CpuType::kAmdRome:
case CpuType::kAmdNaples:
+ case CpuType::kAmdMilan:
+ case CpuType::kAmdGenoa:
+ case CpuType::kAmdRyzenV3000:
return {
/*.temporal=*/new AcceleratedCrcMemcpyEngine<1, 2>(),
/*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
@@ -421,6 +436,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
} else if (vector == 1 && integer == 2) {
return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
+ } else if (vector == 1 && integer == 0) {
+ return std::make_unique<AcceleratedCrcMemcpyEngine<1, 0>>();
}
return nullptr;
}
@@ -429,4 +446,5 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
ABSL_NAMESPACE_END
} // namespace absl
-#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
+ // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
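
The file rename reflects that the same engine now builds for AArch64, and GetTestEngine() gains a single-vector-region configuration. A hedged usage sketch of that new <1, 0> engine, assuming one of the accelerated-engine macros is defined and using a hypothetical wrapper function:

  #include <cstddef>
  #include <memory>

  #include "absl/crc/crc32c.h"
  #include "absl/crc/internal/crc_memcpy.h"

  // Copies n bytes from src to dst and returns their CRC32C, computed in the
  // same pass over the data.
  absl::crc32c_t CopyAndChecksum(void* dst, const void* src, std::size_t n) {
    // Request the 1-vector-region / 0-integer-region engine added above.
    std::unique_ptr<absl::crc_internal::CrcMemcpyEngine> engine =
        absl::crc_internal::CrcMemcpy::GetTestEngine(/*vector=*/1,
                                                     /*integer=*/0);
    if (engine == nullptr) {
      // This vector/integer combination is not available in this build.
      return absl::crc32c_t{0};
    }
    return engine->Compute(dst, src, n, absl::crc32c_t{0});
  }
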
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc
index ef521d22d1..51eff4eddc 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc
@@ -16,14 +16,14 @@
#include <cstddef>
#include <cstdint>
+#include <memory>
+#include <vector>
#include "absl/base/attributes.h"
#include "absl/base/config.h"
-#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/endian.h"
#include "absl/base/prefetch.h"
#include "absl/crc/internal/cpu_detect.h"
-#include "absl/crc/internal/crc.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_internal.h"
#include "absl/memory/memory.h"
@@ -634,8 +634,16 @@ CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 0, CutoffStrategy::Fold3>();
case CpuType::kArmNeoverseN1:
+ case CpuType::kArmNeoverseN2:
+ case CpuType::kArmNeoverseV1:
return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 1, CutoffStrategy::Unroll64CRC>();
+ case CpuType::kAmpereSiryn:
+ return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 2, CutoffStrategy::Fold3>();
+ case CpuType::kArmNeoverseV2:
+ return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 1, 2, CutoffStrategy::Unroll64CRC>();
#if defined(__aarch64__)
default:
// Not all ARM processors support the needed instructions, so check here