| author | thegeorg <thegeorg@yandex-team.com> | 2024-01-25 20:29:07 +0300 |
| --- | --- | --- |
| committer | thegeorg <thegeorg@yandex-team.com> | 2024-01-25 20:51:18 +0300 |
| commit | 24abb4e0b50dd362e8cf30a682d5212252936b09 (patch) | |
| tree | c5356c59cfe5480daca33b63b1742680a48586e8 /contrib/restricted/abseil-cpp/absl/crc/internal | |
| parent | b65dd88d2d36688300317c22b9c14ed9dcdeb37d (diff) | |
| download | ydb-24abb4e0b50dd362e8cf30a682d5212252936b09.tar.gz | |
Update contrib/restricted/abseil-cpp to 20240116.0
Diffstat (limited to 'contrib/restricted/abseil-cpp/absl/crc/internal')
| -rw-r--r-- | contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc | 28 |
| -rw-r--r-- | contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h | 6 |
| -rw-r--r-- | contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h | 39 |
| -rw-r--r-- | contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h | 9 |
| -rw-r--r-- | contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc | 6 |
| -rw-r--r-- | contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_arm_combined.cc (renamed from contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_64.cc) | 112 |
| -rw-r--r-- | contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc | 12 |
7 files changed, 151 insertions, 61 deletions
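The functional core of this update is that the CRC32C copy engine, previously compiled only for x86-64 with SSE4.2, is now also built for AArch64, and CPU detection gains entries for AMD Genoa, AMD Ryzen V3000, Ampere Siryn, and Arm Neoverse V1/N2/V2. A minimal usage sketch of the public API that these internal engines back is shown below; it uses absl::MemcpyCrc32c and absl::ComputeCrc32c from absl/crc/crc32c.h (not part of this diff), and engine selection happens internally via GetCpuType().

```cpp
// Minimal sketch (not part of this diff): the public copy-and-CRC entry point
// that the AcceleratedCrcMemcpyEngine instances below ultimately serve.
#include <string>
#include <vector>

#include "absl/crc/crc32c.h"

int main() {
  std::string src(4096, 'x');
  std::vector<char> dst(src.size());

  // Copy src into dst and compute the CRC32C of the copied bytes in one pass.
  absl::crc32c_t copied_crc =
      absl::MemcpyCrc32c(dst.data(), src.data(), src.size());

  // Computing the CRC separately over the source must give the same value.
  absl::crc32c_t direct_crc = absl::ComputeCrc32c(src);
  return copied_crc == direct_crc ? 0 : 1;
}
```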
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc b/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc
index 838380854f..d7eedd1ca4 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.cc
@@ -189,8 +189,14 @@ CpuType GetAmdCpuType() {
       break;
     case 0x19:
       switch (model_num) {
+        case 0x0:  // Stepping Ax
         case 0x1:  // Stepping B0
           return CpuType::kAmdMilan;
+        case 0x10:  // Stepping A0
+        case 0x11:  // Stepping B0
+          return CpuType::kAmdGenoa;
+        case 0x44:  // Stepping A0
+          return CpuType::kAmdRyzenV3000;
         default:
           return CpuType::kUnknown;
       }
@@ -237,8 +243,26 @@ CpuType GetCpuType() {
     ABSL_INTERNAL_AARCH64_ID_REG_READ(MIDR_EL1, midr);
     uint32_t implementer = (midr >> 24) & 0xff;
     uint32_t part_number = (midr >> 4) & 0xfff;
-    if (implementer == 0x41 && part_number == 0xd0c) {
-      return CpuType::kArmNeoverseN1;
+    switch (implementer) {
+      case 0x41:
+        switch (part_number) {
+          case 0xd0c: return CpuType::kArmNeoverseN1;
+          case 0xd40: return CpuType::kArmNeoverseV1;
+          case 0xd49: return CpuType::kArmNeoverseN2;
+          case 0xd4f: return CpuType::kArmNeoverseV2;
+          default:
+            return CpuType::kUnknown;
+        }
+        break;
+      case 0xc0:
+        switch (part_number) {
+          case 0xac3: return CpuType::kAmpereSiryn;
+          default:
+            return CpuType::kUnknown;
+        }
+        break;
+      default:
+        return CpuType::kUnknown;
     }
   }
   return CpuType::kUnknown;
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h b/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h
index 6054f6960d..01e19590ca 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/cpu_detect.h
@@ -29,6 +29,8 @@ enum class CpuType {
   kAmdRome,
   kAmdNaples,
   kAmdMilan,
+  kAmdGenoa,
+  kAmdRyzenV3000,
   kIntelCascadelakeXeon,
   kIntelSkylakeXeon,
   kIntelBroadwell,
@@ -37,6 +39,10 @@ enum class CpuType {
   kIntelSandybridge,
   kIntelWestmere,
   kArmNeoverseN1,
+  kArmNeoverseV1,
+  kAmpereSiryn,
+  kArmNeoverseN2,
+  kArmNeoverseV2
 };
 
 // Returns the type of host CPU this code is running on. Returns kUnknown if
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h b/contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 39e53dd08e..59995ae3e2 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -59,6 +59,8 @@ namespace crc_internal {
 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
 using V128 = uint64x2_t;
 #else
+// Note: Do not use __m128i_u, it is not portable.
+// Use V128_LoadU() perform an unaligned load from __m128i*.
 using V128 = __m128i;
 #endif
 
@@ -78,6 +80,9 @@ V128 V128_Load(const V128* src);
 // Load 128 bits of integer data. |src| does not need to be aligned.
 V128 V128_LoadU(const V128* src);
 
+// Store 128 bits of integer data. |src| must be 16-byte aligned.
+void V128_Store(V128* dst, V128 data);
+
 // Polynomially multiplies the high 64 bits of |l| and |r|.
 V128 V128_PMulHi(const V128 l, const V128 r);
 
@@ -109,6 +114,10 @@ V128 V128_ShiftRight(const V128 l);
 template <int imm>
 int V128_Extract32(const V128 l);
 
+// Extracts a 64-bit integer from |l|, selected with |imm|.
+template <int imm>
+uint64_t V128_Extract64(const V128 l);
+
 // Extracts the low 64 bits from V128.
 int64_t V128_Low64(const V128 l);
 
@@ -139,6 +148,8 @@ inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
 
 inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
 
+inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
+
 inline V128 V128_PMulHi(const V128 l, const V128 r) {
   return _mm_clmulepi64_si128(l, r, 0x11);
 }
@@ -173,6 +184,11 @@ inline int V128_Extract32(const V128 l) {
   return _mm_extract_epi32(l, imm);
 }
 
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+  return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
+}
+
 inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
 
 inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
@@ -203,10 +219,14 @@ inline V128 V128_LoadU(const V128* src) {
   return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
 }
 
+inline void V128_Store(V128* dst, V128 data) {
+  vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
+}
+
 // Using inline assembly as clang does not generate the pmull2 instruction and
 // performance drops by 15-20%.
-// TODO(b/193678732): Investigate why the compiler decides not to generate
-// such instructions and why it becomes so much worse.
+// TODO(b/193678732): Investigate why there is a slight performance hit when
+// using intrinsics instead of inline assembly.
 inline V128 V128_PMulHi(const V128 l, const V128 r) {
   uint64x2_t res;
   __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
@@ -215,10 +235,14 @@ inline V128 V128_PMulHi(const V128 l, const V128 r) {
   return res;
 }
 
+// TODO(b/193678732): Investigate why the compiler decides to move the constant
+// loop multiplicands from GPR to Neon registers every loop iteration.
 inline V128 V128_PMulLow(const V128 l, const V128 r) {
-  return reinterpret_cast<V128>(vmull_p64(
-      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
-      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
+  uint64x2_t res;
+  __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
+                       : "=w"(res)
+                       : "w"(l), "w"(r));
+  return res;
 }
 
 inline V128 V128_PMul01(const V128 l, const V128 r) {
@@ -252,6 +276,11 @@ inline int V128_Extract32(const V128 l) {
   return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
 }
 
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+  return vgetq_lane_u64(l, imm);
+}
+
 inline int64_t V128_Low64(const V128 l) {
   return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
 }
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h
index 4909d43366..a0fed65afa 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy.h
@@ -20,12 +20,15 @@
 
 #include "absl/base/config.h"
 #include "absl/crc/crc32c.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
 
 // Defined if the class AcceleratedCrcMemcpyEngine exists.
-#if defined(__x86_64__) && defined(__SSE4_2__)
-#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
-#elif defined(_MSC_VER) && defined(__AVX__)
+// TODO(b/299127771): Consider relaxing the pclmul requirement once the other
+// intrinsics are conditionally compiled without it.
+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
+#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1
 #endif
 
 namespace absl {
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc
index 15b4b05594..07795504e3 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_fallback.cc
@@ -54,7 +54,8 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
 }
 
 // Compile the following only if we don't have
-#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \
+    !defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
 
 CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
   CrcMemcpy::ArchSpecificEngines engines;
@@ -68,7 +69,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
   return std::make_unique<FallbackCrcMemcpyEngine>();
 }
 
-#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif  // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE &&
+        // !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
 
 }  // namespace crc_internal
 ABSL_NAMESPACE_END
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_64.cc b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
index d42b08dc9f..968e9ae359 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_64.cc
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
-// XMM registers do not support the CRC instruction (yet). While copying,
-// compute the running CRC of the data being copied.
+// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
+// registers because XMM registers do not support the CRC instruction (yet).
+// While copying, compute the running CRC of the data being copied.
 //
 // It is assumed that any CPU running this code has SSE4.2 instructions
 // available (for CRC32C). This file will do nothing if that is not true.
@@ -49,17 +49,20 @@
 #include <array>
 #include <cstddef>
 #include <cstdint>
-#include <type_traits>
+#include <cstring>
+#include <memory>
 
-#include "absl/base/dynamic_annotations.h"
+#include "absl/base/config.h"
 #include "absl/base/optimization.h"
 #include "absl/base/prefetch.h"
 #include "absl/crc/crc32c.h"
 #include "absl/crc/internal/cpu_detect.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
 #include "absl/crc/internal/crc_memcpy.h"
 #include "absl/strings/string_view.h"
 
-#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
+    defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
 
 namespace absl {
 ABSL_NAMESPACE_BEGIN
@@ -74,7 +77,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
   uint32_t crc_uint32 = static_cast<uint32_t>(crc);
   for (std::size_t i = 0; i < length; i++) {
     uint8_t data = *reinterpret_cast<const uint8_t*>(src);
-    crc_uint32 = _mm_crc32_u8(crc_uint32, data);
+    crc_uint32 = CRC32_u8(crc_uint32, data);
     *reinterpret_cast<uint8_t*>(dst) = data;
     ++src;
     ++dst;
@@ -82,36 +85,35 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
   return crc32c_t{crc_uint32};
 }
 
-constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
+constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
 
 // Common function for copying the tails of multiple large regions.
 template <size_t vec_regions, size_t int_regions>
 inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
                           size_t region_size, size_t copy_rounds) {
-  std::array<__m128i, vec_regions> data;
+  std::array<V128, vec_regions> data;
   std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
 
   while (copy_rounds > 0) {
     for (size_t i = 0; i < vec_regions; i++) {
       size_t region = i;
 
-      auto* vsrc =
-          reinterpret_cast<const __m128i*>(*src + region_size * region);
-      auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
+      auto* vsrc = reinterpret_cast<const V128*>(*src + region_size * region);
+      auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
 
       // Load the blocks, unaligned
-      data[i] = _mm_loadu_si128(vsrc);
+      data[i] = V128_LoadU(vsrc);
 
       // Store the blocks, aligned
-      _mm_store_si128(vdst, data[i]);
+      V128_Store(vdst, data[i]);
 
       // Compute the running CRC
       crcs[region] = crc32c_t{static_cast<uint32_t>(
-          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
-                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
+          CRC32_u64(static_cast<uint32_t>(crcs[region]),
+                    static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
       crcs[region] = crc32c_t{static_cast<uint32_t>(
-          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
-                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
+          CRC32_u64(static_cast<uint32_t>(crcs[region]),
+                    static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
     }
 
     for (size_t i = 0; i < int_regions; i++) {
@@ -125,7 +127,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
         size_t data_index = i * kIntLoadsPerVec + j;
 
         int_data[data_index] = *(usrc + j);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+        crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
             static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
 
         *(udst + j) = int_data[data_index];
@@ -133,8 +135,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
     }
 
     // Increment pointers
-    *src += sizeof(__m128i);
-    *dst += sizeof(__m128i);
+    *src += sizeof(V128);
+    *dst += sizeof(V128);
     --copy_rounds;
   }
 }
@@ -158,8 +160,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
     void* __restrict dst, const void* __restrict src, std::size_t length,
     crc32c_t initial_crc) const {
   constexpr std::size_t kRegions = vec_regions + int_regions;
+  static_assert(kRegions > 0, "Must specify at least one region.");
   constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
-  constexpr std::size_t kBlockSize = sizeof(__m128i);
+  constexpr std::size_t kBlockSize = sizeof(V128);
   constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
 
   // Number of blocks per cacheline.
@@ -235,7 +238,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
   const std::size_t tail_size = length - (kRegions * region_size);
 
   // Holding registers for data in each region.
-  std::array<__m128i, vec_regions> vec_data;
+  std::array<V128, vec_regions> vec_data;
   std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
 
   // Main loop.
@@ -243,7 +246,10 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
     // Prefetch kPrefetchAhead bytes ahead of each pointer.
     for (size_t i = 0; i < kRegions; i++) {
      absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
+#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+      // TODO(b/297082454): investigate dropping prefetch on x86.
      absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
+#endif
     }
 
     // Load and store data, computing CRC on the way.
@@ -256,21 +262,20 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
         size_t region = (j + i) % kRegions;
 
         auto* vsrc =
-            reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
-        auto* vdst =
-            reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
+            reinterpret_cast<const V128*>(src_bytes + region_size * region);
+        auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
 
         // Load and CRC data.
-        vec_data[j] = _mm_loadu_si128(vsrc + i);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
-            static_cast<uint32_t>(crcs[region]),
-            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
-        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
-            static_cast<uint32_t>(crcs[region]),
-            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
+        vec_data[j] = V128_LoadU(vsrc + i);
+        crcs[region] = crc32c_t{static_cast<uint32_t>(
+            CRC32_u64(static_cast<uint32_t>(crcs[region]),
+                      static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
+        crcs[region] = crc32c_t{static_cast<uint32_t>(
+            CRC32_u64(static_cast<uint32_t>(crcs[region]),
+                      static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
 
         // Store the data.
-        _mm_store_si128(vdst + i, vec_data[j]);
+        V128_Store(vdst + i, vec_data[j]);
       }
 
       // Preload the partial CRCs for the CLMUL subregions.
@@ -290,7 +295,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
 
           // Load and CRC the data.
           int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
-          crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+          crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
               static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
 
           // Store the data.
@@ -313,6 +318,21 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
   src_bytes += region_size * (kRegions - 1);
   dst_bytes += region_size * (kRegions - 1);
 
+  // Copy and CRC the tail through the XMM registers.
+  std::size_t tail_blocks = tail_size / kBlockSize;
+  LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
+                      tail_blocks);
+
+  // Final tail copy for under 16 bytes.
+  crcs[kRegions - 1] =
+      ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
+                   crcs[kRegions - 1]);
+
+  if (kRegions == 1) {
+    // If there is only one region, finalize and return its CRC.
+    return crc32c_t{static_cast<uint32_t>(crcs[0]) ^ kCrcDataXor};
+  }
+
   // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
   // XOR done before doing block copy + CRCs.
   for (size_t i = 0; i + 1 < kRegions; i++) {
@@ -325,16 +345,6 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
     full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
   }
 
-  // Copy and CRC the tail through the XMM registers.
-  std::size_t tail_blocks = tail_size / kBlockSize;
-  LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
-                      tail_blocks);
-
-  // Final tail copy for under 16 bytes.
-  crcs[kRegions - 1] =
-      ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
-                   crcs[kRegions - 1]);
-
   // Finalize and concatenate the final CRC, then return.
   crcs[kRegions - 1] =
       crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
@@ -347,9 +357,11 @@ CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
   // Get the underlying architecture.
   CpuType cpu_type = GetCpuType();
   switch (cpu_type) {
-    case CpuType::kUnknown:
     case CpuType::kAmdRome:
     case CpuType::kAmdNaples:
+    case CpuType::kAmdMilan:
+    case CpuType::kAmdGenoa:
+    case CpuType::kAmdRyzenV3000:
     case CpuType::kIntelCascadelakeXeon:
     case CpuType::kIntelSkylakeXeon:
     case CpuType::kIntelSkylake:
@@ -385,6 +397,9 @@ CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
     // strided access to each region, and do the right thing.
     case CpuType::kAmdRome:
     case CpuType::kAmdNaples:
+    case CpuType::kAmdMilan:
+    case CpuType::kAmdGenoa:
+    case CpuType::kAmdRyzenV3000:
       return {
           /*.temporal=*/new AcceleratedCrcMemcpyEngine<1, 2>(),
           /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
@@ -421,6 +436,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
     return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
   } else if (vector == 1 && integer == 2) {
     return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
+  } else if (vector == 1 && integer == 0) {
+    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 0>>();
   }
   return nullptr;
 }
@@ -429,4 +446,5 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
 ABSL_NAMESPACE_END
 }  // namespace absl
 
-#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
+        // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
diff --git a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc
index ef521d22d1..51eff4eddc 100644
--- a/contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/contrib/restricted/abseil-cpp/absl/crc/internal/crc_x86_arm_combined.cc
@@ -16,14 +16,14 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <memory>
+#include <vector>
 
 #include "absl/base/attributes.h"
 #include "absl/base/config.h"
-#include "absl/base/dynamic_annotations.h"
 #include "absl/base/internal/endian.h"
 #include "absl/base/prefetch.h"
 #include "absl/crc/internal/cpu_detect.h"
-#include "absl/crc/internal/crc.h"
 #include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
 #include "absl/crc/internal/crc_internal.h"
 #include "absl/memory/memory.h"
@@ -634,8 +634,16 @@ CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
       return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
     case CpuType::kArmNeoverseN1:
+    case CpuType::kArmNeoverseN2:
+    case CpuType::kArmNeoverseV1:
       return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 1, CutoffStrategy::Unroll64CRC>();
+    case CpuType::kAmpereSiryn:
+      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+         3, 2, CutoffStrategy::Fold3>();
+    case CpuType::kArmNeoverseV2:
+      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+         1, 2, CutoffStrategy::Unroll64CRC>();
 #if defined(__aarch64__)
     default:
       // Not all ARM processors support the needed instructions, so check here
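For reference, the AArch64 detection added in cpu_detect.cc above reduces to decoding MIDR_EL1 into an implementer byte and a part number. The sketch below mirrors those bit fields and ID values from the patch; CpuKind and DecodeMidr are illustrative names only, not Abseil API, and actually reading MIDR_EL1 on Linux goes through the MRS-based ABSL_INTERNAL_AARCH64_ID_REG_READ macro once HWCAP_CPUID is reported.

```cpp
#include <cstdint>

// Illustrative stand-ins for the CpuType values used in the patch.
enum class CpuKind {
  kUnknown,
  kArmNeoverseN1,
  kArmNeoverseV1,
  kArmNeoverseN2,
  kArmNeoverseV2,
  kAmpereSiryn,
};

// Decode a raw MIDR_EL1 value: the implementer lives in bits [31:24],
// the part number in bits [15:4].
CpuKind DecodeMidr(uint64_t midr) {
  uint32_t implementer = (midr >> 24) & 0xff;
  uint32_t part_number = (midr >> 4) & 0xfff;
  switch (implementer) {
    case 0x41:  // Arm Ltd.
      switch (part_number) {
        case 0xd0c: return CpuKind::kArmNeoverseN1;
        case 0xd40: return CpuKind::kArmNeoverseV1;
        case 0xd49: return CpuKind::kArmNeoverseN2;
        case 0xd4f: return CpuKind::kArmNeoverseV2;
        default:    return CpuKind::kUnknown;
      }
    case 0xc0:  // Ampere Computing
      return part_number == 0xac3 ? CpuKind::kAmpereSiryn : CpuKind::kUnknown;
    default:
      return CpuKind::kUnknown;
  }
}
```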