diff options
author | f0b0s <f0b0s@yandex-team.ru> | 2022-02-10 16:46:51 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:51 +0300 |
commit | cdae02d225fb5b3afbb28990e79a7ac6c9125327 (patch) | |
tree | 49e222ea1c5804306084bb3ae065bb702625360f /contrib/libs/crcutil/crc32c_sse4.cc | |
parent | deabc5260ac2e17b8f5152ee060bec1740613540 (diff) | |
download | ydb-cdae02d225fb5b3afbb28990e79a7ac6c9125327.tar.gz |
Restoring authorship annotation for <f0b0s@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/crcutil/crc32c_sse4.cc')
-rw-r--r-- | contrib/libs/crcutil/crc32c_sse4.cc | 726 |
1 files changed, 363 insertions, 363 deletions
diff --git a/contrib/libs/crcutil/crc32c_sse4.cc b/contrib/libs/crcutil/crc32c_sse4.cc index 9875ab4ff2..ed0f64410a 100644 --- a/contrib/libs/crcutil/crc32c_sse4.cc +++ b/contrib/libs/crcutil/crc32c_sse4.cc @@ -1,369 +1,369 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements CRC32C using Intel's SSE4 crc32 instruction. -// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero, -// emilates intrinsics via CRC_WORD/CRC_BYTE otherwise. - -#include "crc32c_sse4.h" - +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements CRC32C using Intel's SSE4 crc32 instruction. +// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero, +// emilates intrinsics via CRC_WORD/CRC_BYTE otherwise. + +#include "crc32c_sse4.h" + #include <util/system/compiler.h> -#if HAVE_I386 || HAVE_AMD64 - -namespace crcutil { - -#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \ - CRC_UPDATE_WORD(crc0, \ - reinterpret_cast<const size_t *>(src + \ - 0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ - CRC_UPDATE_WORD(crc1, \ - reinterpret_cast<const size_t *>(src + \ - 1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ - CRC_UPDATE_WORD(crc2, \ - reinterpret_cast<const size_t *>(src + \ - 2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ - if (num_stripes > 3) { \ - CRC_UPDATE_WORD(crc3, \ - reinterpret_cast<const size_t *>(src + \ - 3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ - } \ -} while (0) - -// Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size)" -// using appropriate multiplication table(s). -// -#if 0 - -// This variant is for illustration purposes only. -// Actual implementation below: -// 1. Splits the computation into 2 data-independent paths -// by independently multiplying lower and upper halves -// of "crc0" in interleaved manner, and combining the -// results in the end. -// 2. Removing redundant "crc0 = 0" etc. in the beginning. -// 3. Removing redundant shifts of "tmp0" and "tmp1" in the last round. -#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ - size_t tmp0 = crc0; \ - crc0 = 0; \ - for (size_t i = 0; i < kNumTables; ++i) { \ - crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [i][tmp0 & (kTableEntries - 1)]; \ - tmp0 >>= kTableEntryBits; \ - } \ -} while (0) - -#else - -#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ - size_t tmp0 = crc0; \ - size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \ - crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [0][tmp0 & (kTableEntries - 1)]; \ - tmp0 >>= kTableEntryBits; \ - size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ - tmp1 >>= kTableEntryBits; \ - for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \ - crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [i][tmp0 & (kTableEntries - 1)]; \ - tmp0 >>= kTableEntryBits; \ - crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ - tmp1 >>= kTableEntryBits; \ - } \ - crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \ - if (kNumTables & 1) { \ - tmp0 >>= kTableEntryBits; \ - } \ - crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [kNumTables - 1][tmp1]; \ - if (kNumTables & 1) { \ - crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \ - } \ - crc0 ^= crc1; \ -} while (0) - -#endif - -// Given CRCs (crc0, crc1, etc.) of consequitive -// stripes of STRIPE_SIZE(block_size) bytes each, -// produces CRC of concatenated stripes. -#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \ - MULTIPLY_CRC(crc0, block_size, num_stripes); \ - crc0 ^= crc1; \ - MULTIPLY_CRC(crc0, block_size, num_stripes); \ - crc0 ^= crc2; \ - if (num_stripes > 3) { \ - MULTIPLY_CRC(crc0, block_size, num_stripes); \ - crc0 ^= crc3; \ - } \ -} while (0) - -// Processes input BLOCK_SIZE(block) bytes per iteration -// by splitting a block of BLOCK_SIZE(block) bytes into N -// equally-sized stripes of STRIPE_SIZE(block_size) each, -// computing CRC of each stripe, and concatenating stripe CRCs. -#define PROCESS_BLOCK(block_size, num_stripes) do { \ - while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ - Crc crc1 = 0; \ - Crc crc2 = 0; \ - Crc crc3; \ - if (num_stripes > 3) crc3 = 0; \ - { \ - const uint8 *stripe_end = src + \ - (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \ - kUnrolledLoopBytes) * kUnrolledLoopBytes; \ - do { \ - UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \ - src += kUnrolledLoopBytes; \ - } while (src < stripe_end); \ - if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ - kUnrolledLoopBytes) != 0) { \ - stripe_end += \ - CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ - kUnrolledLoopBytes; \ - do { \ - UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ - src += sizeof(size_t); \ - } while (src < stripe_end); \ - } \ - } \ - COMBINE_STRIPE_CRCS(block_size, num_stripes); \ - src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \ - ((num_stripes) - 1); \ - bytes = static_cast<size_t>(end - src); \ - } \ - no_more_##block_size##_##num_stripes:; \ -} while (0) - +#if HAVE_I386 || HAVE_AMD64 + +namespace crcutil { + +#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \ + CRC_UPDATE_WORD(crc0, \ + reinterpret_cast<const size_t *>(src + \ + 0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ + CRC_UPDATE_WORD(crc1, \ + reinterpret_cast<const size_t *>(src + \ + 1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ + CRC_UPDATE_WORD(crc2, \ + reinterpret_cast<const size_t *>(src + \ + 2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ + if (num_stripes > 3) { \ + CRC_UPDATE_WORD(crc3, \ + reinterpret_cast<const size_t *>(src + \ + 3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ + } \ +} while (0) + +// Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size)" +// using appropriate multiplication table(s). +// +#if 0 + +// This variant is for illustration purposes only. +// Actual implementation below: +// 1. Splits the computation into 2 data-independent paths +// by independently multiplying lower and upper halves +// of "crc0" in interleaved manner, and combining the +// results in the end. +// 2. Removing redundant "crc0 = 0" etc. in the beginning. +// 3. Removing redundant shifts of "tmp0" and "tmp1" in the last round. +#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ + size_t tmp0 = crc0; \ + crc0 = 0; \ + for (size_t i = 0; i < kNumTables; ++i) { \ + crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [i][tmp0 & (kTableEntries - 1)]; \ + tmp0 >>= kTableEntryBits; \ + } \ +} while (0) + +#else + +#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ + size_t tmp0 = crc0; \ + size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \ + crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [0][tmp0 & (kTableEntries - 1)]; \ + tmp0 >>= kTableEntryBits; \ + size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ + tmp1 >>= kTableEntryBits; \ + for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \ + crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [i][tmp0 & (kTableEntries - 1)]; \ + tmp0 >>= kTableEntryBits; \ + crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ + tmp1 >>= kTableEntryBits; \ + } \ + crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \ + if (kNumTables & 1) { \ + tmp0 >>= kTableEntryBits; \ + } \ + crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [kNumTables - 1][tmp1]; \ + if (kNumTables & 1) { \ + crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \ + } \ + crc0 ^= crc1; \ +} while (0) + +#endif + +// Given CRCs (crc0, crc1, etc.) of consequitive +// stripes of STRIPE_SIZE(block_size) bytes each, +// produces CRC of concatenated stripes. +#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \ + MULTIPLY_CRC(crc0, block_size, num_stripes); \ + crc0 ^= crc1; \ + MULTIPLY_CRC(crc0, block_size, num_stripes); \ + crc0 ^= crc2; \ + if (num_stripes > 3) { \ + MULTIPLY_CRC(crc0, block_size, num_stripes); \ + crc0 ^= crc3; \ + } \ +} while (0) + +// Processes input BLOCK_SIZE(block) bytes per iteration +// by splitting a block of BLOCK_SIZE(block) bytes into N +// equally-sized stripes of STRIPE_SIZE(block_size) each, +// computing CRC of each stripe, and concatenating stripe CRCs. +#define PROCESS_BLOCK(block_size, num_stripes) do { \ + while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ + Crc crc1 = 0; \ + Crc crc2 = 0; \ + Crc crc3; \ + if (num_stripes > 3) crc3 = 0; \ + { \ + const uint8 *stripe_end = src + \ + (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \ + kUnrolledLoopBytes) * kUnrolledLoopBytes; \ + do { \ + UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \ + src += kUnrolledLoopBytes; \ + } while (src < stripe_end); \ + if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ + kUnrolledLoopBytes) != 0) { \ + stripe_end += \ + CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ + kUnrolledLoopBytes; \ + do { \ + UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ + src += sizeof(size_t); \ + } while (src < stripe_end); \ + } \ + } \ + COMBINE_STRIPE_CRCS(block_size, num_stripes); \ + src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \ + ((num_stripes) - 1); \ + bytes = static_cast<size_t>(end - src); \ + } \ + no_more_##block_size##_##num_stripes:; \ +} while (0) + Y_NO_SANITIZE("undefined") -size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const { - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - crc0 ^= Base().Canonize(); - - // If we don't have too much data to process, - // do not waste time trying to align input etc. - // Noticeably improves performance on small inputs. - if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t; - if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t; - if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t; - -#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \ - if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ - goto no_more_##block_size##_##num_stripes; \ - } \ -} while (0) -#define NOOP(block_size, num_stripes) - - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL, - NOOP, - NOOP); - -#undef PROCESS_TAIL_IF_SMALL - - - // Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because: - // 1. It uses CRC_BYTE() which won't work. - // 2. Its threshold may be incorrect becuase Crc32 that uses - // native CPU crc32 instruction is much faster than - // generic table-based CRC computation. - // - // In case of X5550 CPU, break even point is at 2KB -- exactly. - if (bytes >= 2 * 1024) { - while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { - if (src >= end) { - return (crc0 ^ Base().Canonize()); - } - CRC_UPDATE_BYTE(crc0, src[0]); - src += 1; - } - bytes = static_cast<size_t>(end - src); - } - if (src >= end) { - return (crc0 ^ Base().Canonize()); - } - - // Quickly skip processing of too large blocks - // Noticeably improves performance on small inputs. -#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \ - if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ - goto no_more_##block_size##_##num_stripes; \ - } \ -} while (0) - - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP, - SKIP_BLOCK_IF_NEEDED, - SKIP_BLOCK_IF_NEEDED); - -#undef SKIP_BLOCK_IF_NEEDED - - // Process data in all blocks. - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK, - PROCESS_BLOCK, - PROCESS_BLOCK); - - // Finish the tail word-by-word and then byte-by-byte. -#define CRC_UPDATE_WORD_4(index) do { \ - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \ - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \ - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \ - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \ -} while (0) - - if (bytes >= 4 * 4 * sizeof(size_t)) { - end -= 4 * 4 * sizeof(size_t); - do { - CRC_UPDATE_WORD_4(4 * 0); - CRC_UPDATE_WORD_4(4 * 1); - CRC_UPDATE_WORD_4(4 * 2); - CRC_UPDATE_WORD_4(4 * 3); - src += 4 * 4 * sizeof(size_t); - } while (src <= end); - end += 4 * 4 * sizeof(size_t); - bytes = static_cast<size_t>(end - src); - } - less_than_16_size_t: - - if (bytes >= 4 * 2 * sizeof(size_t)) { - CRC_UPDATE_WORD_4(4 * 0); - CRC_UPDATE_WORD_4(4 * 1); - src += 4 * 2 * sizeof(size_t); - bytes -= 4 * 2 * sizeof(size_t); - } - less_than_8_size_t: - - if (bytes >= 4 * sizeof(size_t)) { - CRC_UPDATE_WORD_4(0); - src += 4 * sizeof(size_t); - bytes -= 4 * sizeof(size_t); - } - less_than_4_size_t: - - if (bytes >= 1 * sizeof(size_t)) { - end -= 1 * sizeof(size_t); - do { - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]); - src += 1 * sizeof(size_t); - } while (src <= end); - end += 1 * sizeof(size_t); - } - - while (src < end) { - CRC_UPDATE_BYTE(crc0, src[0]); - src += 1; - } - - return (crc0 ^ Base().Canonize()); -} - - -void Crc32cSSE4::Init(bool constant) { - base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant); - -#define INIT_MUL_TABLE(block_size, num_stripes) do { \ - size_t multiplier = \ - Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \ - for (size_t table = 0; table < kNumTables; ++table) { \ - for (size_t entry = 0; entry < kTableEntries; ++entry) { \ - size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \ - CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \ - static_cast<Entry>(Base().Multiply(value, multiplier)); \ - } \ - } \ -} while (0) - - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE); - -#undef INIT_MUL_TABLE - -#if !CRCUTIL_USE_MM_CRC32 - for (size_t j = 0; j < sizeof(Word); ++j) { - Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32); - for (size_t i = 0; i < 256; ++i) { - crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k); - } - } -#endif // !CRCUTIL_USE_MM_CRC32 -} - - -bool Crc32cSSE4::IsSSE42Available() { -#if defined(_MSC_VER) - int cpu_info[4]; - __cpuid(cpu_info, 1); +size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const { + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + crc0 ^= Base().Canonize(); + + // If we don't have too much data to process, + // do not waste time trying to align input etc. + // Noticeably improves performance on small inputs. + if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t; + if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t; + if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t; + +#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \ + if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ + goto no_more_##block_size##_##num_stripes; \ + } \ +} while (0) +#define NOOP(block_size, num_stripes) + + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL, + NOOP, + NOOP); + +#undef PROCESS_TAIL_IF_SMALL + + + // Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because: + // 1. It uses CRC_BYTE() which won't work. + // 2. Its threshold may be incorrect becuase Crc32 that uses + // native CPU crc32 instruction is much faster than + // generic table-based CRC computation. + // + // In case of X5550 CPU, break even point is at 2KB -- exactly. + if (bytes >= 2 * 1024) { + while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { + if (src >= end) { + return (crc0 ^ Base().Canonize()); + } + CRC_UPDATE_BYTE(crc0, src[0]); + src += 1; + } + bytes = static_cast<size_t>(end - src); + } + if (src >= end) { + return (crc0 ^ Base().Canonize()); + } + + // Quickly skip processing of too large blocks + // Noticeably improves performance on small inputs. +#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \ + if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ + goto no_more_##block_size##_##num_stripes; \ + } \ +} while (0) + + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP, + SKIP_BLOCK_IF_NEEDED, + SKIP_BLOCK_IF_NEEDED); + +#undef SKIP_BLOCK_IF_NEEDED + + // Process data in all blocks. + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK, + PROCESS_BLOCK, + PROCESS_BLOCK); + + // Finish the tail word-by-word and then byte-by-byte. +#define CRC_UPDATE_WORD_4(index) do { \ + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \ + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \ + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \ + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \ +} while (0) + + if (bytes >= 4 * 4 * sizeof(size_t)) { + end -= 4 * 4 * sizeof(size_t); + do { + CRC_UPDATE_WORD_4(4 * 0); + CRC_UPDATE_WORD_4(4 * 1); + CRC_UPDATE_WORD_4(4 * 2); + CRC_UPDATE_WORD_4(4 * 3); + src += 4 * 4 * sizeof(size_t); + } while (src <= end); + end += 4 * 4 * sizeof(size_t); + bytes = static_cast<size_t>(end - src); + } + less_than_16_size_t: + + if (bytes >= 4 * 2 * sizeof(size_t)) { + CRC_UPDATE_WORD_4(4 * 0); + CRC_UPDATE_WORD_4(4 * 1); + src += 4 * 2 * sizeof(size_t); + bytes -= 4 * 2 * sizeof(size_t); + } + less_than_8_size_t: + + if (bytes >= 4 * sizeof(size_t)) { + CRC_UPDATE_WORD_4(0); + src += 4 * sizeof(size_t); + bytes -= 4 * sizeof(size_t); + } + less_than_4_size_t: + + if (bytes >= 1 * sizeof(size_t)) { + end -= 1 * sizeof(size_t); + do { + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]); + src += 1 * sizeof(size_t); + } while (src <= end); + end += 1 * sizeof(size_t); + } + + while (src < end) { + CRC_UPDATE_BYTE(crc0, src[0]); + src += 1; + } + + return (crc0 ^ Base().Canonize()); +} + + +void Crc32cSSE4::Init(bool constant) { + base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant); + +#define INIT_MUL_TABLE(block_size, num_stripes) do { \ + size_t multiplier = \ + Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \ + for (size_t table = 0; table < kNumTables; ++table) { \ + for (size_t entry = 0; entry < kTableEntries; ++entry) { \ + size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \ + CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \ + static_cast<Entry>(Base().Multiply(value, multiplier)); \ + } \ + } \ +} while (0) + + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE); + +#undef INIT_MUL_TABLE + +#if !CRCUTIL_USE_MM_CRC32 + for (size_t j = 0; j < sizeof(Word); ++j) { + Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32); + for (size_t i = 0; i < 256; ++i) { + crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k); + } + } +#endif // !CRCUTIL_USE_MM_CRC32 +} + + +bool Crc32cSSE4::IsSSE42Available() { +#if defined(_MSC_VER) + int cpu_info[4]; + __cpuid(cpu_info, 1); return ((cpu_info[2] & (1 << 20)) != 0); -#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386) - // Not using "cpuid.h" intentionally: it is missing from - // too many installations. - uint32 eax; - uint32 ecx; - uint32 edx; - __asm__ volatile( -#if HAVE_I386 && defined(__PIC__) +#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386) + // Not using "cpuid.h" intentionally: it is missing from + // too many installations. + uint32 eax; + uint32 ecx; + uint32 edx; + __asm__ volatile( +#if HAVE_I386 && defined(__PIC__) "push %%ebx\n" - "cpuid\n" + "cpuid\n" "pop %%ebx\n" -#else - "cpuid\n" -#endif // HAVE_I386 && defined(__PIC__) - : "=a" (eax), "=c" (ecx), "=d" (edx) - : "a" (1), "2" (0) - : "%ebx" - ); - return ((ecx & (1 << 20)) != 0); -#else - return false; -#endif -} - - -void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc, - size_t roll_window_bytes, - const Crc &start_value) { - crc_ = &crc; - roll_window_bytes_ = roll_window_bytes; - start_value_ = start_value; - - Crc add = crc.Base().Canonize() ^ start_value; - add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes)); - add ^= crc.Base().Canonize(); - Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1); - add = crc.Base().Multiply(add, mul); - - mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree()); - for (size_t i = 0; i < 256; ++i) { - out_[i] = static_cast<Entry>( - crc.Base().MultiplyUnnormalized( - static_cast<Crc>(i), 8, mul) ^ add); - } - -#if !CRCUTIL_USE_MM_CRC32 - memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_)); -#endif // !CRCUTIL_USE_MM_CRC32 -} - -} // namespace crcutil - -#endif // HAVE_I386 || HAVE_AMD64 +#else + "cpuid\n" +#endif // HAVE_I386 && defined(__PIC__) + : "=a" (eax), "=c" (ecx), "=d" (edx) + : "a" (1), "2" (0) + : "%ebx" + ); + return ((ecx & (1 << 20)) != 0); +#else + return false; +#endif +} + + +void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc, + size_t roll_window_bytes, + const Crc &start_value) { + crc_ = &crc; + roll_window_bytes_ = roll_window_bytes; + start_value_ = start_value; + + Crc add = crc.Base().Canonize() ^ start_value; + add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes)); + add ^= crc.Base().Canonize(); + Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1); + add = crc.Base().Multiply(add, mul); + + mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree()); + for (size_t i = 0; i < 256; ++i) { + out_[i] = static_cast<Entry>( + crc.Base().MultiplyUnnormalized( + static_cast<Crc>(i), 8, mul) ^ add); + } + +#if !CRCUTIL_USE_MM_CRC32 + memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_)); +#endif // !CRCUTIL_USE_MM_CRC32 +} + +} // namespace crcutil + +#endif // HAVE_I386 || HAVE_AMD64 |