// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements CRC32C using Intel's SSE4 crc32 instruction.
// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero,
// emilates intrinsics via CRC_WORD/CRC_BYTE otherwise.
#include "crc32c_sse4.h"
#include <util/system/compiler.h>
#if HAVE_I386 || HAVE_AMD64
namespace crcutil {
#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \
CRC_UPDATE_WORD(crc0, \
reinterpret_cast<const size_t *>(src + \
0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
CRC_UPDATE_WORD(crc1, \
reinterpret_cast<const size_t *>(src + \
1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
CRC_UPDATE_WORD(crc2, \
reinterpret_cast<const size_t *>(src + \
2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
if (num_stripes > 3) { \
CRC_UPDATE_WORD(crc3, \
reinterpret_cast<const size_t *>(src + \
3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
} \
} while (0)
// Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size)"
// using appropriate multiplication table(s).
//
#if 0
// This variant is for illustration purposes only.
// Actual implementation below:
// 1. Splits the computation into 2 data-independent paths
// by independently multiplying lower and upper halves
// of "crc0" in interleaved manner, and combining the
// results in the end.
// 2. Removing redundant "crc0 = 0" etc. in the beginning.
// 3. Removing redundant shifts of "tmp0" and "tmp1" in the last round.
#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
size_t tmp0 = crc0; \
crc0 = 0; \
for (size_t i = 0; i < kNumTables; ++i) { \
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[i][tmp0 & (kTableEntries - 1)]; \
tmp0 >>= kTableEntryBits; \
} \
} while (0)
#else
#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
size_t tmp0 = crc0; \
size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \
crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[0][tmp0 & (kTableEntries - 1)]; \
tmp0 >>= kTableEntryBits; \
size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
tmp1 >>= kTableEntryBits; \
for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[i][tmp0 & (kTableEntries - 1)]; \
tmp0 >>= kTableEntryBits; \
crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
tmp1 >>= kTableEntryBits; \
} \
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \
if (kNumTables & 1) { \
tmp0 >>= kTableEntryBits; \
} \
crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[kNumTables - 1][tmp1]; \
if (kNumTables & 1) { \
crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
[kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \
} \
crc0 ^= crc1; \
} while (0)
#endif
// Given CRCs (crc0, crc1, etc.) of consequitive
// stripes of STRIPE_SIZE(block_size) bytes each,
// produces CRC of concatenated stripes.
#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \
MULTIPLY_CRC(crc0, block_size, num_stripes); \
crc0 ^= crc1; \
MULTIPLY_CRC(crc0, block_size, num_stripes); \
crc0 ^= crc2; \
if (num_stripes > 3) { \
MULTIPLY_CRC(crc0, block_size, num_stripes); \
crc0 ^= crc3; \
} \
} while (0)
// Processes input BLOCK_SIZE(block) bytes per iteration
// by splitting a block of BLOCK_SIZE(block) bytes into N
// equally-sized stripes of STRIPE_SIZE(block_size) each,
// computing CRC of each stripe, and concatenating stripe CRCs.
#define PROCESS_BLOCK(block_size, num_stripes) do { \
while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
Crc crc1 = 0; \
Crc crc2 = 0; \
Crc crc3; \
if (num_stripes > 3) crc3 = 0; \
{ \
const uint8 *stripe_end = src + \
(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \
kUnrolledLoopBytes) * kUnrolledLoopBytes; \
do { \
UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \
UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \
src += kUnrolledLoopBytes; \
} while (src < stripe_end); \
if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
kUnrolledLoopBytes) != 0) { \
stripe_end += \
CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
kUnrolledLoopBytes; \
do { \
UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
src += sizeof(size_t); \
} while (src < stripe_end); \
} \
} \
COMBINE_STRIPE_CRCS(block_size, num_stripes); \
src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \
((num_stripes) - 1); \
bytes = static_cast<size_t>(end - src); \
} \
no_more_##block_size##_##num_stripes:; \
} while (0)
Y_NO_SANITIZE("undefined")
size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const {
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
crc0 ^= Base().Canonize();
// If we don't have too much data to process,
// do not waste time trying to align input etc.
// Noticeably improves performance on small inputs.
if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t;
if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t;
if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t;
#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \
if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
goto no_more_##block_size##_##num_stripes; \
} \
} while (0)
#define NOOP(block_size, num_stripes)
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL,
NOOP,
NOOP);
#undef PROCESS_TAIL_IF_SMALL
// Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because:
// 1. It uses CRC_BYTE() which won't work.
// 2. Its threshold may be incorrect becuase Crc32 that uses
// native CPU crc32 instruction is much faster than
// generic table-based CRC computation.
//
// In case of X5550 CPU, break even point is at 2KB -- exactly.
if (bytes >= 2 * 1024) {
while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) {
if (src >= end) {
return (crc0 ^ Base().Canonize());
}
CRC_UPDATE_BYTE(crc0, src[0]);
src += 1;
}
bytes = static_cast<size_t>(end - src);
}
if (src >= end) {
return (crc0 ^ Base().Canonize());
}
// Quickly skip processing of too large blocks
// Noticeably improves performance on small inputs.
#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \
if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
goto no_more_##block_size##_##num_stripes; \
} \
} while (0)
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP,
SKIP_BLOCK_IF_NEEDED,
SKIP_BLOCK_IF_NEEDED);
#undef SKIP_BLOCK_IF_NEEDED
// Process data in all blocks.
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK,
PROCESS_BLOCK,
PROCESS_BLOCK);
// Finish the tail word-by-word and then byte-by-byte.
#define CRC_UPDATE_WORD_4(index) do { \
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \
} while (0)
if (bytes >= 4 * 4 * sizeof(size_t)) {
end -= 4 * 4 * sizeof(size_t);
do {
CRC_UPDATE_WORD_4(4 * 0);
CRC_UPDATE_WORD_4(4 * 1);
CRC_UPDATE_WORD_4(4 * 2);
CRC_UPDATE_WORD_4(4 * 3);
src += 4 * 4 * sizeof(size_t);
} while (src <= end);
end += 4 * 4 * sizeof(size_t);
bytes = static_cast<size_t>(end - src);
}
less_than_16_size_t:
if (bytes >= 4 * 2 * sizeof(size_t)) {
CRC_UPDATE_WORD_4(4 * 0);
CRC_UPDATE_WORD_4(4 * 1);
src += 4 * 2 * sizeof(size_t);
bytes -= 4 * 2 * sizeof(size_t);
}
less_than_8_size_t:
if (bytes >= 4 * sizeof(size_t)) {
CRC_UPDATE_WORD_4(0);
src += 4 * sizeof(size_t);
bytes -= 4 * sizeof(size_t);
}
less_than_4_size_t:
if (bytes >= 1 * sizeof(size_t)) {
end -= 1 * sizeof(size_t);
do {
CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]);
src += 1 * sizeof(size_t);
} while (src <= end);
end += 1 * sizeof(size_t);
}
while (src < end) {
CRC_UPDATE_BYTE(crc0, src[0]);
src += 1;
}
return (crc0 ^ Base().Canonize());
}
void Crc32cSSE4::Init(bool constant) {
base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant);
#define INIT_MUL_TABLE(block_size, num_stripes) do { \
size_t multiplier = \
Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \
for (size_t table = 0; table < kNumTables; ++table) { \
for (size_t entry = 0; entry < kTableEntries; ++entry) { \
size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \
CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \
static_cast<Entry>(Base().Multiply(value, multiplier)); \
} \
} \
} while (0)
CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE);
#undef INIT_MUL_TABLE
#if !CRCUTIL_USE_MM_CRC32
for (size_t j = 0; j < sizeof(Word); ++j) {
Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32);
for (size_t i = 0; i < 256; ++i) {
crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k);
}
}
#endif // !CRCUTIL_USE_MM_CRC32
}
bool Crc32cSSE4::IsSSE42Available() {
#if defined(_MSC_VER)
int cpu_info[4];
__cpuid(cpu_info, 1);
return ((cpu_info[2] & (1 << 20)) != 0);
#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386)
// Not using "cpuid.h" intentionally: it is missing from
// too many installations.
uint32 eax;
uint32 ecx;
uint32 edx;
__asm__ volatile(
#if HAVE_I386 && defined(__PIC__)
"push %%ebx\n"
"cpuid\n"
"pop %%ebx\n"
#else
"cpuid\n"
#endif // HAVE_I386 && defined(__PIC__)
: "=a" (eax), "=c" (ecx), "=d" (edx)
: "a" (1), "2" (0)
: "%ebx"
);
return ((ecx & (1 << 20)) != 0);
#else
return false;
#endif
}
void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc,
size_t roll_window_bytes,
const Crc &start_value) {
crc_ = &crc;
roll_window_bytes_ = roll_window_bytes;
start_value_ = start_value;
Crc add = crc.Base().Canonize() ^ start_value;
add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes));
add ^= crc.Base().Canonize();
Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1);
add = crc.Base().Multiply(add, mul);
mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree());
for (size_t i = 0; i < 256; ++i) {
out_[i] = static_cast<Entry>(
crc.Base().MultiplyUnnormalized(
static_cast<Crc>(i), 8, mul) ^ add);
}
#if !CRCUTIL_USE_MM_CRC32
memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_));
#endif // !CRCUTIL_USE_MM_CRC32
}
} // namespace crcutil
#endif // HAVE_I386 || HAVE_AMD64