diff options
author | f0b0s <f0b0s@yandex-team.ru> | 2022-02-10 16:46:51 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:51 +0300 |
commit | deabc5260ac2e17b8f5152ee060bec1740613540 (patch) | |
tree | bc498b2fe3c447d13c2abea85b429fee8dd485ef /contrib/libs/crcutil | |
parent | 2e6009493e74f88988b81f219b301f450331648d (diff) | |
download | ydb-deabc5260ac2e17b8f5152ee060bec1740613540.tar.gz |
Restoring authorship annotation for <f0b0s@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/crcutil')
23 files changed, 4785 insertions, 4785 deletions
diff --git a/contrib/libs/crcutil/aligned_alloc.h b/contrib/libs/crcutil/aligned_alloc.h index 37cefbacd9..57bdc91d8a 100644 --- a/contrib/libs/crcutil/aligned_alloc.h +++ b/contrib/libs/crcutil/aligned_alloc.h @@ -1,66 +1,66 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Poor man's platform-independent implementation of aligned memory allocator. - -#ifndef CRCUTIL_ALIGNED_ALLOC_H_ -#define CRCUTIL_ALIGNED_ALLOC_H_ - -#include "std_headers.h" // size_t, ptrdiff_t - -namespace crcutil { - -// Allocates a block of memory of "size" bytes so that a field -// at "field_offset" is aligned on "align" boundary. -// -// NB #1: "align" shall be exact power of two. -// -// NB #2: memory allocated by AlignedAlloc should be release by AlignedFree(). -// -inline void *AlignedAlloc(size_t size, - size_t field_offset, - size_t align, - const void **allocated_mem) { - if (align == 0 || (align & (align - 1)) != 0 || align < sizeof(char *)) { - align = sizeof(*allocated_mem); - } - size += align - 1 + sizeof(*allocated_mem); - char *allocated_memory = new char[size]; - char *aligned_memory = allocated_memory + sizeof(*allocated_mem); - field_offset &= align - 1; - size_t actual_alignment = - reinterpret_cast<size_t>(aligned_memory + field_offset) & (align - 1); - if (actual_alignment != 0) { - aligned_memory += align - actual_alignment; - } - reinterpret_cast<char **>(aligned_memory)[-1] = allocated_memory; - - if (allocated_mem != NULL) { - *allocated_mem = allocated_memory; - } - - return aligned_memory; -} - -// Frees memory allocated by AlignedAlloc(). -inline void AlignedFree(void *aligned_memory) { - if (aligned_memory != NULL) { - char *allocated_memory = reinterpret_cast<char **>(aligned_memory)[-1]; - delete[] allocated_memory; - } -} - -} // namespace crcutil - -#endif // CRCUTIL_ALIGNED_ALLOC_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Poor man's platform-independent implementation of aligned memory allocator. + +#ifndef CRCUTIL_ALIGNED_ALLOC_H_ +#define CRCUTIL_ALIGNED_ALLOC_H_ + +#include "std_headers.h" // size_t, ptrdiff_t + +namespace crcutil { + +// Allocates a block of memory of "size" bytes so that a field +// at "field_offset" is aligned on "align" boundary. +// +// NB #1: "align" shall be exact power of two. +// +// NB #2: memory allocated by AlignedAlloc should be release by AlignedFree(). +// +inline void *AlignedAlloc(size_t size, + size_t field_offset, + size_t align, + const void **allocated_mem) { + if (align == 0 || (align & (align - 1)) != 0 || align < sizeof(char *)) { + align = sizeof(*allocated_mem); + } + size += align - 1 + sizeof(*allocated_mem); + char *allocated_memory = new char[size]; + char *aligned_memory = allocated_memory + sizeof(*allocated_mem); + field_offset &= align - 1; + size_t actual_alignment = + reinterpret_cast<size_t>(aligned_memory + field_offset) & (align - 1); + if (actual_alignment != 0) { + aligned_memory += align - actual_alignment; + } + reinterpret_cast<char **>(aligned_memory)[-1] = allocated_memory; + + if (allocated_mem != NULL) { + *allocated_mem = allocated_memory; + } + + return aligned_memory; +} + +// Frees memory allocated by AlignedAlloc(). +inline void AlignedFree(void *aligned_memory) { + if (aligned_memory != NULL) { + char *allocated_memory = reinterpret_cast<char **>(aligned_memory)[-1]; + delete[] allocated_memory; + } +} + +} // namespace crcutil + +#endif // CRCUTIL_ALIGNED_ALLOC_H_ diff --git a/contrib/libs/crcutil/base_types.h b/contrib/libs/crcutil/base_types.h index 5b743640a4..a0d9d9a093 100644 --- a/contrib/libs/crcutil/base_types.h +++ b/contrib/libs/crcutil/base_types.h @@ -1,73 +1,73 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Defines 8/16/32/64-bit integer types. -// -// Either uint64 or uint32 will map to size_t. -// This way, specialized variants of CRC implementation -// parameterized by "size_t" will be reused when -// parameterized by "uint64" or "uint32". -// In their turn, specialized verisons are parameterized -// by "size_t" so that one version of the code is optimal -// both on 32-bit and 64-bit platforms. - -#ifndef CRCUTIL_BASE_TYPES_H_ -#define CRCUTIL_BASE_TYPES_H_ - -#include "std_headers.h" // size_t, ptrdiff_t - -namespace crcutil { - -template<typename A, typename B> class ChooseFirstIfSame { - public: - template<bool same_size, typename AA, typename BB> class ChooseFirstIfTrue { - public: - typedef AA Type; - }; - template<typename AA, typename BB> class ChooseFirstIfTrue<false, AA, BB> { - public: - typedef BB Type; - }; - - typedef typename ChooseFirstIfTrue<sizeof(A) == sizeof(B), A, B>::Type Type; -}; - -typedef unsigned char uint8; -typedef signed char int8; - -typedef unsigned short uint16; -typedef short int16; - -typedef ChooseFirstIfSame<size_t, unsigned int>::Type uint32; -typedef ChooseFirstIfSame<ptrdiff_t, int>::Type int32; - -#if defined(_MSC_VER) -typedef ChooseFirstIfSame<size_t, unsigned __int64>::Type uint64; -typedef ChooseFirstIfSame<ptrdiff_t, __int64>::Type int64; -#define HAVE_UINT64 1 -#elif defined(__GNUC__) -typedef ChooseFirstIfSame<size_t, unsigned long long>::Type uint64; -typedef ChooseFirstIfSame<ptrdiff_t, long long>::Type int64; -#define HAVE_UINT64 1 -#else -// TODO: ensure that everything compiles and works when HAVE_UINT64 is false. -// TODO: remove HAVE_UINT64 and use sizeof(uint64) instead? -#define HAVE_UINT64 0 -typedef uint32 uint64; -typedef int32 int64; -#endif - -} // namespace crcutil - -#endif // CRCUTIL_BASE_TYPES_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Defines 8/16/32/64-bit integer types. +// +// Either uint64 or uint32 will map to size_t. +// This way, specialized variants of CRC implementation +// parameterized by "size_t" will be reused when +// parameterized by "uint64" or "uint32". +// In their turn, specialized verisons are parameterized +// by "size_t" so that one version of the code is optimal +// both on 32-bit and 64-bit platforms. + +#ifndef CRCUTIL_BASE_TYPES_H_ +#define CRCUTIL_BASE_TYPES_H_ + +#include "std_headers.h" // size_t, ptrdiff_t + +namespace crcutil { + +template<typename A, typename B> class ChooseFirstIfSame { + public: + template<bool same_size, typename AA, typename BB> class ChooseFirstIfTrue { + public: + typedef AA Type; + }; + template<typename AA, typename BB> class ChooseFirstIfTrue<false, AA, BB> { + public: + typedef BB Type; + }; + + typedef typename ChooseFirstIfTrue<sizeof(A) == sizeof(B), A, B>::Type Type; +}; + +typedef unsigned char uint8; +typedef signed char int8; + +typedef unsigned short uint16; +typedef short int16; + +typedef ChooseFirstIfSame<size_t, unsigned int>::Type uint32; +typedef ChooseFirstIfSame<ptrdiff_t, int>::Type int32; + +#if defined(_MSC_VER) +typedef ChooseFirstIfSame<size_t, unsigned __int64>::Type uint64; +typedef ChooseFirstIfSame<ptrdiff_t, __int64>::Type int64; +#define HAVE_UINT64 1 +#elif defined(__GNUC__) +typedef ChooseFirstIfSame<size_t, unsigned long long>::Type uint64; +typedef ChooseFirstIfSame<ptrdiff_t, long long>::Type int64; +#define HAVE_UINT64 1 +#else +// TODO: ensure that everything compiles and works when HAVE_UINT64 is false. +// TODO: remove HAVE_UINT64 and use sizeof(uint64) instead? +#define HAVE_UINT64 0 +typedef uint32 uint64; +typedef int32 int64; +#endif + +} // namespace crcutil + +#endif // CRCUTIL_BASE_TYPES_H_ diff --git a/contrib/libs/crcutil/bob_jenkins_rng.h b/contrib/libs/crcutil/bob_jenkins_rng.h index aed178c2a7..0925af960d 100644 --- a/contrib/libs/crcutil/bob_jenkins_rng.h +++ b/contrib/libs/crcutil/bob_jenkins_rng.h @@ -1,126 +1,126 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Glorified C++ version of Bob Jenkins' random number generator. -// See http://burtleburtle.net/bob/rand/smallprng.html for more details. - -#ifndef CRCUTIL_BOB_JENKINS_RNG_H_ -#define CRCUTIL_BOB_JENKINS_RNG_H_ - -#include "base_types.h" - -#if !defined(_MSC_VER) -#define _rotl(value, bits) \ - static_cast<uint32>(((value) << (bits)) + ((value) >> (32 - (bits)))) -#define _rotl64(value, bits) \ - static_cast<uint64>(((value) << (bits)) + ((value) >> (64 - (bits)))) -#endif // !defined(_MSC_VER) - -namespace crcutil { - -#pragma pack(push, 8) - -template<typename T> class BobJenkinsRng; - -template<> class BobJenkinsRng<uint32> { - public: - typedef uint32 value; - - value Get() { - value e = a_ - _rotl(b_, 23); - a_ = b_ ^ _rotl(c_, 16); - b_ = c_ + _rotl(d_, 11); - c_ = d_ + e; - d_ = e + a_; - return (d_); - } - - void Init(value seed) { - a_ = 0xf1ea5eed; - b_ = seed; - c_ = seed; - d_ = seed; - for (size_t i = 0; i < 20; ++i) { - (void) Get(); - } - } - - explicit BobJenkinsRng(value seed) { - Init(seed); - } - - BobJenkinsRng() { - Init(0x1234567); - } - - private: - value a_; - value b_; - value c_; - value d_; -}; - - -#if HAVE_UINT64 - -template<> class BobJenkinsRng<uint64> { - public: - typedef uint64 value; - - value Get() { - value e = a_ - _rotl64(b_, 7); - a_ = b_ ^ _rotl64(c_, 13); - b_ = c_ + _rotl64(d_, 37); - c_ = d_ + e; - d_ = e + a_; - return d_; - } - - void Init(value seed) { - a_ = 0xf1ea5eed; - b_ = seed; - c_ = seed; - d_ = seed; - for (size_t i = 0; i < 20; ++i) { - (void) Get(); - } - } - - explicit BobJenkinsRng(value seed) { - Init(seed); - } - - BobJenkinsRng() { - Init(0x1234567); - } - - private: - value a_; - value b_; - value c_; - value d_; -}; - -#endif // HAVE_UINT64 - -#if !defined(_MSC_VER) -#undef _rotl -#undef _rotl64 -#endif // !defined(_MSC_VER) - -#pragma pack(pop) - -} // namespace crcutil - -#endif // CRCUTIL_BOB_JENKINS_RNG_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Glorified C++ version of Bob Jenkins' random number generator. +// See http://burtleburtle.net/bob/rand/smallprng.html for more details. + +#ifndef CRCUTIL_BOB_JENKINS_RNG_H_ +#define CRCUTIL_BOB_JENKINS_RNG_H_ + +#include "base_types.h" + +#if !defined(_MSC_VER) +#define _rotl(value, bits) \ + static_cast<uint32>(((value) << (bits)) + ((value) >> (32 - (bits)))) +#define _rotl64(value, bits) \ + static_cast<uint64>(((value) << (bits)) + ((value) >> (64 - (bits)))) +#endif // !defined(_MSC_VER) + +namespace crcutil { + +#pragma pack(push, 8) + +template<typename T> class BobJenkinsRng; + +template<> class BobJenkinsRng<uint32> { + public: + typedef uint32 value; + + value Get() { + value e = a_ - _rotl(b_, 23); + a_ = b_ ^ _rotl(c_, 16); + b_ = c_ + _rotl(d_, 11); + c_ = d_ + e; + d_ = e + a_; + return (d_); + } + + void Init(value seed) { + a_ = 0xf1ea5eed; + b_ = seed; + c_ = seed; + d_ = seed; + for (size_t i = 0; i < 20; ++i) { + (void) Get(); + } + } + + explicit BobJenkinsRng(value seed) { + Init(seed); + } + + BobJenkinsRng() { + Init(0x1234567); + } + + private: + value a_; + value b_; + value c_; + value d_; +}; + + +#if HAVE_UINT64 + +template<> class BobJenkinsRng<uint64> { + public: + typedef uint64 value; + + value Get() { + value e = a_ - _rotl64(b_, 7); + a_ = b_ ^ _rotl64(c_, 13); + b_ = c_ + _rotl64(d_, 37); + c_ = d_ + e; + d_ = e + a_; + return d_; + } + + void Init(value seed) { + a_ = 0xf1ea5eed; + b_ = seed; + c_ = seed; + d_ = seed; + for (size_t i = 0; i < 20; ++i) { + (void) Get(); + } + } + + explicit BobJenkinsRng(value seed) { + Init(seed); + } + + BobJenkinsRng() { + Init(0x1234567); + } + + private: + value a_; + value b_; + value c_; + value d_; +}; + +#endif // HAVE_UINT64 + +#if !defined(_MSC_VER) +#undef _rotl +#undef _rotl64 +#endif // !defined(_MSC_VER) + +#pragma pack(pop) + +} // namespace crcutil + +#endif // CRCUTIL_BOB_JENKINS_RNG_H_ diff --git a/contrib/libs/crcutil/crc32c_sse4.cc b/contrib/libs/crcutil/crc32c_sse4.cc index ed0f64410a..9875ab4ff2 100644 --- a/contrib/libs/crcutil/crc32c_sse4.cc +++ b/contrib/libs/crcutil/crc32c_sse4.cc @@ -1,369 +1,369 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements CRC32C using Intel's SSE4 crc32 instruction. -// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero, -// emilates intrinsics via CRC_WORD/CRC_BYTE otherwise. - -#include "crc32c_sse4.h" - +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements CRC32C using Intel's SSE4 crc32 instruction. +// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero, +// emilates intrinsics via CRC_WORD/CRC_BYTE otherwise. + +#include "crc32c_sse4.h" + #include <util/system/compiler.h> -#if HAVE_I386 || HAVE_AMD64 - -namespace crcutil { - -#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \ - CRC_UPDATE_WORD(crc0, \ - reinterpret_cast<const size_t *>(src + \ - 0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ - CRC_UPDATE_WORD(crc1, \ - reinterpret_cast<const size_t *>(src + \ - 1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ - CRC_UPDATE_WORD(crc2, \ - reinterpret_cast<const size_t *>(src + \ - 2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ - if (num_stripes > 3) { \ - CRC_UPDATE_WORD(crc3, \ - reinterpret_cast<const size_t *>(src + \ - 3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ - } \ -} while (0) - -// Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size)" -// using appropriate multiplication table(s). -// -#if 0 - -// This variant is for illustration purposes only. -// Actual implementation below: -// 1. Splits the computation into 2 data-independent paths -// by independently multiplying lower and upper halves -// of "crc0" in interleaved manner, and combining the -// results in the end. -// 2. Removing redundant "crc0 = 0" etc. in the beginning. -// 3. Removing redundant shifts of "tmp0" and "tmp1" in the last round. -#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ - size_t tmp0 = crc0; \ - crc0 = 0; \ - for (size_t i = 0; i < kNumTables; ++i) { \ - crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [i][tmp0 & (kTableEntries - 1)]; \ - tmp0 >>= kTableEntryBits; \ - } \ -} while (0) - -#else - -#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ - size_t tmp0 = crc0; \ - size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \ - crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [0][tmp0 & (kTableEntries - 1)]; \ - tmp0 >>= kTableEntryBits; \ - size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ - tmp1 >>= kTableEntryBits; \ - for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \ - crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [i][tmp0 & (kTableEntries - 1)]; \ - tmp0 >>= kTableEntryBits; \ - crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ - tmp1 >>= kTableEntryBits; \ - } \ - crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \ - if (kNumTables & 1) { \ - tmp0 >>= kTableEntryBits; \ - } \ - crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [kNumTables - 1][tmp1]; \ - if (kNumTables & 1) { \ - crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \ - } \ - crc0 ^= crc1; \ -} while (0) - -#endif - -// Given CRCs (crc0, crc1, etc.) of consequitive -// stripes of STRIPE_SIZE(block_size) bytes each, -// produces CRC of concatenated stripes. -#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \ - MULTIPLY_CRC(crc0, block_size, num_stripes); \ - crc0 ^= crc1; \ - MULTIPLY_CRC(crc0, block_size, num_stripes); \ - crc0 ^= crc2; \ - if (num_stripes > 3) { \ - MULTIPLY_CRC(crc0, block_size, num_stripes); \ - crc0 ^= crc3; \ - } \ -} while (0) - -// Processes input BLOCK_SIZE(block) bytes per iteration -// by splitting a block of BLOCK_SIZE(block) bytes into N -// equally-sized stripes of STRIPE_SIZE(block_size) each, -// computing CRC of each stripe, and concatenating stripe CRCs. -#define PROCESS_BLOCK(block_size, num_stripes) do { \ - while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ - Crc crc1 = 0; \ - Crc crc2 = 0; \ - Crc crc3; \ - if (num_stripes > 3) crc3 = 0; \ - { \ - const uint8 *stripe_end = src + \ - (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \ - kUnrolledLoopBytes) * kUnrolledLoopBytes; \ - do { \ - UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \ - UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \ - src += kUnrolledLoopBytes; \ - } while (src < stripe_end); \ - if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ - kUnrolledLoopBytes) != 0) { \ - stripe_end += \ - CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ - kUnrolledLoopBytes; \ - do { \ - UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ - src += sizeof(size_t); \ - } while (src < stripe_end); \ - } \ - } \ - COMBINE_STRIPE_CRCS(block_size, num_stripes); \ - src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \ - ((num_stripes) - 1); \ - bytes = static_cast<size_t>(end - src); \ - } \ - no_more_##block_size##_##num_stripes:; \ -} while (0) - +#if HAVE_I386 || HAVE_AMD64 + +namespace crcutil { + +#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \ + CRC_UPDATE_WORD(crc0, \ + reinterpret_cast<const size_t *>(src + \ + 0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ + CRC_UPDATE_WORD(crc1, \ + reinterpret_cast<const size_t *>(src + \ + 1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ + CRC_UPDATE_WORD(crc2, \ + reinterpret_cast<const size_t *>(src + \ + 2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ + if (num_stripes > 3) { \ + CRC_UPDATE_WORD(crc3, \ + reinterpret_cast<const size_t *>(src + \ + 3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ + } \ +} while (0) + +// Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size)" +// using appropriate multiplication table(s). +// +#if 0 + +// This variant is for illustration purposes only. +// Actual implementation below: +// 1. Splits the computation into 2 data-independent paths +// by independently multiplying lower and upper halves +// of "crc0" in interleaved manner, and combining the +// results in the end. +// 2. Removing redundant "crc0 = 0" etc. in the beginning. +// 3. Removing redundant shifts of "tmp0" and "tmp1" in the last round. +#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ + size_t tmp0 = crc0; \ + crc0 = 0; \ + for (size_t i = 0; i < kNumTables; ++i) { \ + crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [i][tmp0 & (kTableEntries - 1)]; \ + tmp0 >>= kTableEntryBits; \ + } \ +} while (0) + +#else + +#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ + size_t tmp0 = crc0; \ + size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \ + crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [0][tmp0 & (kTableEntries - 1)]; \ + tmp0 >>= kTableEntryBits; \ + size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ + tmp1 >>= kTableEntryBits; \ + for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \ + crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [i][tmp0 & (kTableEntries - 1)]; \ + tmp0 >>= kTableEntryBits; \ + crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ + tmp1 >>= kTableEntryBits; \ + } \ + crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \ + if (kNumTables & 1) { \ + tmp0 >>= kTableEntryBits; \ + } \ + crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [kNumTables - 1][tmp1]; \ + if (kNumTables & 1) { \ + crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \ + } \ + crc0 ^= crc1; \ +} while (0) + +#endif + +// Given CRCs (crc0, crc1, etc.) of consequitive +// stripes of STRIPE_SIZE(block_size) bytes each, +// produces CRC of concatenated stripes. +#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \ + MULTIPLY_CRC(crc0, block_size, num_stripes); \ + crc0 ^= crc1; \ + MULTIPLY_CRC(crc0, block_size, num_stripes); \ + crc0 ^= crc2; \ + if (num_stripes > 3) { \ + MULTIPLY_CRC(crc0, block_size, num_stripes); \ + crc0 ^= crc3; \ + } \ +} while (0) + +// Processes input BLOCK_SIZE(block) bytes per iteration +// by splitting a block of BLOCK_SIZE(block) bytes into N +// equally-sized stripes of STRIPE_SIZE(block_size) each, +// computing CRC of each stripe, and concatenating stripe CRCs. +#define PROCESS_BLOCK(block_size, num_stripes) do { \ + while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ + Crc crc1 = 0; \ + Crc crc2 = 0; \ + Crc crc3; \ + if (num_stripes > 3) crc3 = 0; \ + { \ + const uint8 *stripe_end = src + \ + (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \ + kUnrolledLoopBytes) * kUnrolledLoopBytes; \ + do { \ + UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \ + UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \ + src += kUnrolledLoopBytes; \ + } while (src < stripe_end); \ + if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ + kUnrolledLoopBytes) != 0) { \ + stripe_end += \ + CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ + kUnrolledLoopBytes; \ + do { \ + UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ + src += sizeof(size_t); \ + } while (src < stripe_end); \ + } \ + } \ + COMBINE_STRIPE_CRCS(block_size, num_stripes); \ + src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \ + ((num_stripes) - 1); \ + bytes = static_cast<size_t>(end - src); \ + } \ + no_more_##block_size##_##num_stripes:; \ +} while (0) + Y_NO_SANITIZE("undefined") -size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const { - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - crc0 ^= Base().Canonize(); - - // If we don't have too much data to process, - // do not waste time trying to align input etc. - // Noticeably improves performance on small inputs. - if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t; - if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t; - if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t; - -#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \ - if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ - goto no_more_##block_size##_##num_stripes; \ - } \ -} while (0) -#define NOOP(block_size, num_stripes) - - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL, - NOOP, - NOOP); - -#undef PROCESS_TAIL_IF_SMALL - - - // Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because: - // 1. It uses CRC_BYTE() which won't work. - // 2. Its threshold may be incorrect becuase Crc32 that uses - // native CPU crc32 instruction is much faster than - // generic table-based CRC computation. - // - // In case of X5550 CPU, break even point is at 2KB -- exactly. - if (bytes >= 2 * 1024) { - while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { - if (src >= end) { - return (crc0 ^ Base().Canonize()); - } - CRC_UPDATE_BYTE(crc0, src[0]); - src += 1; - } - bytes = static_cast<size_t>(end - src); - } - if (src >= end) { - return (crc0 ^ Base().Canonize()); - } - - // Quickly skip processing of too large blocks - // Noticeably improves performance on small inputs. -#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \ - if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ - goto no_more_##block_size##_##num_stripes; \ - } \ -} while (0) - - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP, - SKIP_BLOCK_IF_NEEDED, - SKIP_BLOCK_IF_NEEDED); - -#undef SKIP_BLOCK_IF_NEEDED - - // Process data in all blocks. - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK, - PROCESS_BLOCK, - PROCESS_BLOCK); - - // Finish the tail word-by-word and then byte-by-byte. -#define CRC_UPDATE_WORD_4(index) do { \ - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \ - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \ - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \ - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \ -} while (0) - - if (bytes >= 4 * 4 * sizeof(size_t)) { - end -= 4 * 4 * sizeof(size_t); - do { - CRC_UPDATE_WORD_4(4 * 0); - CRC_UPDATE_WORD_4(4 * 1); - CRC_UPDATE_WORD_4(4 * 2); - CRC_UPDATE_WORD_4(4 * 3); - src += 4 * 4 * sizeof(size_t); - } while (src <= end); - end += 4 * 4 * sizeof(size_t); - bytes = static_cast<size_t>(end - src); - } - less_than_16_size_t: - - if (bytes >= 4 * 2 * sizeof(size_t)) { - CRC_UPDATE_WORD_4(4 * 0); - CRC_UPDATE_WORD_4(4 * 1); - src += 4 * 2 * sizeof(size_t); - bytes -= 4 * 2 * sizeof(size_t); - } - less_than_8_size_t: - - if (bytes >= 4 * sizeof(size_t)) { - CRC_UPDATE_WORD_4(0); - src += 4 * sizeof(size_t); - bytes -= 4 * sizeof(size_t); - } - less_than_4_size_t: - - if (bytes >= 1 * sizeof(size_t)) { - end -= 1 * sizeof(size_t); - do { - CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]); - src += 1 * sizeof(size_t); - } while (src <= end); - end += 1 * sizeof(size_t); - } - - while (src < end) { - CRC_UPDATE_BYTE(crc0, src[0]); - src += 1; - } - - return (crc0 ^ Base().Canonize()); -} - - -void Crc32cSSE4::Init(bool constant) { - base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant); - -#define INIT_MUL_TABLE(block_size, num_stripes) do { \ - size_t multiplier = \ - Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \ - for (size_t table = 0; table < kNumTables; ++table) { \ - for (size_t entry = 0; entry < kTableEntries; ++entry) { \ - size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \ - CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \ - static_cast<Entry>(Base().Multiply(value, multiplier)); \ - } \ - } \ -} while (0) - - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE); - -#undef INIT_MUL_TABLE - -#if !CRCUTIL_USE_MM_CRC32 - for (size_t j = 0; j < sizeof(Word); ++j) { - Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32); - for (size_t i = 0; i < 256; ++i) { - crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k); - } - } -#endif // !CRCUTIL_USE_MM_CRC32 -} - - -bool Crc32cSSE4::IsSSE42Available() { -#if defined(_MSC_VER) - int cpu_info[4]; - __cpuid(cpu_info, 1); +size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const { + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + crc0 ^= Base().Canonize(); + + // If we don't have too much data to process, + // do not waste time trying to align input etc. + // Noticeably improves performance on small inputs. + if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t; + if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t; + if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t; + +#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \ + if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ + goto no_more_##block_size##_##num_stripes; \ + } \ +} while (0) +#define NOOP(block_size, num_stripes) + + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL, + NOOP, + NOOP); + +#undef PROCESS_TAIL_IF_SMALL + + + // Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because: + // 1. It uses CRC_BYTE() which won't work. + // 2. Its threshold may be incorrect becuase Crc32 that uses + // native CPU crc32 instruction is much faster than + // generic table-based CRC computation. + // + // In case of X5550 CPU, break even point is at 2KB -- exactly. + if (bytes >= 2 * 1024) { + while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { + if (src >= end) { + return (crc0 ^ Base().Canonize()); + } + CRC_UPDATE_BYTE(crc0, src[0]); + src += 1; + } + bytes = static_cast<size_t>(end - src); + } + if (src >= end) { + return (crc0 ^ Base().Canonize()); + } + + // Quickly skip processing of too large blocks + // Noticeably improves performance on small inputs. +#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \ + if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ + goto no_more_##block_size##_##num_stripes; \ + } \ +} while (0) + + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP, + SKIP_BLOCK_IF_NEEDED, + SKIP_BLOCK_IF_NEEDED); + +#undef SKIP_BLOCK_IF_NEEDED + + // Process data in all blocks. + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK, + PROCESS_BLOCK, + PROCESS_BLOCK); + + // Finish the tail word-by-word and then byte-by-byte. +#define CRC_UPDATE_WORD_4(index) do { \ + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \ + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \ + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \ + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \ +} while (0) + + if (bytes >= 4 * 4 * sizeof(size_t)) { + end -= 4 * 4 * sizeof(size_t); + do { + CRC_UPDATE_WORD_4(4 * 0); + CRC_UPDATE_WORD_4(4 * 1); + CRC_UPDATE_WORD_4(4 * 2); + CRC_UPDATE_WORD_4(4 * 3); + src += 4 * 4 * sizeof(size_t); + } while (src <= end); + end += 4 * 4 * sizeof(size_t); + bytes = static_cast<size_t>(end - src); + } + less_than_16_size_t: + + if (bytes >= 4 * 2 * sizeof(size_t)) { + CRC_UPDATE_WORD_4(4 * 0); + CRC_UPDATE_WORD_4(4 * 1); + src += 4 * 2 * sizeof(size_t); + bytes -= 4 * 2 * sizeof(size_t); + } + less_than_8_size_t: + + if (bytes >= 4 * sizeof(size_t)) { + CRC_UPDATE_WORD_4(0); + src += 4 * sizeof(size_t); + bytes -= 4 * sizeof(size_t); + } + less_than_4_size_t: + + if (bytes >= 1 * sizeof(size_t)) { + end -= 1 * sizeof(size_t); + do { + CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]); + src += 1 * sizeof(size_t); + } while (src <= end); + end += 1 * sizeof(size_t); + } + + while (src < end) { + CRC_UPDATE_BYTE(crc0, src[0]); + src += 1; + } + + return (crc0 ^ Base().Canonize()); +} + + +void Crc32cSSE4::Init(bool constant) { + base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant); + +#define INIT_MUL_TABLE(block_size, num_stripes) do { \ + size_t multiplier = \ + Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \ + for (size_t table = 0; table < kNumTables; ++table) { \ + for (size_t entry = 0; entry < kTableEntries; ++entry) { \ + size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \ + CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \ + static_cast<Entry>(Base().Multiply(value, multiplier)); \ + } \ + } \ +} while (0) + + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE); + +#undef INIT_MUL_TABLE + +#if !CRCUTIL_USE_MM_CRC32 + for (size_t j = 0; j < sizeof(Word); ++j) { + Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32); + for (size_t i = 0; i < 256; ++i) { + crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k); + } + } +#endif // !CRCUTIL_USE_MM_CRC32 +} + + +bool Crc32cSSE4::IsSSE42Available() { +#if defined(_MSC_VER) + int cpu_info[4]; + __cpuid(cpu_info, 1); return ((cpu_info[2] & (1 << 20)) != 0); -#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386) - // Not using "cpuid.h" intentionally: it is missing from - // too many installations. - uint32 eax; - uint32 ecx; - uint32 edx; - __asm__ volatile( -#if HAVE_I386 && defined(__PIC__) +#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386) + // Not using "cpuid.h" intentionally: it is missing from + // too many installations. + uint32 eax; + uint32 ecx; + uint32 edx; + __asm__ volatile( +#if HAVE_I386 && defined(__PIC__) "push %%ebx\n" - "cpuid\n" + "cpuid\n" "pop %%ebx\n" -#else - "cpuid\n" -#endif // HAVE_I386 && defined(__PIC__) - : "=a" (eax), "=c" (ecx), "=d" (edx) - : "a" (1), "2" (0) - : "%ebx" - ); - return ((ecx & (1 << 20)) != 0); -#else - return false; -#endif -} - - -void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc, - size_t roll_window_bytes, - const Crc &start_value) { - crc_ = &crc; - roll_window_bytes_ = roll_window_bytes; - start_value_ = start_value; - - Crc add = crc.Base().Canonize() ^ start_value; - add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes)); - add ^= crc.Base().Canonize(); - Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1); - add = crc.Base().Multiply(add, mul); - - mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree()); - for (size_t i = 0; i < 256; ++i) { - out_[i] = static_cast<Entry>( - crc.Base().MultiplyUnnormalized( - static_cast<Crc>(i), 8, mul) ^ add); - } - -#if !CRCUTIL_USE_MM_CRC32 - memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_)); -#endif // !CRCUTIL_USE_MM_CRC32 -} - -} // namespace crcutil - -#endif // HAVE_I386 || HAVE_AMD64 +#else + "cpuid\n" +#endif // HAVE_I386 && defined(__PIC__) + : "=a" (eax), "=c" (ecx), "=d" (edx) + : "a" (1), "2" (0) + : "%ebx" + ); + return ((ecx & (1 << 20)) != 0); +#else + return false; +#endif +} + + +void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc, + size_t roll_window_bytes, + const Crc &start_value) { + crc_ = &crc; + roll_window_bytes_ = roll_window_bytes; + start_value_ = start_value; + + Crc add = crc.Base().Canonize() ^ start_value; + add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes)); + add ^= crc.Base().Canonize(); + Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1); + add = crc.Base().Multiply(add, mul); + + mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree()); + for (size_t i = 0; i < 256; ++i) { + out_[i] = static_cast<Entry>( + crc.Base().MultiplyUnnormalized( + static_cast<Crc>(i), 8, mul) ^ add); + } + +#if !CRCUTIL_USE_MM_CRC32 + memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_)); +#endif // !CRCUTIL_USE_MM_CRC32 +} + +} // namespace crcutil + +#endif // HAVE_I386 || HAVE_AMD64 diff --git a/contrib/libs/crcutil/crc32c_sse4.h b/contrib/libs/crcutil/crc32c_sse4.h index ac3d8425b8..24aa815699 100644 --- a/contrib/libs/crcutil/crc32c_sse4.h +++ b/contrib/libs/crcutil/crc32c_sse4.h @@ -1,252 +1,252 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements CRC32C using Intel's SSE4 crc32 instruction. -// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero, -// emilates intrinsics via CRC_WORD/CRC_BYTE otherwise. - -#ifndef CRCUTIL_CRC32C_SSE4_H_ -#define CRCUTIL_CRC32C_SSE4_H_ - -#include "gf_util.h" // base types, gf_util class, etc. -#include "crc32c_sse4_intrin.h" // _mm_crc32_u* intrinsics - -#if HAVE_I386 || HAVE_AMD64 - -#if CRCUTIL_USE_MM_CRC32 - -#if HAVE_I386 -#define CRC_UPDATE_WORD(crc, value) (crc = _mm_crc32_u32(crc, (value))) -#else -#define CRC_UPDATE_WORD(crc, value) (crc = _mm_crc32_u64(crc, (value))) -#endif // HAVE_I386 - -#define CRC_UPDATE_BYTE(crc, value) \ - (crc = _mm_crc32_u8(static_cast<uint32>(crc), static_cast<uint8>(value))) - -#else - -#include "generic_crc.h" - -#define CRC_UPDATE_WORD(crc, value) do { \ - size_t buf = (value); \ - CRC_WORD(this, crc, buf); \ -} while (0) -#define CRC_UPDATE_BYTE(crc, value) do { \ - CRC_BYTE(this, crc, (value)); \ -} while (0) - -#endif // CRCUTIL_USE_MM_CRC32 - -namespace crcutil { - -#pragma pack(push, 16) - -// Since the same pieces should be parameterized in many different places -// and we do not want to introduce a mistake which is rather hard to find, -// use a macro to enumerate all block sizes. -// -// Block sizes and number of stripes were tuned for best performance. -// -// All constants should be literal constants (too lazy to fix the macro). -// -// The use of different "macro_first", "macro", and "macro_last" -// allows generation of different code for smallest, in between, -// and largest block sizes. -// -// This macro shall be kept in sync with -// CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING. -// Failure to do so will cause compile-time error. -#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING( \ - macro_smallest, macro, macro_largest) \ - macro_smallest(512, 3); \ - macro(1024, 3); \ - macro(4096, 3); \ - macro_largest(32768, 3) - -// This macro shall be kept in sync with -// CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING. -// Failure to do so will cause compile-time error. -#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING( \ - macro_smallest, macro, macro_largest) \ - macro_largest(32768, 3); \ - macro(4096, 3); \ - macro(1024, 3); \ - macro_smallest(512, 3) - -// Enumerates all block sizes. -#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(macro) \ - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(macro, macro, macro) - -#define CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) \ - (((block_size) / (num_stripes)) & ~(sizeof(size_t) - 1)) - -#define CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes) \ - (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * (num_stripes)) - -#define CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - mul_table_##block_size##_##num_blocks##_ - -class RollingCrc32cSSE4; - -class Crc32cSSE4 { - public: - // Exports Crc, TableEntry, and Word (needed by RollingCrc). - typedef size_t Crc; - typedef Crc Word; - typedef Crc TableEntry; - - Crc32cSSE4() {} - - // Initializes the tables given generating polynomial of degree (degree). - // If "canonical" is true, crc value will be XOR'ed with (-1) before and - // after actual CRC computation. - explicit Crc32cSSE4(bool canonical) { - Init(canonical); - } - void Init(bool canonical); - - // Initializes the tables given generating polynomial of degree. - // If "canonical" is true, crc value will be XOR'ed with (-1) before and - // after actual CRC computation. - // Provided for compatibility with GenericCrc. - Crc32cSSE4(const Crc &generating_polynomial, - size_t degree, - bool canonical) { - Init(generating_polynomial, degree, canonical); - } - void Init(const Crc &generating_polynomial, - size_t degree, - bool canonical) { - if (generating_polynomial == FixedGeneratingPolynomial() && - degree == FixedDegree()) { - Init(canonical); - } - } - - // Returns fixed generating polymonial the class implements. - static Crc FixedGeneratingPolynomial() { - return 0x82f63b78; - } - - // Returns degree of fixed generating polymonial the class implements. - static Crc FixedDegree() { - return 32; - } - - // Returns base class. - const GfUtil<Crc> &Base() const { return base_; } - - // Computes CRC32. - size_t CrcDefault(const void *data, size_t bytes, const Crc &crc) const { - return Crc32c(data, bytes, crc); - } - - // Returns true iff crc32 instruction is available. - static bool IsSSE42Available(); - - protected: - // Actual implementation. - size_t Crc32c(const void *data, size_t bytes, Crc crc) const; - - enum { - kTableEntryBits = 8, - kTableEntries = 1 << kTableEntryBits, - kNumTables = (32 + kTableEntryBits - 1) / kTableEntryBits, - kNumTablesHalfLo = kNumTables / 2, - kNumTablesHalfHi = (kNumTables + 1) / 2, - - kUnrolledLoopCount = 8, - kUnrolledLoopBytes = kUnrolledLoopCount * sizeof(size_t), - }; - - // May be set to size_t or uint32, whichever is faster. - typedef uint32 Entry; - -#define DECLARE_MUL_TABLE(block_size, num_stripes) \ - Entry CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ - [kNumTables][kTableEntries] - - CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(DECLARE_MUL_TABLE); - -#undef DECLARE_MUL_TABLE - - GfUtil<Crc> base_; - -#if !CRCUTIL_USE_MM_CRC32 - TableEntry crc_word_[sizeof(Word)][256]; - friend class RollingCrc32cSSE4; -#endif // !CRCUTIL_USE_MM_CRC32 -} GCC_ALIGN_ATTRIBUTE(16); - -class RollingCrc32cSSE4 { - public: - typedef Crc32cSSE4::Crc Crc; - typedef Crc32cSSE4::TableEntry TableEntry; - typedef Crc32cSSE4::Word Word; - - RollingCrc32cSSE4() {} - - // Initializes internal data structures. - // Retains reference to "crc" instance -- it is used by Start(). - RollingCrc32cSSE4(const Crc32cSSE4 &crc, - size_t roll_window_bytes, - const Crc &start_value) { - Init(crc, roll_window_bytes, start_value); - } - void Init(const Crc32cSSE4 &crc, - size_t roll_window_bytes, - const Crc &start_value); - - // Computes crc of "roll_window_bytes" using - // "start_value" of "crc" (see Init()). - Crc Start(const void *data) const { - return crc_->CrcDefault(data, roll_window_bytes_, start_value_); - } - - // Computes CRC of "roll_window_bytes" starting in next position. - Crc Roll(const Crc &old_crc, size_t byte_out, size_t byte_in) const { - Crc crc = old_crc; - CRC_UPDATE_BYTE(crc, byte_in); - crc ^= out_[byte_out]; - return crc; - } - - // Returns start value. - Crc StartValue() const { return start_value_; } - - // Returns length of roll window. - size_t WindowBytes() const { return roll_window_bytes_; } - - protected: - typedef Crc Entry; - Entry out_[256]; - - // Used only by Start(). - Crc start_value_; - const Crc32cSSE4 *crc_; - size_t roll_window_bytes_; - -#if !CRCUTIL_USE_MM_CRC32 - TableEntry crc_word_[sizeof(Word)][256]; -#endif // !CRCUTIL_USE_MM_CRC32 -} GCC_ALIGN_ATTRIBUTE(16); - -#pragma pack(pop) - -} // namespace crcutil - -#endif // HAVE_I386 || HAVE_AMD64 - -#endif // CRCUTIL_CRC32C_SSE4_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements CRC32C using Intel's SSE4 crc32 instruction. +// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero, +// emilates intrinsics via CRC_WORD/CRC_BYTE otherwise. + +#ifndef CRCUTIL_CRC32C_SSE4_H_ +#define CRCUTIL_CRC32C_SSE4_H_ + +#include "gf_util.h" // base types, gf_util class, etc. +#include "crc32c_sse4_intrin.h" // _mm_crc32_u* intrinsics + +#if HAVE_I386 || HAVE_AMD64 + +#if CRCUTIL_USE_MM_CRC32 + +#if HAVE_I386 +#define CRC_UPDATE_WORD(crc, value) (crc = _mm_crc32_u32(crc, (value))) +#else +#define CRC_UPDATE_WORD(crc, value) (crc = _mm_crc32_u64(crc, (value))) +#endif // HAVE_I386 + +#define CRC_UPDATE_BYTE(crc, value) \ + (crc = _mm_crc32_u8(static_cast<uint32>(crc), static_cast<uint8>(value))) + +#else + +#include "generic_crc.h" + +#define CRC_UPDATE_WORD(crc, value) do { \ + size_t buf = (value); \ + CRC_WORD(this, crc, buf); \ +} while (0) +#define CRC_UPDATE_BYTE(crc, value) do { \ + CRC_BYTE(this, crc, (value)); \ +} while (0) + +#endif // CRCUTIL_USE_MM_CRC32 + +namespace crcutil { + +#pragma pack(push, 16) + +// Since the same pieces should be parameterized in many different places +// and we do not want to introduce a mistake which is rather hard to find, +// use a macro to enumerate all block sizes. +// +// Block sizes and number of stripes were tuned for best performance. +// +// All constants should be literal constants (too lazy to fix the macro). +// +// The use of different "macro_first", "macro", and "macro_last" +// allows generation of different code for smallest, in between, +// and largest block sizes. +// +// This macro shall be kept in sync with +// CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING. +// Failure to do so will cause compile-time error. +#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING( \ + macro_smallest, macro, macro_largest) \ + macro_smallest(512, 3); \ + macro(1024, 3); \ + macro(4096, 3); \ + macro_largest(32768, 3) + +// This macro shall be kept in sync with +// CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING. +// Failure to do so will cause compile-time error. +#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING( \ + macro_smallest, macro, macro_largest) \ + macro_largest(32768, 3); \ + macro(4096, 3); \ + macro(1024, 3); \ + macro_smallest(512, 3) + +// Enumerates all block sizes. +#define CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(macro) \ + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(macro, macro, macro) + +#define CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) \ + (((block_size) / (num_stripes)) & ~(sizeof(size_t) - 1)) + +#define CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes) \ + (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * (num_stripes)) + +#define CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + mul_table_##block_size##_##num_blocks##_ + +class RollingCrc32cSSE4; + +class Crc32cSSE4 { + public: + // Exports Crc, TableEntry, and Word (needed by RollingCrc). + typedef size_t Crc; + typedef Crc Word; + typedef Crc TableEntry; + + Crc32cSSE4() {} + + // Initializes the tables given generating polynomial of degree (degree). + // If "canonical" is true, crc value will be XOR'ed with (-1) before and + // after actual CRC computation. + explicit Crc32cSSE4(bool canonical) { + Init(canonical); + } + void Init(bool canonical); + + // Initializes the tables given generating polynomial of degree. + // If "canonical" is true, crc value will be XOR'ed with (-1) before and + // after actual CRC computation. + // Provided for compatibility with GenericCrc. + Crc32cSSE4(const Crc &generating_polynomial, + size_t degree, + bool canonical) { + Init(generating_polynomial, degree, canonical); + } + void Init(const Crc &generating_polynomial, + size_t degree, + bool canonical) { + if (generating_polynomial == FixedGeneratingPolynomial() && + degree == FixedDegree()) { + Init(canonical); + } + } + + // Returns fixed generating polymonial the class implements. + static Crc FixedGeneratingPolynomial() { + return 0x82f63b78; + } + + // Returns degree of fixed generating polymonial the class implements. + static Crc FixedDegree() { + return 32; + } + + // Returns base class. + const GfUtil<Crc> &Base() const { return base_; } + + // Computes CRC32. + size_t CrcDefault(const void *data, size_t bytes, const Crc &crc) const { + return Crc32c(data, bytes, crc); + } + + // Returns true iff crc32 instruction is available. + static bool IsSSE42Available(); + + protected: + // Actual implementation. + size_t Crc32c(const void *data, size_t bytes, Crc crc) const; + + enum { + kTableEntryBits = 8, + kTableEntries = 1 << kTableEntryBits, + kNumTables = (32 + kTableEntryBits - 1) / kTableEntryBits, + kNumTablesHalfLo = kNumTables / 2, + kNumTablesHalfHi = (kNumTables + 1) / 2, + + kUnrolledLoopCount = 8, + kUnrolledLoopBytes = kUnrolledLoopCount * sizeof(size_t), + }; + + // May be set to size_t or uint32, whichever is faster. + typedef uint32 Entry; + +#define DECLARE_MUL_TABLE(block_size, num_stripes) \ + Entry CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ + [kNumTables][kTableEntries] + + CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(DECLARE_MUL_TABLE); + +#undef DECLARE_MUL_TABLE + + GfUtil<Crc> base_; + +#if !CRCUTIL_USE_MM_CRC32 + TableEntry crc_word_[sizeof(Word)][256]; + friend class RollingCrc32cSSE4; +#endif // !CRCUTIL_USE_MM_CRC32 +} GCC_ALIGN_ATTRIBUTE(16); + +class RollingCrc32cSSE4 { + public: + typedef Crc32cSSE4::Crc Crc; + typedef Crc32cSSE4::TableEntry TableEntry; + typedef Crc32cSSE4::Word Word; + + RollingCrc32cSSE4() {} + + // Initializes internal data structures. + // Retains reference to "crc" instance -- it is used by Start(). + RollingCrc32cSSE4(const Crc32cSSE4 &crc, + size_t roll_window_bytes, + const Crc &start_value) { + Init(crc, roll_window_bytes, start_value); + } + void Init(const Crc32cSSE4 &crc, + size_t roll_window_bytes, + const Crc &start_value); + + // Computes crc of "roll_window_bytes" using + // "start_value" of "crc" (see Init()). + Crc Start(const void *data) const { + return crc_->CrcDefault(data, roll_window_bytes_, start_value_); + } + + // Computes CRC of "roll_window_bytes" starting in next position. + Crc Roll(const Crc &old_crc, size_t byte_out, size_t byte_in) const { + Crc crc = old_crc; + CRC_UPDATE_BYTE(crc, byte_in); + crc ^= out_[byte_out]; + return crc; + } + + // Returns start value. + Crc StartValue() const { return start_value_; } + + // Returns length of roll window. + size_t WindowBytes() const { return roll_window_bytes_; } + + protected: + typedef Crc Entry; + Entry out_[256]; + + // Used only by Start(). + Crc start_value_; + const Crc32cSSE4 *crc_; + size_t roll_window_bytes_; + +#if !CRCUTIL_USE_MM_CRC32 + TableEntry crc_word_[sizeof(Word)][256]; +#endif // !CRCUTIL_USE_MM_CRC32 +} GCC_ALIGN_ATTRIBUTE(16); + +#pragma pack(pop) + +} // namespace crcutil + +#endif // HAVE_I386 || HAVE_AMD64 + +#endif // CRCUTIL_CRC32C_SSE4_H_ diff --git a/contrib/libs/crcutil/crc32c_sse4_intrin.h b/contrib/libs/crcutil/crc32c_sse4_intrin.h index ab7cc3e03d..5338d21d29 100644 --- a/contrib/libs/crcutil/crc32c_sse4_intrin.h +++ b/contrib/libs/crcutil/crc32c_sse4_intrin.h @@ -1,99 +1,99 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Provides _mm_crc32_u64/32/8 intrinsics. - -#ifndef CRCUTIL_CRC32C_SSE4_INTRIN_H_ -#define CRCUTIL_CRC32C_SSE4_INTRIN_H_ - -#include "platform.h" -#include "base_types.h" - -#if CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64) - -#if defined(_MSC_VER) || defined(__SSE4_2__) - -#if defined(_MSC_VER) -#pragma warning(push) -// '_M_IA64' is not defined as a preprocessor macro -#pragma warning(disable: 4668) -#endif // defined(_MSC_VER) - -#include <nmmintrin.h> - -#if defined(_MSC_VER) -#pragma warning(pop) -#endif // defined(_MSC_VER) - -#elif GCC_VERSION_AVAILABLE(4, 5) && !defined(CRCUTIL_FORCE_ASM_CRC32C) -// Allow the use of _mm_crc32_u* intrinsic when CRCUTIL_USE_MM_CRC32 -// is set irrespective of "-msse*" settings. This way, the sources -// may be compiled with "-msse2 -mcrc32" and work on older CPUs, -// while taking full advantage of "crc32" instruction on newer -// CPUs (requires dynamic CPU detection). See "interface.cc". -// -// If neither -msse4 or -mcrc32 is provided and CRCUTIL_USE_MM_CRC32 is set -// and CRCUTIL_FORCE_ASM_CRC32 is not set, compile-time error will happen. -// Why? Becuase GCC disables __builtin_ia32_crc32* intrinsics when compiled -// without -msse4 or -mcrc32. -msse4 could be detected at run time by checking -// whether __SSE4_2__ is defined, but there is no way to tell whether the -// sources are compiled with -mcrc32. - -extern __inline unsigned int __attribute__(( - __gnu_inline__, __always_inline__, __artificial__)) -_mm_crc32_u8(unsigned int __C, unsigned char __V) { - return __builtin_ia32_crc32qi(__C, __V); -} -#ifdef __x86_64__ -extern __inline unsigned long long __attribute__(( - __gnu_inline__, __always_inline__, __artificial__)) -_mm_crc32_u64(unsigned long long __C, unsigned long long __V) { - return __builtin_ia32_crc32di(__C, __V); -} -#else -extern __inline unsigned int __attribute__(( - __gnu_inline__, __always_inline__, __artificial__)) -_mm_crc32_u32(unsigned int __C, unsigned int __V) { - return __builtin_ia32_crc32si (__C, __V); -} -#endif // __x86_64__ - -#else - -// GCC 4.4.x and earlier: use inline asm. - -namespace crcutil { - -__forceinline uint64 _mm_crc32_u64(uint64 crc, uint64 value) { - asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); - return crc; -} - -__forceinline uint32 _mm_crc32_u32(uint32 crc, uint64 value) { - asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); - return crc; -} - -__forceinline uint32 _mm_crc32_u8(uint32 crc, uint8 value) { - asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); - return crc; -} - -} // namespace crcutil - -#endif - -#endif // CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64) - -#endif // CRCUTIL_CRC32C_SSE4_INTRIN_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Provides _mm_crc32_u64/32/8 intrinsics. + +#ifndef CRCUTIL_CRC32C_SSE4_INTRIN_H_ +#define CRCUTIL_CRC32C_SSE4_INTRIN_H_ + +#include "platform.h" +#include "base_types.h" + +#if CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64) + +#if defined(_MSC_VER) || defined(__SSE4_2__) + +#if defined(_MSC_VER) +#pragma warning(push) +// '_M_IA64' is not defined as a preprocessor macro +#pragma warning(disable: 4668) +#endif // defined(_MSC_VER) + +#include <nmmintrin.h> + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif // defined(_MSC_VER) + +#elif GCC_VERSION_AVAILABLE(4, 5) && !defined(CRCUTIL_FORCE_ASM_CRC32C) +// Allow the use of _mm_crc32_u* intrinsic when CRCUTIL_USE_MM_CRC32 +// is set irrespective of "-msse*" settings. This way, the sources +// may be compiled with "-msse2 -mcrc32" and work on older CPUs, +// while taking full advantage of "crc32" instruction on newer +// CPUs (requires dynamic CPU detection). See "interface.cc". +// +// If neither -msse4 or -mcrc32 is provided and CRCUTIL_USE_MM_CRC32 is set +// and CRCUTIL_FORCE_ASM_CRC32 is not set, compile-time error will happen. +// Why? Becuase GCC disables __builtin_ia32_crc32* intrinsics when compiled +// without -msse4 or -mcrc32. -msse4 could be detected at run time by checking +// whether __SSE4_2__ is defined, but there is no way to tell whether the +// sources are compiled with -mcrc32. + +extern __inline unsigned int __attribute__(( + __gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u8(unsigned int __C, unsigned char __V) { + return __builtin_ia32_crc32qi(__C, __V); +} +#ifdef __x86_64__ +extern __inline unsigned long long __attribute__(( + __gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u64(unsigned long long __C, unsigned long long __V) { + return __builtin_ia32_crc32di(__C, __V); +} +#else +extern __inline unsigned int __attribute__(( + __gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u32(unsigned int __C, unsigned int __V) { + return __builtin_ia32_crc32si (__C, __V); +} +#endif // __x86_64__ + +#else + +// GCC 4.4.x and earlier: use inline asm. + +namespace crcutil { + +__forceinline uint64 _mm_crc32_u64(uint64 crc, uint64 value) { + asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); + return crc; +} + +__forceinline uint32 _mm_crc32_u32(uint32 crc, uint64 value) { + asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); + return crc; +} + +__forceinline uint32 _mm_crc32_u8(uint32 crc, uint8 value) { + asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); + return crc; +} + +} // namespace crcutil + +#endif + +#endif // CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64) + +#endif // CRCUTIL_CRC32C_SSE4_INTRIN_H_ diff --git a/contrib/libs/crcutil/crc_casts.h b/contrib/libs/crcutil/crc_casts.h index a14044fea3..f732329978 100644 --- a/contrib/libs/crcutil/crc_casts.h +++ b/contrib/libs/crcutil/crc_casts.h @@ -1,68 +1,68 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Casting between integers and compound CRC types. - -#ifndef CRCUTIL_CRC_CASTS_H_ -#define CRCUTIL_CRC_CASTS_H_ - -#include "base_types.h" // uint8, uint64 -#include "platform.h" // __forceinline - -namespace crcutil { - -// Downcasts a value of (oftentimes larger) Crc type to (smaller base integer) -// Result type, enabling specialized downcasts implemented by "large integer" -// classes (e.g. uint128_sse2). -template<typename Crc, typename Result> -__forceinline Result Downcast(const Crc &x) { - return static_cast<Result>(x); -} - -// Extracts 8 least significant bits from a value of Crc type. -#define TO_BYTE(x) Downcast<Crc, uint8>(x) - -// Converts a pair of uint64 bit values into single value of CRC type. -// It is caller's responsibility to ensure that the input is correct. -template<typename Crc> -__forceinline Crc CrcFromUint64(uint64 lo, uint64 hi = 0) { - if (sizeof(Crc) <= sizeof(lo)) { - return static_cast<Crc>(lo); - } else { - // static_cast to keep compiler happy. - Crc result = static_cast<Crc>(hi); - result = SHIFT_LEFT_SAFE(result, 8 * sizeof(lo)); - result ^= lo; - return result; - } -} - -// Converts Crc value to a pair of uint64 values. -template<typename Crc> -__forceinline void Uint64FromCrc(const Crc &crc, - uint64 *lo, uint64 *hi = NULL) { - if (sizeof(*lo) >= sizeof(crc)) { - *lo = Downcast<Crc, uint64>(crc); - if (hi != NULL) { - *hi = 0; - } - } else { - *lo = Downcast<Crc, uint64>(crc); - *hi = Downcast<Crc, uint64>(SHIFT_RIGHT_SAFE(crc, 8 * sizeof(lo))); - } -} - -} // namespace crcutil - -#endif // CRCUTIL_CRC_CASTS_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Casting between integers and compound CRC types. + +#ifndef CRCUTIL_CRC_CASTS_H_ +#define CRCUTIL_CRC_CASTS_H_ + +#include "base_types.h" // uint8, uint64 +#include "platform.h" // __forceinline + +namespace crcutil { + +// Downcasts a value of (oftentimes larger) Crc type to (smaller base integer) +// Result type, enabling specialized downcasts implemented by "large integer" +// classes (e.g. uint128_sse2). +template<typename Crc, typename Result> +__forceinline Result Downcast(const Crc &x) { + return static_cast<Result>(x); +} + +// Extracts 8 least significant bits from a value of Crc type. +#define TO_BYTE(x) Downcast<Crc, uint8>(x) + +// Converts a pair of uint64 bit values into single value of CRC type. +// It is caller's responsibility to ensure that the input is correct. +template<typename Crc> +__forceinline Crc CrcFromUint64(uint64 lo, uint64 hi = 0) { + if (sizeof(Crc) <= sizeof(lo)) { + return static_cast<Crc>(lo); + } else { + // static_cast to keep compiler happy. + Crc result = static_cast<Crc>(hi); + result = SHIFT_LEFT_SAFE(result, 8 * sizeof(lo)); + result ^= lo; + return result; + } +} + +// Converts Crc value to a pair of uint64 values. +template<typename Crc> +__forceinline void Uint64FromCrc(const Crc &crc, + uint64 *lo, uint64 *hi = NULL) { + if (sizeof(*lo) >= sizeof(crc)) { + *lo = Downcast<Crc, uint64>(crc); + if (hi != NULL) { + *hi = 0; + } + } else { + *lo = Downcast<Crc, uint64>(crc); + *hi = Downcast<Crc, uint64>(SHIFT_RIGHT_SAFE(crc, 8 * sizeof(lo))); + } +} + +} // namespace crcutil + +#endif // CRCUTIL_CRC_CASTS_H_ diff --git a/contrib/libs/crcutil/generic_crc.h b/contrib/libs/crcutil/generic_crc.h index 06af21c925..4832005776 100644 --- a/contrib/libs/crcutil/generic_crc.h +++ b/contrib/libs/crcutil/generic_crc.h @@ -1,687 +1,687 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Defines GenericCrc class which implements arbitrary CRCs. -// -// Please read crc.pdf to understand how it all works. - -#ifndef CRCUTIL_GENERIC_CRC_H_ -#define CRCUTIL_GENERIC_CRC_H_ - -#include "base_types.h" // uint8 -#include "crc_casts.h" // TO_BYTE(), Downcast<>. -#include "gf_util.h" // GfUtil<Crc> class. -#include "platform.h" // GCC_ALIGN_ATTRIBUTE(16) -#include "uint128_sse2.h" // uint128_sse2 type (if necessary) - -namespace crcutil { - -#pragma pack(push, 16) - -// Extends CRC by one byte. -// Technically, if degree of a polynomial does not exceed 8, -// right shift by 8 bits is not required, but who cares about CRC-8? -#define CRC_BYTE(table, crc, byte) do { \ - crc = ((sizeof(crc) > 1) ? SHIFT_RIGHT_SAFE(crc, 8) : 0) ^ \ - table->crc_word_[sizeof(Word) - 1][TO_BYTE(crc) ^ (byte)]; \ -} while (0) - -#define TABLE_ENTRY(table, byte, buf) \ - table[byte][Downcast<Word, uint8>(buf)] - -#define TABLE_ENTRY_LAST(table, buf) \ - table[sizeof(Word) - 1][buf] - -// Extends CRC by one word. -#define CRC_WORD(table, crc, buf) do { \ - buf ^= Downcast<Crc, Word>(crc); \ - if (sizeof(crc) > sizeof(buf)) { \ - crc = SHIFT_RIGHT_SAFE(crc, sizeof(buf) * 8); \ - crc ^= TABLE_ENTRY(table->crc_word_, 0, buf); \ - } else { \ - crc = TABLE_ENTRY(table->crc_word_, 0, buf); \ - } \ - buf >>= 8; \ - for (size_t byte = 1; byte < sizeof(buf) - 1; ++byte) { \ - crc ^= TABLE_ENTRY(table->crc_word_, byte, buf); \ - buf >>= 8; \ - } \ - crc ^= TABLE_ENTRY_LAST(table->crc_word_, buf); \ -} while (0) - -// Process beginning of data block byte by byte until source pointer -// becomes perfectly aligned on Word boundary. -#define ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word) do { \ - while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { \ - if (src >= end) { \ - return (crc ^ table->Base().Canonize()); \ - } \ - CRC_BYTE(table, crc, *src); \ - src += 1; \ - } \ -} while (0) - - -// On amd64, enforcing alignment is 2-4% slower on small (<= 64 bytes) blocks -// but 6-10% faster on larger blocks (>= 2KB). -// Break-even point (+-1%) is around 1KB (Q9650, E6600). -// -#define ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, table, src, end, crc, Word) \ -do { \ - if (sizeof(Word) > 8 || (bytes) > CRCUTIL_MIN_ALIGN_SIZE) { \ - ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word); \ - } \ -} while (0) - -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable: 4127) // conditional expression is constant -#endif // defined(_MSC_VER) - -// Forward declarations. -template<typename CrcImplementation> class RollingCrc; - -// Crc is the type used internally and to return values of N-bit CRC. -// It should be at least as large as "TableEntry" and "Word" but -// may be larger (e.g. for 16-bit CRC, TableEntry and Word may be -// set to uint16 but Crc may be set to uint32). -// -// TableEntry is the type of values stored in the tables. -// To implement N-bit CRC, TableEntry should be large enough -// to store N bits. -// -// Word is the type used to read data sizeof(Word) at a time. -// Ideally, it shoulde be "most suitable for given architecture" -// integer type -- typically "size_t". -// -// kStride is the number of words processed in interleaved manner by -// CrcMultiword() and CrcWordblock(). Shall be either 3 or 4. -// Optimal value depends on hardware architecture (AMD64, ARM, etc). -// -template<typename _Crc, typename _TableEntry, typename _Word, int kStride> - class GenericCrc { - public: - // Make Crc, TableEntry, and Word types visible (used by RollingCrc etc.) - typedef _Crc Crc; - typedef _TableEntry TableEntry; - typedef _Word Word; - - GenericCrc() {} - - // Initializes the tables given generating polynomial of degree. - // If "canonical" is true, crc value will be XOR'ed with (-1) before and - // after actual CRC computation. - GenericCrc(const Crc &generating_polynomial, size_t degree, bool canonical) { - Init(generating_polynomial, degree, canonical); - } - void Init(const Crc &generating_polynomial, size_t degree, bool canonical) { - base_.Init(generating_polynomial, degree, canonical); - - // Instead of computing - // table[j][i] = MultiplyUnnormalized(i, 8, k), - // for all i = 0...255, we may notice that - // if i = 2**n then for all m = 1...(i-1) - // MultiplyUnnormalized(i + m, 8, k) = - // MultiplyUnnormalized(i ^ m, 8, k) = - // MultiplyUnnormalized(i, 8, k) ^ MultiplyUnnormalized(m, 8, k) = - // MultiplyUnnormalized(i, 8, k) ^ crc_word_interleaved[j][m] = - // table[i] ^ table[m]. -#if 0 - for (size_t j = 0; j < sizeof(Word); ++j) { - Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree); - for (size_t i = 0; i < 256; ++i) { - Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k); - this->crc_word_interleaved_[j][i] = Downcast<Crc, TableEntry>(temp); - } - } -#else - for (size_t j = 0; j < sizeof(Word); ++j) { - Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree); - TableEntry *table = this->crc_word_interleaved_[j]; - table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0. - for (size_t i = 1; i < 256; i <<= 1) { - TableEntry value = Downcast<Crc, TableEntry>( - Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k)); - table[i] = value; - for (size_t m = 1; m < i; ++m) { - table[i + m] = value ^ table[m]; - } - } - } -#endif - -#if 0 - for (size_t j = 0; j < sizeof(Word); ++j) { - Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree); - for (size_t i = 0; i < 256; ++i) { - Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k); - this->crc_word_[j][i] = Downcast<Crc, TableEntry>(temp); - } - } -#else - for (size_t j = 0; j < sizeof(Word); ++j) { - Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree); - TableEntry *table = this->crc_word_[j]; - table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0. - for (size_t i = 1; i < 256; i <<= 1) { - TableEntry value = Downcast<Crc, TableEntry>( - Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k)); - table[i] = value; - for (size_t m = 1; m < i; ++m) { - table[i + m] = value ^ table[m]; - } - } - } -#endif - } - - // Default CRC implementation - Crc CrcDefault(const void *data, size_t bytes, const Crc &start) const { -#if HAVE_AMD64 || HAVE_I386 - return CrcMultiword(data, bytes, start); -#else - // Very few CPUs have multiple ALUs and speculative execution - // (Itanium is an exception) so sophisticated algorithms will - // not perform better than good old Sarwate algorithm. - return CrcByteUnrolled(data, bytes, start); -#endif // HAVE_AMD64 || HAVE_I386 - } - - // Returns base class. - const GfUtil<Crc> &Base() const { return base_; } - - protected: - // Canonical, byte-by-byte CRC computation. - Crc CrcByte(const void *data, size_t bytes, const Crc &start) const { - const uint8 *src = static_cast<const uint8 *>(data); - Crc crc = start ^ Base().Canonize(); - for (const uint8 *end = src + bytes; src < end; ++src) { - CRC_BYTE(this, crc, *src); - } - return (crc ^ Base().Canonize()); - } - - // Byte-by-byte CRC with main loop unrolled. - Crc CrcByteUnrolled(const void *data, size_t bytes, const Crc &start) const { - if (bytes == 0) { - return start; - } - - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - Crc crc = start ^ Base().Canonize(); - - // Unroll loop 4 times. - end -= 3; - for (; src < end; src += 4) { - PREFETCH(src); - CRC_BYTE(this, crc, src[0]); - CRC_BYTE(this, crc, src[1]); - CRC_BYTE(this, crc, src[2]); - CRC_BYTE(this, crc, src[3]); - } - end += 3; - - // Compute CRC of remaining bytes. - for (; src < end; ++src) { - CRC_BYTE(this, crc, *src); - } - - return (crc ^ Base().Canonize()); - } - - // Canonical, byte-by-byte CRC computation. - Crc CrcByteWord(const void *data, size_t bytes, const Crc &start) const { - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - Crc crc0 = start ^ Base().Canonize(); - - ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Crc); - if (src >= end) { - return (crc0 ^ Base().Canonize()); - } - - // Process 4*sizeof(Crc) bytes at a time. - end -= 4 * sizeof(Crc) - 1; - for (; src < end; src += 4 * sizeof(Crc)) { - for (size_t i = 0; i < 4; ++i) { - crc0 ^= reinterpret_cast<const Crc *>(src)[i]; - if (i == 0) { - PREFETCH(src); - } - for (size_t byte = 0; byte < sizeof(crc0); ++byte) { - CRC_BYTE(this, crc0, 0); - } - } - } - end += 4 * sizeof(Crc) - 1; - - // Process sizeof(Crc) bytes at a time. - end -= sizeof(Crc) - 1; - for (; src < end; src += sizeof(Crc)) { - crc0 ^= reinterpret_cast<const Crc *>(src)[0]; - for (size_t byte = 0; byte < sizeof(crc0); ++byte) { - CRC_BYTE(this, crc0, 0); - } - } - end += sizeof(Crc) - 1; - - // Compute CRC of remaining bytes. - for (;src < end; ++src) { - CRC_BYTE(this, crc0, *src); - } - - return (crc0 ^ Base().Canonize()); - } - - // Faster, word-by-word CRC. - Crc CrcWord(const void *data, size_t bytes, const Crc &start) const { - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - Crc crc0 = start ^ Base().Canonize(); - - ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); - if (src >= end) { - return (crc0 ^ Base().Canonize()); - } - - // Process 4 sizeof(Word) bytes at once. - end -= 4 * sizeof(Word) - 1; - for (; src < end; src += 4 * sizeof(Word)) { - Word buf0 = reinterpret_cast<const Word *>(src)[0]; - PREFETCH(src); - CRC_WORD(this, crc0, buf0); - buf0 = reinterpret_cast<const Word *>(src)[1]; - CRC_WORD(this, crc0, buf0); - buf0 = reinterpret_cast<const Word *>(src)[2]; - CRC_WORD(this, crc0, buf0); - buf0 = reinterpret_cast<const Word *>(src)[3]; - CRC_WORD(this, crc0, buf0); - } - end += 4 * sizeof(Word) - 1; - - // Process sizeof(Word) bytes at a time. - end -= sizeof(Word) - 1; - for (; src < end; src += sizeof(Word)) { - Word buf0 = reinterpret_cast<const Word *>(src)[0]; - CRC_WORD(this, crc0, buf0); - } - end += sizeof(Word) - 1; - - // Compute CRC of remaining bytes. - for (;src < end; ++src) { - CRC_BYTE(this, crc0, *src); - } - - return (crc0 ^ Base().Canonize()); - } - -#define REPEAT_FROM_1(macro) \ - macro(1); \ - macro(2); \ - macro(3); \ - macro(4); \ - macro(5); \ - macro(6); \ - macro(7); - -#define REPEAT_FROM_0(macro) \ - macro(0); \ - REPEAT_FROM_1(macro) - - // Faster, process adjusent blocks in parallel and concatenate CRCs. - Crc CrcBlockword(const void *data, size_t bytes, const Crc &start) const { - if (kStride < 2 || kStride > 8) { - // Unsupported configuration; - // fall back to something sensible. - return CrcWord(data, bytes, start); - } - - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - Crc crc0 = start ^ Base().Canonize(); - enum { - // Add 16 to avoid false L1 cache collisions. - kStripe = (15*1024 + 16) & ~(sizeof(Word) - 1), - }; - - ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); - if (src >= end) { - return (crc0 ^ Base().Canonize()); - } - - end -= kStride * kStripe - 1; - if (src < end) { - Crc x_pow_8kStripe = Base().Xpow8N(kStripe); - do { - const uint8 *stripe_end = src + kStripe; - -#define INIT_CRC(reg) \ - Crc crc##reg; \ - if (kStride >= reg) { \ - crc##reg = 0; \ - } - REPEAT_FROM_1(INIT_CRC); -#undef INIT_CRC - - do { -#define FIRST(reg) \ - Word buf##reg; \ - if (kStride > reg) { \ - buf##reg = reinterpret_cast<const Word *>(src + reg * kStripe)[0]; \ - buf##reg ^= Downcast<Crc, Word>(crc##reg); \ - if (sizeof(crc##reg) > sizeof(buf##reg)) { \ - crc##reg = SHIFT_RIGHT_SAFE(crc##reg, sizeof(buf##reg) * 8); \ - crc##reg ^= TABLE_ENTRY(this->crc_word_, 0, buf##reg); \ - } else { \ - crc##reg = TABLE_ENTRY(this->crc_word_, 0, buf##reg); \ - } \ - buf##reg >>= 8; \ - } - REPEAT_FROM_0(FIRST); -#undef FIRST - - for (size_t byte = 1; byte < sizeof(buf0) - 1; ++byte) { -#define NEXT(reg) do { \ - if (kStride > reg) { \ - crc##reg ^= TABLE_ENTRY(this->crc_word_, byte, buf##reg); \ - buf##reg >>= 8; \ - } \ -} while (0) - REPEAT_FROM_0(NEXT); -#undef NEXT - } - -#define LAST(reg) do { \ - if (kStride > reg) { \ - crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_, buf##reg); \ - } \ -} while (0) - REPEAT_FROM_0(LAST); -#undef LAST - - src += sizeof(Word); - } while (src < stripe_end); - -#if 0 -// The code is left for illustrational purposes only. -#define COMBINE(reg) do { \ - if (reg > 0 && kStride > reg) { \ - crc0 = Base().ChangeStartValue(crc##reg, kStripe, 0, crc0); \ - } \ -} while (0) -#else -#define COMBINE(reg) do { \ - if (reg > 0 && kStride > reg) { \ - crc0 = crc##reg ^ Base().Multiply(crc0, x_pow_8kStripe); \ - } \ -} while (0) -#endif - REPEAT_FROM_0(COMBINE); -#undef COMBINE - - src += (kStride - 1) * kStripe; - } - while (src < end); - } - end += kStride * kStripe - 1; - - // Process sizeof(Word) bytes at a time. - end -= sizeof(Word) - 1; - for (; src < end; src += sizeof(Word)) { - Word buf0 = reinterpret_cast<const Word *>(src)[0]; - CRC_WORD(this, crc0, buf0); - } - end += sizeof(Word) - 1; - - // Compute CRC of remaining bytes. - for (;src < end; ++src) { - CRC_BYTE(this, crc0, *src); - } - - return (crc0 ^ Base().Canonize()); - } - - // Fastest, interleaved multi-byte CRC. - Crc CrcMultiword(const void *data, size_t bytes, const Crc &start) const { - if (kStride < 2 || kStride > 8) { - // Unsupported configuration; - // fall back to something sensible. - return CrcWord(data, bytes, start); - } - - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - Crc crc0 = start ^ Base().Canonize(); - - ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); - if (src >= end) { - return (crc0 ^ Base().Canonize()); - } - - // Process kStride Word registers at once; - // should have have at least 2*kInterleaveBytes of data to start. - end -= 2*kInterleaveBytes - 1; - if (src < end) { - Crc crc_carryover; - if (sizeof(Crc) > sizeof(Word)) { - // crc_carryover is used if and only if Crc is wider than Word. - crc_carryover = 0; - } -#define INIT_CRC(reg) \ - Crc crc##reg; \ - if (reg > 0 && kStride > reg) { \ - crc##reg = 0; \ - } - REPEAT_FROM_1(INIT_CRC); -#undef INIT_CRC - -#define INIT_BUF(reg) \ - Word buf##reg; \ - if (kStride > reg) { \ - buf##reg = reinterpret_cast<const Word *>(src)[reg]; \ - } - REPEAT_FROM_0(INIT_BUF); -#undef INIT_BUF - - do { - PREFETCH(src); - src += kInterleaveBytes; - - if (sizeof(Crc) > sizeof(Word)) { - crc0 ^= crc_carryover; - } - -#define FIRST(reg, next_reg) do { \ - if (kStride > reg) { \ - buf##reg ^= Downcast<Crc, Word>(crc##reg); \ - if (sizeof(Crc) > sizeof(Word)) { \ - if (reg < kStride - 1) { \ - crc##next_reg ^= SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \ - } else { \ - crc_carryover = SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \ - } \ - } \ - crc##reg = TABLE_ENTRY(this->crc_word_interleaved_, 0, buf##reg); \ - buf##reg >>= 8; \ - } \ -} while (0) - FIRST(0, 1); - FIRST(1, 2); - FIRST(2, 3); - FIRST(3, 4); - FIRST(4, 5); - FIRST(5, 6); - FIRST(6, 7); - FIRST(7, 0); -#undef FIRST - - for (size_t byte = 1; byte < sizeof(Word) - 1; ++byte) { -#define NEXT(reg) do { \ - if (kStride > reg) { \ - crc##reg ^= \ - TABLE_ENTRY(this->crc_word_interleaved_, byte, buf##reg); \ - buf##reg >>= 8; \ - } \ -} while(0) - REPEAT_FROM_0(NEXT); -#undef NEXT - } - -#define LAST(reg) do { \ - if (kStride > reg) { \ - crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_interleaved_, buf##reg); \ - buf##reg = reinterpret_cast<const Word *>(src)[reg]; \ - } \ -} while(0) - REPEAT_FROM_0(LAST); -#undef LAST - } - while (src < end); - - if (sizeof(Crc) > sizeof(Word)) { - crc0 ^= crc_carryover; - } - -#define COMBINE(reg) do { \ - if (kStride > reg) { \ - if (reg != 0) { \ - crc0 ^= crc##reg; \ - } \ - CRC_WORD(this, crc0, buf##reg); \ - } \ -} while (0) - REPEAT_FROM_0(COMBINE); -#undef COMBINE - - src += kInterleaveBytes; - } - end += 2*kInterleaveBytes - 1; - - // Process sizeof(Word) bytes at once. - end -= sizeof(Word) - 1; - for (; src < end; src += sizeof(Word)) { - Word buf0 = reinterpret_cast<const Word *>(src)[0]; - CRC_WORD(this, crc0, buf0); - } - end += sizeof(Word) - 1; - - // Compute CRC of remaining bytes. - for (;src < end; ++src) { - CRC_BYTE(this, crc0, *src); - } - - return (crc0 ^ Base().Canonize()); - } - - protected: - enum { - kInterleaveBytes = sizeof(Word) * kStride, - }; - - // Multiplication tables used by CRCs. - TableEntry crc_word_interleaved_[sizeof(Word)][256]; - TableEntry crc_word_[sizeof(Word)][256]; - - // Base class stored after CRC tables so that the most frequently - // used table is at offset 0 and may be accessed faster. - GfUtil<Crc> base_; - - friend class RollingCrc< GenericCrc<Crc, TableEntry, Word, kStride> >; - - private: - // CrcMultiword on amd64 may run at 1.2 CPU cycles per byte which is - // noticeably faster than CrcWord (2.2-2.6 cycles/byte depending on - // hardware and compiler). However, there are problems with compilers. - // - // Test system: P45 chipset, Intel Q9650 CPU, 800MHz 4-4-4-12 memory. - // - // 64-bit compiler, <= 64-bit CRC, 64-bit tables, 64-bit reads: - // CL 15.00.307291.1 C++ >1.2< CPU cycles/byte - // ICL 11.1.051 -O3 C++ 1.5 CPU cycles/byte - // GCC 4.5 -O3 C++ 2.0 CPU cycles/byte - // GCC 4.x -O3 ASM >1.2< CPU cycles/byte - // - // 32-bit compiler, MMX used, <= 64-bit CRC, 64-bit tables, 64-bit reads - // CL 15.00.307291.1 C++ 2.0 CPU cycles/byte - // GCC 4.5 -O3 C++ 1.9 CPU cycles/byte - // ICL 11.1.051 -S C++ 1.6 CPU cycles/byte - // GCC 4.x -O3 ASM >1.3< CPU cycles/byte - // - // So, use inline ASM code for GCC for both i386 and amd64. - - Crc CrcMultiwordI386Mmx( - const void *data, size_t bytes, const Crc &start) const; - Crc CrcMultiwordGccAmd64( - const void *data, size_t bytes, const Crc &start) const; - Crc CrcMultiwordGccAmd64Sse2( - const uint8 *src, const uint8 *end, const Crc &start) const; -} GCC_ALIGN_ATTRIBUTE(16); - -#undef REPEAT_FROM_0 -#undef REPEAT_FROM_1 - - -// Specialized variants. -#if CRCUTIL_USE_ASM - -#if (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX))) - -// Declare specialized functions. -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( - const void *data, size_t bytes, const uint64 &start) const; - -#if HAVE_AMD64 && HAVE_SSE2 -template<> -uint128_sse2 -GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword( - const void *data, size_t bytes, const uint128_sse2 &start) const; -#endif // HAVE_AMD64 && HAVE_SSE2 - -#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER <= 150030729 && \ - (HAVE_I386 && HAVE_MMX) - -// Work around bug in MSC (present at least in v. 15.00.30729.1) -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( - const void *data, - size_t bytes, - const uint64 &start) const; -template<> __forceinline -uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( - const void *data, - size_t bytes, - const uint64 &start) const { - typedef uint64 Word; - typedef uint64 Crc; - if (bytes <= 12) { - const uint8 *src = static_cast<const uint8 *>(data); - uint64 crc = start ^ Base().Canonize(); - for (const uint8 *end = src + bytes; src < end; ++src) { - CRC_BYTE(this, crc, *src); - } - return (crc ^ Base().Canonize()); - } - return CrcMultiwordI386Mmx(data, bytes, start); -} - -#endif // (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX))) - -#endif // CRCUTIL_USE_ASM - - -#pragma pack(pop) - -} // namespace crcutil - -#endif // CRCUTIL_GENERIC_CRC_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Defines GenericCrc class which implements arbitrary CRCs. +// +// Please read crc.pdf to understand how it all works. + +#ifndef CRCUTIL_GENERIC_CRC_H_ +#define CRCUTIL_GENERIC_CRC_H_ + +#include "base_types.h" // uint8 +#include "crc_casts.h" // TO_BYTE(), Downcast<>. +#include "gf_util.h" // GfUtil<Crc> class. +#include "platform.h" // GCC_ALIGN_ATTRIBUTE(16) +#include "uint128_sse2.h" // uint128_sse2 type (if necessary) + +namespace crcutil { + +#pragma pack(push, 16) + +// Extends CRC by one byte. +// Technically, if degree of a polynomial does not exceed 8, +// right shift by 8 bits is not required, but who cares about CRC-8? +#define CRC_BYTE(table, crc, byte) do { \ + crc = ((sizeof(crc) > 1) ? SHIFT_RIGHT_SAFE(crc, 8) : 0) ^ \ + table->crc_word_[sizeof(Word) - 1][TO_BYTE(crc) ^ (byte)]; \ +} while (0) + +#define TABLE_ENTRY(table, byte, buf) \ + table[byte][Downcast<Word, uint8>(buf)] + +#define TABLE_ENTRY_LAST(table, buf) \ + table[sizeof(Word) - 1][buf] + +// Extends CRC by one word. +#define CRC_WORD(table, crc, buf) do { \ + buf ^= Downcast<Crc, Word>(crc); \ + if (sizeof(crc) > sizeof(buf)) { \ + crc = SHIFT_RIGHT_SAFE(crc, sizeof(buf) * 8); \ + crc ^= TABLE_ENTRY(table->crc_word_, 0, buf); \ + } else { \ + crc = TABLE_ENTRY(table->crc_word_, 0, buf); \ + } \ + buf >>= 8; \ + for (size_t byte = 1; byte < sizeof(buf) - 1; ++byte) { \ + crc ^= TABLE_ENTRY(table->crc_word_, byte, buf); \ + buf >>= 8; \ + } \ + crc ^= TABLE_ENTRY_LAST(table->crc_word_, buf); \ +} while (0) + +// Process beginning of data block byte by byte until source pointer +// becomes perfectly aligned on Word boundary. +#define ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word) do { \ + while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { \ + if (src >= end) { \ + return (crc ^ table->Base().Canonize()); \ + } \ + CRC_BYTE(table, crc, *src); \ + src += 1; \ + } \ +} while (0) + + +// On amd64, enforcing alignment is 2-4% slower on small (<= 64 bytes) blocks +// but 6-10% faster on larger blocks (>= 2KB). +// Break-even point (+-1%) is around 1KB (Q9650, E6600). +// +#define ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, table, src, end, crc, Word) \ +do { \ + if (sizeof(Word) > 8 || (bytes) > CRCUTIL_MIN_ALIGN_SIZE) { \ + ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word); \ + } \ +} while (0) + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4127) // conditional expression is constant +#endif // defined(_MSC_VER) + +// Forward declarations. +template<typename CrcImplementation> class RollingCrc; + +// Crc is the type used internally and to return values of N-bit CRC. +// It should be at least as large as "TableEntry" and "Word" but +// may be larger (e.g. for 16-bit CRC, TableEntry and Word may be +// set to uint16 but Crc may be set to uint32). +// +// TableEntry is the type of values stored in the tables. +// To implement N-bit CRC, TableEntry should be large enough +// to store N bits. +// +// Word is the type used to read data sizeof(Word) at a time. +// Ideally, it shoulde be "most suitable for given architecture" +// integer type -- typically "size_t". +// +// kStride is the number of words processed in interleaved manner by +// CrcMultiword() and CrcWordblock(). Shall be either 3 or 4. +// Optimal value depends on hardware architecture (AMD64, ARM, etc). +// +template<typename _Crc, typename _TableEntry, typename _Word, int kStride> + class GenericCrc { + public: + // Make Crc, TableEntry, and Word types visible (used by RollingCrc etc.) + typedef _Crc Crc; + typedef _TableEntry TableEntry; + typedef _Word Word; + + GenericCrc() {} + + // Initializes the tables given generating polynomial of degree. + // If "canonical" is true, crc value will be XOR'ed with (-1) before and + // after actual CRC computation. + GenericCrc(const Crc &generating_polynomial, size_t degree, bool canonical) { + Init(generating_polynomial, degree, canonical); + } + void Init(const Crc &generating_polynomial, size_t degree, bool canonical) { + base_.Init(generating_polynomial, degree, canonical); + + // Instead of computing + // table[j][i] = MultiplyUnnormalized(i, 8, k), + // for all i = 0...255, we may notice that + // if i = 2**n then for all m = 1...(i-1) + // MultiplyUnnormalized(i + m, 8, k) = + // MultiplyUnnormalized(i ^ m, 8, k) = + // MultiplyUnnormalized(i, 8, k) ^ MultiplyUnnormalized(m, 8, k) = + // MultiplyUnnormalized(i, 8, k) ^ crc_word_interleaved[j][m] = + // table[i] ^ table[m]. +#if 0 + for (size_t j = 0; j < sizeof(Word); ++j) { + Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree); + for (size_t i = 0; i < 256; ++i) { + Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k); + this->crc_word_interleaved_[j][i] = Downcast<Crc, TableEntry>(temp); + } + } +#else + for (size_t j = 0; j < sizeof(Word); ++j) { + Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree); + TableEntry *table = this->crc_word_interleaved_[j]; + table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0. + for (size_t i = 1; i < 256; i <<= 1) { + TableEntry value = Downcast<Crc, TableEntry>( + Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k)); + table[i] = value; + for (size_t m = 1; m < i; ++m) { + table[i + m] = value ^ table[m]; + } + } + } +#endif + +#if 0 + for (size_t j = 0; j < sizeof(Word); ++j) { + Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree); + for (size_t i = 0; i < 256; ++i) { + Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k); + this->crc_word_[j][i] = Downcast<Crc, TableEntry>(temp); + } + } +#else + for (size_t j = 0; j < sizeof(Word); ++j) { + Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree); + TableEntry *table = this->crc_word_[j]; + table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0. + for (size_t i = 1; i < 256; i <<= 1) { + TableEntry value = Downcast<Crc, TableEntry>( + Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k)); + table[i] = value; + for (size_t m = 1; m < i; ++m) { + table[i + m] = value ^ table[m]; + } + } + } +#endif + } + + // Default CRC implementation + Crc CrcDefault(const void *data, size_t bytes, const Crc &start) const { +#if HAVE_AMD64 || HAVE_I386 + return CrcMultiword(data, bytes, start); +#else + // Very few CPUs have multiple ALUs and speculative execution + // (Itanium is an exception) so sophisticated algorithms will + // not perform better than good old Sarwate algorithm. + return CrcByteUnrolled(data, bytes, start); +#endif // HAVE_AMD64 || HAVE_I386 + } + + // Returns base class. + const GfUtil<Crc> &Base() const { return base_; } + + protected: + // Canonical, byte-by-byte CRC computation. + Crc CrcByte(const void *data, size_t bytes, const Crc &start) const { + const uint8 *src = static_cast<const uint8 *>(data); + Crc crc = start ^ Base().Canonize(); + for (const uint8 *end = src + bytes; src < end; ++src) { + CRC_BYTE(this, crc, *src); + } + return (crc ^ Base().Canonize()); + } + + // Byte-by-byte CRC with main loop unrolled. + Crc CrcByteUnrolled(const void *data, size_t bytes, const Crc &start) const { + if (bytes == 0) { + return start; + } + + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + Crc crc = start ^ Base().Canonize(); + + // Unroll loop 4 times. + end -= 3; + for (; src < end; src += 4) { + PREFETCH(src); + CRC_BYTE(this, crc, src[0]); + CRC_BYTE(this, crc, src[1]); + CRC_BYTE(this, crc, src[2]); + CRC_BYTE(this, crc, src[3]); + } + end += 3; + + // Compute CRC of remaining bytes. + for (; src < end; ++src) { + CRC_BYTE(this, crc, *src); + } + + return (crc ^ Base().Canonize()); + } + + // Canonical, byte-by-byte CRC computation. + Crc CrcByteWord(const void *data, size_t bytes, const Crc &start) const { + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + Crc crc0 = start ^ Base().Canonize(); + + ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Crc); + if (src >= end) { + return (crc0 ^ Base().Canonize()); + } + + // Process 4*sizeof(Crc) bytes at a time. + end -= 4 * sizeof(Crc) - 1; + for (; src < end; src += 4 * sizeof(Crc)) { + for (size_t i = 0; i < 4; ++i) { + crc0 ^= reinterpret_cast<const Crc *>(src)[i]; + if (i == 0) { + PREFETCH(src); + } + for (size_t byte = 0; byte < sizeof(crc0); ++byte) { + CRC_BYTE(this, crc0, 0); + } + } + } + end += 4 * sizeof(Crc) - 1; + + // Process sizeof(Crc) bytes at a time. + end -= sizeof(Crc) - 1; + for (; src < end; src += sizeof(Crc)) { + crc0 ^= reinterpret_cast<const Crc *>(src)[0]; + for (size_t byte = 0; byte < sizeof(crc0); ++byte) { + CRC_BYTE(this, crc0, 0); + } + } + end += sizeof(Crc) - 1; + + // Compute CRC of remaining bytes. + for (;src < end; ++src) { + CRC_BYTE(this, crc0, *src); + } + + return (crc0 ^ Base().Canonize()); + } + + // Faster, word-by-word CRC. + Crc CrcWord(const void *data, size_t bytes, const Crc &start) const { + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + Crc crc0 = start ^ Base().Canonize(); + + ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); + if (src >= end) { + return (crc0 ^ Base().Canonize()); + } + + // Process 4 sizeof(Word) bytes at once. + end -= 4 * sizeof(Word) - 1; + for (; src < end; src += 4 * sizeof(Word)) { + Word buf0 = reinterpret_cast<const Word *>(src)[0]; + PREFETCH(src); + CRC_WORD(this, crc0, buf0); + buf0 = reinterpret_cast<const Word *>(src)[1]; + CRC_WORD(this, crc0, buf0); + buf0 = reinterpret_cast<const Word *>(src)[2]; + CRC_WORD(this, crc0, buf0); + buf0 = reinterpret_cast<const Word *>(src)[3]; + CRC_WORD(this, crc0, buf0); + } + end += 4 * sizeof(Word) - 1; + + // Process sizeof(Word) bytes at a time. + end -= sizeof(Word) - 1; + for (; src < end; src += sizeof(Word)) { + Word buf0 = reinterpret_cast<const Word *>(src)[0]; + CRC_WORD(this, crc0, buf0); + } + end += sizeof(Word) - 1; + + // Compute CRC of remaining bytes. + for (;src < end; ++src) { + CRC_BYTE(this, crc0, *src); + } + + return (crc0 ^ Base().Canonize()); + } + +#define REPEAT_FROM_1(macro) \ + macro(1); \ + macro(2); \ + macro(3); \ + macro(4); \ + macro(5); \ + macro(6); \ + macro(7); + +#define REPEAT_FROM_0(macro) \ + macro(0); \ + REPEAT_FROM_1(macro) + + // Faster, process adjusent blocks in parallel and concatenate CRCs. + Crc CrcBlockword(const void *data, size_t bytes, const Crc &start) const { + if (kStride < 2 || kStride > 8) { + // Unsupported configuration; + // fall back to something sensible. + return CrcWord(data, bytes, start); + } + + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + Crc crc0 = start ^ Base().Canonize(); + enum { + // Add 16 to avoid false L1 cache collisions. + kStripe = (15*1024 + 16) & ~(sizeof(Word) - 1), + }; + + ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); + if (src >= end) { + return (crc0 ^ Base().Canonize()); + } + + end -= kStride * kStripe - 1; + if (src < end) { + Crc x_pow_8kStripe = Base().Xpow8N(kStripe); + do { + const uint8 *stripe_end = src + kStripe; + +#define INIT_CRC(reg) \ + Crc crc##reg; \ + if (kStride >= reg) { \ + crc##reg = 0; \ + } + REPEAT_FROM_1(INIT_CRC); +#undef INIT_CRC + + do { +#define FIRST(reg) \ + Word buf##reg; \ + if (kStride > reg) { \ + buf##reg = reinterpret_cast<const Word *>(src + reg * kStripe)[0]; \ + buf##reg ^= Downcast<Crc, Word>(crc##reg); \ + if (sizeof(crc##reg) > sizeof(buf##reg)) { \ + crc##reg = SHIFT_RIGHT_SAFE(crc##reg, sizeof(buf##reg) * 8); \ + crc##reg ^= TABLE_ENTRY(this->crc_word_, 0, buf##reg); \ + } else { \ + crc##reg = TABLE_ENTRY(this->crc_word_, 0, buf##reg); \ + } \ + buf##reg >>= 8; \ + } + REPEAT_FROM_0(FIRST); +#undef FIRST + + for (size_t byte = 1; byte < sizeof(buf0) - 1; ++byte) { +#define NEXT(reg) do { \ + if (kStride > reg) { \ + crc##reg ^= TABLE_ENTRY(this->crc_word_, byte, buf##reg); \ + buf##reg >>= 8; \ + } \ +} while (0) + REPEAT_FROM_0(NEXT); +#undef NEXT + } + +#define LAST(reg) do { \ + if (kStride > reg) { \ + crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_, buf##reg); \ + } \ +} while (0) + REPEAT_FROM_0(LAST); +#undef LAST + + src += sizeof(Word); + } while (src < stripe_end); + +#if 0 +// The code is left for illustrational purposes only. +#define COMBINE(reg) do { \ + if (reg > 0 && kStride > reg) { \ + crc0 = Base().ChangeStartValue(crc##reg, kStripe, 0, crc0); \ + } \ +} while (0) +#else +#define COMBINE(reg) do { \ + if (reg > 0 && kStride > reg) { \ + crc0 = crc##reg ^ Base().Multiply(crc0, x_pow_8kStripe); \ + } \ +} while (0) +#endif + REPEAT_FROM_0(COMBINE); +#undef COMBINE + + src += (kStride - 1) * kStripe; + } + while (src < end); + } + end += kStride * kStripe - 1; + + // Process sizeof(Word) bytes at a time. + end -= sizeof(Word) - 1; + for (; src < end; src += sizeof(Word)) { + Word buf0 = reinterpret_cast<const Word *>(src)[0]; + CRC_WORD(this, crc0, buf0); + } + end += sizeof(Word) - 1; + + // Compute CRC of remaining bytes. + for (;src < end; ++src) { + CRC_BYTE(this, crc0, *src); + } + + return (crc0 ^ Base().Canonize()); + } + + // Fastest, interleaved multi-byte CRC. + Crc CrcMultiword(const void *data, size_t bytes, const Crc &start) const { + if (kStride < 2 || kStride > 8) { + // Unsupported configuration; + // fall back to something sensible. + return CrcWord(data, bytes, start); + } + + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + Crc crc0 = start ^ Base().Canonize(); + + ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); + if (src >= end) { + return (crc0 ^ Base().Canonize()); + } + + // Process kStride Word registers at once; + // should have have at least 2*kInterleaveBytes of data to start. + end -= 2*kInterleaveBytes - 1; + if (src < end) { + Crc crc_carryover; + if (sizeof(Crc) > sizeof(Word)) { + // crc_carryover is used if and only if Crc is wider than Word. + crc_carryover = 0; + } +#define INIT_CRC(reg) \ + Crc crc##reg; \ + if (reg > 0 && kStride > reg) { \ + crc##reg = 0; \ + } + REPEAT_FROM_1(INIT_CRC); +#undef INIT_CRC + +#define INIT_BUF(reg) \ + Word buf##reg; \ + if (kStride > reg) { \ + buf##reg = reinterpret_cast<const Word *>(src)[reg]; \ + } + REPEAT_FROM_0(INIT_BUF); +#undef INIT_BUF + + do { + PREFETCH(src); + src += kInterleaveBytes; + + if (sizeof(Crc) > sizeof(Word)) { + crc0 ^= crc_carryover; + } + +#define FIRST(reg, next_reg) do { \ + if (kStride > reg) { \ + buf##reg ^= Downcast<Crc, Word>(crc##reg); \ + if (sizeof(Crc) > sizeof(Word)) { \ + if (reg < kStride - 1) { \ + crc##next_reg ^= SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \ + } else { \ + crc_carryover = SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \ + } \ + } \ + crc##reg = TABLE_ENTRY(this->crc_word_interleaved_, 0, buf##reg); \ + buf##reg >>= 8; \ + } \ +} while (0) + FIRST(0, 1); + FIRST(1, 2); + FIRST(2, 3); + FIRST(3, 4); + FIRST(4, 5); + FIRST(5, 6); + FIRST(6, 7); + FIRST(7, 0); +#undef FIRST + + for (size_t byte = 1; byte < sizeof(Word) - 1; ++byte) { +#define NEXT(reg) do { \ + if (kStride > reg) { \ + crc##reg ^= \ + TABLE_ENTRY(this->crc_word_interleaved_, byte, buf##reg); \ + buf##reg >>= 8; \ + } \ +} while(0) + REPEAT_FROM_0(NEXT); +#undef NEXT + } + +#define LAST(reg) do { \ + if (kStride > reg) { \ + crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_interleaved_, buf##reg); \ + buf##reg = reinterpret_cast<const Word *>(src)[reg]; \ + } \ +} while(0) + REPEAT_FROM_0(LAST); +#undef LAST + } + while (src < end); + + if (sizeof(Crc) > sizeof(Word)) { + crc0 ^= crc_carryover; + } + +#define COMBINE(reg) do { \ + if (kStride > reg) { \ + if (reg != 0) { \ + crc0 ^= crc##reg; \ + } \ + CRC_WORD(this, crc0, buf##reg); \ + } \ +} while (0) + REPEAT_FROM_0(COMBINE); +#undef COMBINE + + src += kInterleaveBytes; + } + end += 2*kInterleaveBytes - 1; + + // Process sizeof(Word) bytes at once. + end -= sizeof(Word) - 1; + for (; src < end; src += sizeof(Word)) { + Word buf0 = reinterpret_cast<const Word *>(src)[0]; + CRC_WORD(this, crc0, buf0); + } + end += sizeof(Word) - 1; + + // Compute CRC of remaining bytes. + for (;src < end; ++src) { + CRC_BYTE(this, crc0, *src); + } + + return (crc0 ^ Base().Canonize()); + } + + protected: + enum { + kInterleaveBytes = sizeof(Word) * kStride, + }; + + // Multiplication tables used by CRCs. + TableEntry crc_word_interleaved_[sizeof(Word)][256]; + TableEntry crc_word_[sizeof(Word)][256]; + + // Base class stored after CRC tables so that the most frequently + // used table is at offset 0 and may be accessed faster. + GfUtil<Crc> base_; + + friend class RollingCrc< GenericCrc<Crc, TableEntry, Word, kStride> >; + + private: + // CrcMultiword on amd64 may run at 1.2 CPU cycles per byte which is + // noticeably faster than CrcWord (2.2-2.6 cycles/byte depending on + // hardware and compiler). However, there are problems with compilers. + // + // Test system: P45 chipset, Intel Q9650 CPU, 800MHz 4-4-4-12 memory. + // + // 64-bit compiler, <= 64-bit CRC, 64-bit tables, 64-bit reads: + // CL 15.00.307291.1 C++ >1.2< CPU cycles/byte + // ICL 11.1.051 -O3 C++ 1.5 CPU cycles/byte + // GCC 4.5 -O3 C++ 2.0 CPU cycles/byte + // GCC 4.x -O3 ASM >1.2< CPU cycles/byte + // + // 32-bit compiler, MMX used, <= 64-bit CRC, 64-bit tables, 64-bit reads + // CL 15.00.307291.1 C++ 2.0 CPU cycles/byte + // GCC 4.5 -O3 C++ 1.9 CPU cycles/byte + // ICL 11.1.051 -S C++ 1.6 CPU cycles/byte + // GCC 4.x -O3 ASM >1.3< CPU cycles/byte + // + // So, use inline ASM code for GCC for both i386 and amd64. + + Crc CrcMultiwordI386Mmx( + const void *data, size_t bytes, const Crc &start) const; + Crc CrcMultiwordGccAmd64( + const void *data, size_t bytes, const Crc &start) const; + Crc CrcMultiwordGccAmd64Sse2( + const uint8 *src, const uint8 *end, const Crc &start) const; +} GCC_ALIGN_ATTRIBUTE(16); + +#undef REPEAT_FROM_0 +#undef REPEAT_FROM_1 + + +// Specialized variants. +#if CRCUTIL_USE_ASM + +#if (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX))) + +// Declare specialized functions. +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( + const void *data, size_t bytes, const uint64 &start) const; + +#if HAVE_AMD64 && HAVE_SSE2 +template<> +uint128_sse2 +GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword( + const void *data, size_t bytes, const uint128_sse2 &start) const; +#endif // HAVE_AMD64 && HAVE_SSE2 + +#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER <= 150030729 && \ + (HAVE_I386 && HAVE_MMX) + +// Work around bug in MSC (present at least in v. 15.00.30729.1) +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( + const void *data, + size_t bytes, + const uint64 &start) const; +template<> __forceinline +uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( + const void *data, + size_t bytes, + const uint64 &start) const { + typedef uint64 Word; + typedef uint64 Crc; + if (bytes <= 12) { + const uint8 *src = static_cast<const uint8 *>(data); + uint64 crc = start ^ Base().Canonize(); + for (const uint8 *end = src + bytes; src < end; ++src) { + CRC_BYTE(this, crc, *src); + } + return (crc ^ Base().Canonize()); + } + return CrcMultiwordI386Mmx(data, bytes, start); +} + +#endif // (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX))) + +#endif // CRCUTIL_USE_ASM + + +#pragma pack(pop) + +} // namespace crcutil + +#endif // CRCUTIL_GENERIC_CRC_H_ diff --git a/contrib/libs/crcutil/gf_util.h b/contrib/libs/crcutil/gf_util.h index 43c1d6b216..2a98dfd9e4 100644 --- a/contrib/libs/crcutil/gf_util.h +++ b/contrib/libs/crcutil/gf_util.h @@ -1,304 +1,304 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Defines GfUtil template class which implements -// 1. some useful operations in GF(2^n), -// 2. CRC helper function (e.g. concatenation of CRCs) which are -// not affected by specific implemenation of CRC computation per se. -// -// Please read crc.pdf to understand how it all works. - -#ifndef CRCUTIL_GF_UTIL_H_ -#define CRCUTIL_GF_UTIL_H_ - -#include "base_types.h" // uint8, uint64 -#include "crc_casts.h" // TO_BYTE() -#include "platform.h" // GCC_ALIGN_ATTRIBUTE(16), SHIFT_*_SAFE - -namespace crcutil { - -#pragma pack(push, 16) - -// "Crc" is the type used internally and to return values of N-bit CRC. -template<typename Crc> class GfUtil { - public: - // Initializes the tables given generating polynomial of degree (degree). - // If "canonical" is true, starting CRC value and computed CRC value will be - // XOR-ed with 111...111. - GfUtil() {} - GfUtil(const Crc &generating_polynomial, size_t degree, bool canonical) { - Init(generating_polynomial, degree, canonical); - } - void Init(const Crc &generating_polynomial, size_t degree, bool canonical) { - Crc one = 1; - one <<= degree - 1; - this->generating_polynomial_ = generating_polynomial; - this->crc_bytes_ = (degree + 7) >> 3; - this->degree_ = degree; - this->one_ = one; - if (canonical) { - this->canonize_ = one | (one - 1); - } else { - this->canonize_ = 0; - } - this->normalize_[0] = 0; - this->normalize_[1] = generating_polynomial; - - Crc k = one >> 1; - for (size_t i = 0; i < sizeof(uint64) * 8; ++i) { - this->x_pow_2n_[i] = k; - k = Multiply(k, k); - } - - this->crc_of_crc_ = Multiply(this->canonize_, - this->one_ ^ Xpow8N((degree + 7) >> 3)); - - FindLCD(Xpow8N(this->crc_bytes_), &this->x_pow_minus_W_); - } - - // Returns generating polynomial. - Crc GeneratingPolynomial() const { - return this->generating_polynomial_; - } - - // Returns number of bits in CRC (degree of generating polynomial). - size_t Degree() const { - return this->degree_; - } - - // Returns start/finish adjustment constant. - Crc Canonize() const { - return this->canonize_; - } - - // Returns normalized value of 1. - Crc One() const { - return this->one_; - } - - // Returns value of CRC(A, |A|, start_new) given known - // crc=CRC(A, |A|, start_old) -- without touching the data. - Crc ChangeStartValue(const Crc &crc, uint64 bytes, - const Crc &start_old, - const Crc &start_new) const { - return (crc ^ Multiply(start_new ^ start_old, Xpow8N(bytes))); - } - - // Returns CRC of concatenation of blocks A and B when CRCs - // of blocks A and B are known -- without touching the data. - // - // To be precise, given CRC(A, |A|, startA) and CRC(B, |B|, 0), - // returns CRC(AB, |AB|, startA). - Crc Concatenate(const Crc &crc_A, const Crc &crc_B, uint64 bytes_B) const { - return ChangeStartValue(crc_B, bytes_B, 0 /* start_B */, crc_A); - } - - // Returns CRC of sequence of zeroes -- without touching the data. - Crc CrcOfZeroes(uint64 bytes, const Crc &start) const { - Crc tmp = Multiply(start ^ this->canonize_, Xpow8N(bytes)); - return (tmp ^ this->canonize_); - } - - // Given CRC of a message, stores extra (degree + 7)/8 bytes after - // the message so that CRC(message+extra, start) = result. - // Does not change CRC start value (use ChangeStartValue for that). - // Returns number of stored bytes. - size_t StoreComplementaryCrc(void *dst, - const Crc &message_crc, - const Crc &result) const { - Crc crc0 = Multiply(result ^ this->canonize_, this->x_pow_minus_W_); - crc0 ^= message_crc ^ this->canonize_; - uint8 *d = reinterpret_cast<uint8 *>(dst); - for (size_t i = 0; i < this->crc_bytes_; ++i) { - d[i] = TO_BYTE(crc0); - crc0 >>= 8; - } - return this->crc_bytes_; - } - - // Stores given CRC of a message as (degree + 7)/8 bytes filled - // with 0s to the right. Returns number of stored bytes. - // CRC of the message and stored CRC is a constant value returned - // by CrcOfCrc() -- it does not depend on contents of the message. - size_t StoreCrc(void *dst, const Crc &crc) const { - uint8 *d = reinterpret_cast<uint8 *>(dst); - Crc crc0 = crc; - for (size_t i = 0; i < this->crc_bytes_; ++i) { - d[i] = TO_BYTE(crc0); - crc0 >>= 8; - } - return this->crc_bytes_; - } - - // Returns expected CRC value of CRC(Message,CRC(Message)) - // when CRC is stored after the message. This value is fixed - // and does not depend on the message or CRC start value. - Crc CrcOfCrc() const { - return this->crc_of_crc_; - } - - // Returns ((a * b) mod P) where "a" and "b" are of degree <= (D-1). - Crc Multiply(const Crc &aa, const Crc &bb) const { - Crc a = aa; - Crc b = bb; - if ((a ^ (a - 1)) < (b ^ (b - 1))) { - Crc temp = a; - a = b; - b = temp; - } - - if (a == 0) { - return a; - } - - Crc product = 0; - Crc one = this->one_; - for (; a != 0; a <<= 1) { - if ((a & one) != 0) { - product ^= b; - a ^= one; - } - b = (b >> 1) ^ this->normalize_[Downcast<Crc, size_t>(b & 1)]; - } - - return product; - } - - // Returns ((unnorm * m) mod P) where degree of m is <= (D-1) - // and degree of value "unnorm" is provided explicitly. - Crc MultiplyUnnormalized(const Crc &unnorm, size_t degree, - const Crc &m) const { - Crc v = unnorm; - Crc result = 0; - while (degree > this->degree_) { - degree -= this->degree_; - Crc value = v & (this->one_ | (this->one_ - 1)); - result ^= Multiply(value, Multiply(m, XpowN(degree))); - v >>= this->degree_; - } - result ^= Multiply(v << (this->degree_ - degree), m); - return result; - } - - // returns ((x ** n) mod P). - Crc XpowN(uint64 n) const { - Crc one = this->one_; - Crc result = one; - - for (size_t i = 0; n != 0; ++i, n >>= 1) { - if (n & 1) { - result = Multiply(result, this->x_pow_2n_[i]); - } - } - - return result; - } - - // Returns (x ** (8 * n) mod P). - Crc Xpow8N(uint64 n) const { - return XpowN(n << 3); - } - - // Returns remainder (A mod B) and sets *q = (A/B) of division - // of two polynomials: - // A = dividend + dividend_x_pow_D_coef * x**degree, - // B = divisor. - Crc Divide(const Crc ÷nd0, int dividend_x_pow_D_coef, - const Crc &divisor0, Crc *q) const { - Crc divisor = divisor0; - Crc dividend = dividend0; - Crc quotient = 0; - Crc coef = this->one_; - - while ((divisor & 1) == 0) { - divisor >>= 1; - coef >>= 1; - } - - if (dividend_x_pow_D_coef) { - quotient = coef >> 1; - dividend ^= divisor >> 1; - } - - Crc x_pow_degree_b = 1; - for (;;) { - if ((dividend & x_pow_degree_b) != 0) { - dividend ^= divisor; - quotient ^= coef; - } - if (coef == this->one_) { - break; - } - coef <<= 1; - x_pow_degree_b <<= 1; - divisor <<= 1; - } - - *q = quotient; - return dividend; - } - - // Extended Euclid's algorith -- for given A finds LCD(A, P) and - // value B such that (A * B) mod P = LCD(A, P). - Crc FindLCD(const Crc &A, Crc *B) const { - if (A == 0 || A == this->one_) { - *B = A; - return A; - } - - // Actually, generating polynomial is - // (generating_polynomial_ + x**degree). - int r0_x_pow_D_coef = 1; - Crc r0 = this->generating_polynomial_; - Crc b0 = 0; - Crc r1 = A; - Crc b1 = this->one_; - - for (;;) { - Crc q; - Crc r = Divide(r0, r0_x_pow_D_coef, r1, &q); - if (r == 0) { - break; - } - r0_x_pow_D_coef = 0; - - r0 = r1; - r1 = r; - - Crc b = b0 ^ Multiply(q, b1); - b0 = b1; - b1 = b; - } - - *B = b1; - return r1; - } - - protected: - Crc canonize_; - Crc x_pow_2n_[sizeof(uint64) * 8]; - Crc generating_polynomial_; - Crc one_; - Crc x_pow_minus_W_; - Crc crc_of_crc_; - Crc normalize_[2]; - size_t crc_bytes_; - size_t degree_; -} GCC_ALIGN_ATTRIBUTE(16); - -#pragma pack(pop) - -} // namespace crcutil - -#endif // CRCUTIL_GF_UTIL_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Defines GfUtil template class which implements +// 1. some useful operations in GF(2^n), +// 2. CRC helper function (e.g. concatenation of CRCs) which are +// not affected by specific implemenation of CRC computation per se. +// +// Please read crc.pdf to understand how it all works. + +#ifndef CRCUTIL_GF_UTIL_H_ +#define CRCUTIL_GF_UTIL_H_ + +#include "base_types.h" // uint8, uint64 +#include "crc_casts.h" // TO_BYTE() +#include "platform.h" // GCC_ALIGN_ATTRIBUTE(16), SHIFT_*_SAFE + +namespace crcutil { + +#pragma pack(push, 16) + +// "Crc" is the type used internally and to return values of N-bit CRC. +template<typename Crc> class GfUtil { + public: + // Initializes the tables given generating polynomial of degree (degree). + // If "canonical" is true, starting CRC value and computed CRC value will be + // XOR-ed with 111...111. + GfUtil() {} + GfUtil(const Crc &generating_polynomial, size_t degree, bool canonical) { + Init(generating_polynomial, degree, canonical); + } + void Init(const Crc &generating_polynomial, size_t degree, bool canonical) { + Crc one = 1; + one <<= degree - 1; + this->generating_polynomial_ = generating_polynomial; + this->crc_bytes_ = (degree + 7) >> 3; + this->degree_ = degree; + this->one_ = one; + if (canonical) { + this->canonize_ = one | (one - 1); + } else { + this->canonize_ = 0; + } + this->normalize_[0] = 0; + this->normalize_[1] = generating_polynomial; + + Crc k = one >> 1; + for (size_t i = 0; i < sizeof(uint64) * 8; ++i) { + this->x_pow_2n_[i] = k; + k = Multiply(k, k); + } + + this->crc_of_crc_ = Multiply(this->canonize_, + this->one_ ^ Xpow8N((degree + 7) >> 3)); + + FindLCD(Xpow8N(this->crc_bytes_), &this->x_pow_minus_W_); + } + + // Returns generating polynomial. + Crc GeneratingPolynomial() const { + return this->generating_polynomial_; + } + + // Returns number of bits in CRC (degree of generating polynomial). + size_t Degree() const { + return this->degree_; + } + + // Returns start/finish adjustment constant. + Crc Canonize() const { + return this->canonize_; + } + + // Returns normalized value of 1. + Crc One() const { + return this->one_; + } + + // Returns value of CRC(A, |A|, start_new) given known + // crc=CRC(A, |A|, start_old) -- without touching the data. + Crc ChangeStartValue(const Crc &crc, uint64 bytes, + const Crc &start_old, + const Crc &start_new) const { + return (crc ^ Multiply(start_new ^ start_old, Xpow8N(bytes))); + } + + // Returns CRC of concatenation of blocks A and B when CRCs + // of blocks A and B are known -- without touching the data. + // + // To be precise, given CRC(A, |A|, startA) and CRC(B, |B|, 0), + // returns CRC(AB, |AB|, startA). + Crc Concatenate(const Crc &crc_A, const Crc &crc_B, uint64 bytes_B) const { + return ChangeStartValue(crc_B, bytes_B, 0 /* start_B */, crc_A); + } + + // Returns CRC of sequence of zeroes -- without touching the data. + Crc CrcOfZeroes(uint64 bytes, const Crc &start) const { + Crc tmp = Multiply(start ^ this->canonize_, Xpow8N(bytes)); + return (tmp ^ this->canonize_); + } + + // Given CRC of a message, stores extra (degree + 7)/8 bytes after + // the message so that CRC(message+extra, start) = result. + // Does not change CRC start value (use ChangeStartValue for that). + // Returns number of stored bytes. + size_t StoreComplementaryCrc(void *dst, + const Crc &message_crc, + const Crc &result) const { + Crc crc0 = Multiply(result ^ this->canonize_, this->x_pow_minus_W_); + crc0 ^= message_crc ^ this->canonize_; + uint8 *d = reinterpret_cast<uint8 *>(dst); + for (size_t i = 0; i < this->crc_bytes_; ++i) { + d[i] = TO_BYTE(crc0); + crc0 >>= 8; + } + return this->crc_bytes_; + } + + // Stores given CRC of a message as (degree + 7)/8 bytes filled + // with 0s to the right. Returns number of stored bytes. + // CRC of the message and stored CRC is a constant value returned + // by CrcOfCrc() -- it does not depend on contents of the message. + size_t StoreCrc(void *dst, const Crc &crc) const { + uint8 *d = reinterpret_cast<uint8 *>(dst); + Crc crc0 = crc; + for (size_t i = 0; i < this->crc_bytes_; ++i) { + d[i] = TO_BYTE(crc0); + crc0 >>= 8; + } + return this->crc_bytes_; + } + + // Returns expected CRC value of CRC(Message,CRC(Message)) + // when CRC is stored after the message. This value is fixed + // and does not depend on the message or CRC start value. + Crc CrcOfCrc() const { + return this->crc_of_crc_; + } + + // Returns ((a * b) mod P) where "a" and "b" are of degree <= (D-1). + Crc Multiply(const Crc &aa, const Crc &bb) const { + Crc a = aa; + Crc b = bb; + if ((a ^ (a - 1)) < (b ^ (b - 1))) { + Crc temp = a; + a = b; + b = temp; + } + + if (a == 0) { + return a; + } + + Crc product = 0; + Crc one = this->one_; + for (; a != 0; a <<= 1) { + if ((a & one) != 0) { + product ^= b; + a ^= one; + } + b = (b >> 1) ^ this->normalize_[Downcast<Crc, size_t>(b & 1)]; + } + + return product; + } + + // Returns ((unnorm * m) mod P) where degree of m is <= (D-1) + // and degree of value "unnorm" is provided explicitly. + Crc MultiplyUnnormalized(const Crc &unnorm, size_t degree, + const Crc &m) const { + Crc v = unnorm; + Crc result = 0; + while (degree > this->degree_) { + degree -= this->degree_; + Crc value = v & (this->one_ | (this->one_ - 1)); + result ^= Multiply(value, Multiply(m, XpowN(degree))); + v >>= this->degree_; + } + result ^= Multiply(v << (this->degree_ - degree), m); + return result; + } + + // returns ((x ** n) mod P). + Crc XpowN(uint64 n) const { + Crc one = this->one_; + Crc result = one; + + for (size_t i = 0; n != 0; ++i, n >>= 1) { + if (n & 1) { + result = Multiply(result, this->x_pow_2n_[i]); + } + } + + return result; + } + + // Returns (x ** (8 * n) mod P). + Crc Xpow8N(uint64 n) const { + return XpowN(n << 3); + } + + // Returns remainder (A mod B) and sets *q = (A/B) of division + // of two polynomials: + // A = dividend + dividend_x_pow_D_coef * x**degree, + // B = divisor. + Crc Divide(const Crc ÷nd0, int dividend_x_pow_D_coef, + const Crc &divisor0, Crc *q) const { + Crc divisor = divisor0; + Crc dividend = dividend0; + Crc quotient = 0; + Crc coef = this->one_; + + while ((divisor & 1) == 0) { + divisor >>= 1; + coef >>= 1; + } + + if (dividend_x_pow_D_coef) { + quotient = coef >> 1; + dividend ^= divisor >> 1; + } + + Crc x_pow_degree_b = 1; + for (;;) { + if ((dividend & x_pow_degree_b) != 0) { + dividend ^= divisor; + quotient ^= coef; + } + if (coef == this->one_) { + break; + } + coef <<= 1; + x_pow_degree_b <<= 1; + divisor <<= 1; + } + + *q = quotient; + return dividend; + } + + // Extended Euclid's algorith -- for given A finds LCD(A, P) and + // value B such that (A * B) mod P = LCD(A, P). + Crc FindLCD(const Crc &A, Crc *B) const { + if (A == 0 || A == this->one_) { + *B = A; + return A; + } + + // Actually, generating polynomial is + // (generating_polynomial_ + x**degree). + int r0_x_pow_D_coef = 1; + Crc r0 = this->generating_polynomial_; + Crc b0 = 0; + Crc r1 = A; + Crc b1 = this->one_; + + for (;;) { + Crc q; + Crc r = Divide(r0, r0_x_pow_D_coef, r1, &q); + if (r == 0) { + break; + } + r0_x_pow_D_coef = 0; + + r0 = r1; + r1 = r; + + Crc b = b0 ^ Multiply(q, b1); + b0 = b1; + b1 = b; + } + + *B = b1; + return r1; + } + + protected: + Crc canonize_; + Crc x_pow_2n_[sizeof(uint64) * 8]; + Crc generating_polynomial_; + Crc one_; + Crc x_pow_minus_W_; + Crc crc_of_crc_; + Crc normalize_[2]; + size_t crc_bytes_; + size_t degree_; +} GCC_ALIGN_ATTRIBUTE(16); + +#pragma pack(pop) + +} // namespace crcutil + +#endif // CRCUTIL_GF_UTIL_H_ diff --git a/contrib/libs/crcutil/interface.cc b/contrib/libs/crcutil/interface.cc index dfefebba06..cecc0e3c95 100644 --- a/contrib/libs/crcutil/interface.cc +++ b/contrib/libs/crcutil/interface.cc @@ -1,307 +1,307 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This is the only file where all details of CRC implementation are buried. - -#include "interface.h" - -#include "aligned_alloc.h" -#include "crc32c_sse4.h" -#include "generic_crc.h" -#include "protected_crc.h" -#include "rolling_crc.h" - -// Align all CRC tables on kAlign boundary. -// Shall be exact power of 2. -static size_t kAlign = 4 * 1024; - -using namespace crcutil; - +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This is the only file where all details of CRC implementation are buried. + +#include "interface.h" + +#include "aligned_alloc.h" +#include "crc32c_sse4.h" +#include "generic_crc.h" +#include "protected_crc.h" +#include "rolling_crc.h" + +// Align all CRC tables on kAlign boundary. +// Shall be exact power of 2. +static size_t kAlign = 4 * 1024; + +using namespace crcutil; + #if (!defined(__clang__) && defined(__GNUC__)) -// Suppress 'invalid access to non-static data member ... of NULL object' -#undef offsetof -#define offsetof(TYPE, MEMBER) (reinterpret_cast <size_t> \ - ((&reinterpret_cast <const char &>( \ - reinterpret_cast <const TYPE *>(1)->MEMBER))) - 1) -#endif // defined(__GNUC__) - -namespace crcutil_interface { - -template<typename CrcImplementation, typename RollingCrcImplementation> - class Implementation : public CRC { - public: - typedef typename CrcImplementation::Crc Crc; - typedef Implementation<CrcImplementation, RollingCrcImplementation> Self; - - Implementation(const Crc &poly, - size_t degree, - bool canonical, - const Crc &roll_start_value, - size_t roll_length) - : crc_(poly, degree, canonical), - rolling_crc_(crc_, roll_length, roll_start_value) { - } - - static Self *Create(const Crc &poly, - size_t degree, - bool canonical, - const Crc &roll_start_value, - size_t roll_length, - const void **allocated_memory) { - void *memory = AlignedAlloc(sizeof(Self), - offsetof(Self, crc_), - kAlign, - allocated_memory); - return new(memory) Self(poly, - degree, - canonical, - roll_start_value, - roll_length); - } - - virtual void Delete() { - AlignedFree(this); - } - - void *operator new(size_t, void *p) { - return p; - } - - virtual void GeneratingPolynomial(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const { - SetValue(crc_.Base().GeneratingPolynomial(), lo, hi); - } - - virtual size_t Degree() const { - return crc_.Base().Degree(); - } - - virtual void CanonizeValue(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const { - SetValue(crc_.Base().Canonize(), lo, hi); - } - - virtual void RollStartValue(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const { - SetValue(rolling_crc_.StartValue(), lo, hi); - } - - virtual size_t RollWindowBytes() const { - return rolling_crc_.WindowBytes(); - } - - virtual void SelfCheckValue(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const { - Crc crc = crc_.CrcDefault(&crc_, sizeof(crc_), 0); - crc = crc_.CrcDefault(&rolling_crc_, sizeof(rolling_crc_), crc); - SetValue(crc, lo, hi); - } - - virtual void Compute(const void *data, - size_t bytes, - /* INOUT */ UINT64 *lo, - /* INOUT */ UINT64 *hi = NULL) const { - SetValue(crc_.CrcDefault(data, bytes, GetValue(lo, hi)), lo, hi); - } - - virtual void RollStart(const void *data, - /* INOUT */ UINT64 *lo, - /* INOUT */ UINT64 *hi = NULL) const { - SetValue(rolling_crc_.Start(data), lo, hi); - } - - virtual void Roll(size_t byte_out, - size_t byte_in, - /* INOUT */ UINT64 *lo, - /* INOUT */ UINT64 *hi = NULL) const { - SetValue(rolling_crc_.Roll(GetValue(lo, hi), byte_out, byte_in), lo, hi); - } - - virtual void CrcOfZeroes(UINT64 bytes, - /* INOUT */ UINT64 *lo, - /* INOUT */ UINT64 *hi = NULL) const { - SetValue(crc_.Base().CrcOfZeroes(bytes, GetValue(lo, hi)), lo, hi); - } - - virtual void ChangeStartValue( - UINT64 start_old_lo, UINT64 start_old_hi, - UINT64 start_new_lo, UINT64 start_new_hi, - UINT64 bytes, - /* INOUT */ UINT64 *lo, - /* INOUT */ UINT64 *hi = NULL) const { - SetValue(crc_.Base().ChangeStartValue( - GetValue(lo, hi), - bytes, - GetValue(start_old_lo, start_old_hi), - GetValue(start_new_lo, start_new_hi)), - lo, - hi); - } - - virtual void Concatenate(UINT64 crcB_lo, UINT64 crcB_hi, - UINT64 bytes_B, - /* INOUT */ UINT64* crcA_lo, - /* INOUT */ UINT64* crcA_hi = NULL) const { - SetValue(crc_.Base().Concatenate(GetValue(crcA_lo, crcA_hi), - GetValue(crcB_lo, crcB_hi), - bytes_B), - crcA_lo, - crcA_hi); - } - - virtual size_t StoreComplementaryCrc( - void *dst, - UINT64 message_crc_lo, UINT64 message_crc_hi, - UINT64 result_crc_lo, UINT64 result_crc_hi = 0) const { - return crc_.Base().StoreComplementaryCrc( - dst, - GetValue(message_crc_lo, message_crc_hi), - GetValue(result_crc_lo, result_crc_hi)); - } - - virtual size_t StoreCrc(void *dst, - UINT64 lo, - UINT64 hi = 0) const { - return crc_.Base().StoreCrc(dst, GetValue(lo, hi)); - } - - virtual void CrcOfCrc(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const { - SetValue(crc_.Base().CrcOfCrc(), lo, hi); - } - - private: - static Crc GetValue(UINT64 *lo, UINT64 *hi) { - if (sizeof(Crc) <= sizeof(*lo)) { - return CrcFromUint64<Crc>(*lo); - } else { - return CrcFromUint64<Crc>(*lo, *hi); - } - } - - static Crc GetValue(UINT64 lo, UINT64 hi) { - return CrcFromUint64<Crc>(lo, hi); - } - - static void SetValue(const Crc &crc, UINT64 *lo, UINT64 *hi) { - Uint64FromCrc<Crc>(crc, - reinterpret_cast<crcutil::uint64 *>(lo), - reinterpret_cast<crcutil::uint64 *>(hi)); - } - - const CrcImplementation crc_; - const RollingCrcImplementation rolling_crc_; - - const Self &operator =(const Self &) {} -}; - -#if defined(_MSC_VER) -// 'use_sse4_2' : unreferenced formal parameter -#pragma warning(disable: 4100) -#endif // defined(_MSC_VER) - -bool CRC::IsSSE42Available() { -#if HAVE_AMD64 || HAVE_I386 - return Crc32cSSE4::IsSSE42Available(); -#else - return false; -#endif // HAVE_AMD64 || HAVE_I386 -} - -CRC::~CRC() {} -CRC::CRC() {} - -CRC *CRC::Create(UINT64 poly_lo, - UINT64 poly_hi, - size_t degree, - bool canonical, - UINT64 roll_start_value_lo, - UINT64 roll_start_value_hi, - size_t roll_length, - bool use_sse4_2, - const void **allocated_memory) { - if (degree == 0) { - return NULL; - } - - if (degree > 64) { -#if !HAVE_SSE2 - return NULL; -#else - if (degree > 128) { - return NULL; - } - uint128_sse2 poly = CrcFromUint64<uint128_sse2>(poly_lo, poly_hi); - if (degree != 128 && (poly >> degree) != 0) { - return NULL; - } - uint128_sse2 roll_start_value = - CrcFromUint64<uint128_sse2>(roll_start_value_lo, roll_start_value_hi); - if (degree != 128 && (roll_start_value >> degree) != 0) { - return NULL; - } -#if HAVE_I386 - typedef GenericCrc<uint128_sse2, uint128_sse2, crcutil::uint32, 3> Crc128; -#elif defined(__GNUC__) && GCC_VERSION_AVAILABLE(4, 5) - typedef GenericCrc<uint128_sse2, uint128_sse2, crcutil::uint64, 6> Crc128; -#else - typedef GenericCrc<uint128_sse2, uint128_sse2, crcutil::uint64, 4> Crc128; -#endif // HAVE_I386 - return Implementation<Crc128, RollingCrc<Crc128> >::Create( - poly, - degree, - canonical, - roll_start_value, - roll_length, - allocated_memory); -#endif // !HAVE_SSE2 - } - -#if CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64) - if (use_sse4_2 && - degree == Crc32cSSE4::FixedDegree() && - poly_lo == Crc32cSSE4::FixedGeneratingPolynomial() && - poly_hi == 0) { - if (roll_start_value_hi != 0 || (roll_start_value_lo >> 32) != 0) { - return NULL; - } - return Implementation<Crc32cSSE4, RollingCrc32cSSE4>::Create( - static_cast<size_t>(poly_lo), - degree, - canonical, - static_cast<size_t>(roll_start_value_lo), - static_cast<size_t>(roll_length), - allocated_memory); - } -#endif // CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64) - - if (poly_hi != 0 || (degree != 64 && (poly_lo >> degree) != 0)) { - return NULL; - } - if (roll_start_value_hi != 0 || - (degree != 64 && (roll_start_value_lo >> degree) != 0)) { - return NULL; - } - typedef GenericCrc<crcutil::uint64, crcutil::uint64, crcutil::uint64, 4> - Crc64; - return Implementation<Crc64, RollingCrc<Crc64> >::Create( - poly_lo, - degree, - canonical, - roll_start_value_lo, - roll_length, - allocated_memory); -} - -} // namespace crcutil_interface +// Suppress 'invalid access to non-static data member ... of NULL object' +#undef offsetof +#define offsetof(TYPE, MEMBER) (reinterpret_cast <size_t> \ + ((&reinterpret_cast <const char &>( \ + reinterpret_cast <const TYPE *>(1)->MEMBER))) - 1) +#endif // defined(__GNUC__) + +namespace crcutil_interface { + +template<typename CrcImplementation, typename RollingCrcImplementation> + class Implementation : public CRC { + public: + typedef typename CrcImplementation::Crc Crc; + typedef Implementation<CrcImplementation, RollingCrcImplementation> Self; + + Implementation(const Crc &poly, + size_t degree, + bool canonical, + const Crc &roll_start_value, + size_t roll_length) + : crc_(poly, degree, canonical), + rolling_crc_(crc_, roll_length, roll_start_value) { + } + + static Self *Create(const Crc &poly, + size_t degree, + bool canonical, + const Crc &roll_start_value, + size_t roll_length, + const void **allocated_memory) { + void *memory = AlignedAlloc(sizeof(Self), + offsetof(Self, crc_), + kAlign, + allocated_memory); + return new(memory) Self(poly, + degree, + canonical, + roll_start_value, + roll_length); + } + + virtual void Delete() { + AlignedFree(this); + } + + void *operator new(size_t, void *p) { + return p; + } + + virtual void GeneratingPolynomial(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const { + SetValue(crc_.Base().GeneratingPolynomial(), lo, hi); + } + + virtual size_t Degree() const { + return crc_.Base().Degree(); + } + + virtual void CanonizeValue(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const { + SetValue(crc_.Base().Canonize(), lo, hi); + } + + virtual void RollStartValue(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const { + SetValue(rolling_crc_.StartValue(), lo, hi); + } + + virtual size_t RollWindowBytes() const { + return rolling_crc_.WindowBytes(); + } + + virtual void SelfCheckValue(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const { + Crc crc = crc_.CrcDefault(&crc_, sizeof(crc_), 0); + crc = crc_.CrcDefault(&rolling_crc_, sizeof(rolling_crc_), crc); + SetValue(crc, lo, hi); + } + + virtual void Compute(const void *data, + size_t bytes, + /* INOUT */ UINT64 *lo, + /* INOUT */ UINT64 *hi = NULL) const { + SetValue(crc_.CrcDefault(data, bytes, GetValue(lo, hi)), lo, hi); + } + + virtual void RollStart(const void *data, + /* INOUT */ UINT64 *lo, + /* INOUT */ UINT64 *hi = NULL) const { + SetValue(rolling_crc_.Start(data), lo, hi); + } + + virtual void Roll(size_t byte_out, + size_t byte_in, + /* INOUT */ UINT64 *lo, + /* INOUT */ UINT64 *hi = NULL) const { + SetValue(rolling_crc_.Roll(GetValue(lo, hi), byte_out, byte_in), lo, hi); + } + + virtual void CrcOfZeroes(UINT64 bytes, + /* INOUT */ UINT64 *lo, + /* INOUT */ UINT64 *hi = NULL) const { + SetValue(crc_.Base().CrcOfZeroes(bytes, GetValue(lo, hi)), lo, hi); + } + + virtual void ChangeStartValue( + UINT64 start_old_lo, UINT64 start_old_hi, + UINT64 start_new_lo, UINT64 start_new_hi, + UINT64 bytes, + /* INOUT */ UINT64 *lo, + /* INOUT */ UINT64 *hi = NULL) const { + SetValue(crc_.Base().ChangeStartValue( + GetValue(lo, hi), + bytes, + GetValue(start_old_lo, start_old_hi), + GetValue(start_new_lo, start_new_hi)), + lo, + hi); + } + + virtual void Concatenate(UINT64 crcB_lo, UINT64 crcB_hi, + UINT64 bytes_B, + /* INOUT */ UINT64* crcA_lo, + /* INOUT */ UINT64* crcA_hi = NULL) const { + SetValue(crc_.Base().Concatenate(GetValue(crcA_lo, crcA_hi), + GetValue(crcB_lo, crcB_hi), + bytes_B), + crcA_lo, + crcA_hi); + } + + virtual size_t StoreComplementaryCrc( + void *dst, + UINT64 message_crc_lo, UINT64 message_crc_hi, + UINT64 result_crc_lo, UINT64 result_crc_hi = 0) const { + return crc_.Base().StoreComplementaryCrc( + dst, + GetValue(message_crc_lo, message_crc_hi), + GetValue(result_crc_lo, result_crc_hi)); + } + + virtual size_t StoreCrc(void *dst, + UINT64 lo, + UINT64 hi = 0) const { + return crc_.Base().StoreCrc(dst, GetValue(lo, hi)); + } + + virtual void CrcOfCrc(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const { + SetValue(crc_.Base().CrcOfCrc(), lo, hi); + } + + private: + static Crc GetValue(UINT64 *lo, UINT64 *hi) { + if (sizeof(Crc) <= sizeof(*lo)) { + return CrcFromUint64<Crc>(*lo); + } else { + return CrcFromUint64<Crc>(*lo, *hi); + } + } + + static Crc GetValue(UINT64 lo, UINT64 hi) { + return CrcFromUint64<Crc>(lo, hi); + } + + static void SetValue(const Crc &crc, UINT64 *lo, UINT64 *hi) { + Uint64FromCrc<Crc>(crc, + reinterpret_cast<crcutil::uint64 *>(lo), + reinterpret_cast<crcutil::uint64 *>(hi)); + } + + const CrcImplementation crc_; + const RollingCrcImplementation rolling_crc_; + + const Self &operator =(const Self &) {} +}; + +#if defined(_MSC_VER) +// 'use_sse4_2' : unreferenced formal parameter +#pragma warning(disable: 4100) +#endif // defined(_MSC_VER) + +bool CRC::IsSSE42Available() { +#if HAVE_AMD64 || HAVE_I386 + return Crc32cSSE4::IsSSE42Available(); +#else + return false; +#endif // HAVE_AMD64 || HAVE_I386 +} + +CRC::~CRC() {} +CRC::CRC() {} + +CRC *CRC::Create(UINT64 poly_lo, + UINT64 poly_hi, + size_t degree, + bool canonical, + UINT64 roll_start_value_lo, + UINT64 roll_start_value_hi, + size_t roll_length, + bool use_sse4_2, + const void **allocated_memory) { + if (degree == 0) { + return NULL; + } + + if (degree > 64) { +#if !HAVE_SSE2 + return NULL; +#else + if (degree > 128) { + return NULL; + } + uint128_sse2 poly = CrcFromUint64<uint128_sse2>(poly_lo, poly_hi); + if (degree != 128 && (poly >> degree) != 0) { + return NULL; + } + uint128_sse2 roll_start_value = + CrcFromUint64<uint128_sse2>(roll_start_value_lo, roll_start_value_hi); + if (degree != 128 && (roll_start_value >> degree) != 0) { + return NULL; + } +#if HAVE_I386 + typedef GenericCrc<uint128_sse2, uint128_sse2, crcutil::uint32, 3> Crc128; +#elif defined(__GNUC__) && GCC_VERSION_AVAILABLE(4, 5) + typedef GenericCrc<uint128_sse2, uint128_sse2, crcutil::uint64, 6> Crc128; +#else + typedef GenericCrc<uint128_sse2, uint128_sse2, crcutil::uint64, 4> Crc128; +#endif // HAVE_I386 + return Implementation<Crc128, RollingCrc<Crc128> >::Create( + poly, + degree, + canonical, + roll_start_value, + roll_length, + allocated_memory); +#endif // !HAVE_SSE2 + } + +#if CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64) + if (use_sse4_2 && + degree == Crc32cSSE4::FixedDegree() && + poly_lo == Crc32cSSE4::FixedGeneratingPolynomial() && + poly_hi == 0) { + if (roll_start_value_hi != 0 || (roll_start_value_lo >> 32) != 0) { + return NULL; + } + return Implementation<Crc32cSSE4, RollingCrc32cSSE4>::Create( + static_cast<size_t>(poly_lo), + degree, + canonical, + static_cast<size_t>(roll_start_value_lo), + static_cast<size_t>(roll_length), + allocated_memory); + } +#endif // CRCUTIL_USE_MM_CRC32 && (HAVE_I386 || HAVE_AMD64) + + if (poly_hi != 0 || (degree != 64 && (poly_lo >> degree) != 0)) { + return NULL; + } + if (roll_start_value_hi != 0 || + (degree != 64 && (roll_start_value_lo >> degree) != 0)) { + return NULL; + } + typedef GenericCrc<crcutil::uint64, crcutil::uint64, crcutil::uint64, 4> + Crc64; + return Implementation<Crc64, RollingCrc<Crc64> >::Create( + poly_lo, + degree, + canonical, + roll_start_value_lo, + roll_length, + allocated_memory); +} + +} // namespace crcutil_interface diff --git a/contrib/libs/crcutil/interface.h b/contrib/libs/crcutil/interface.h index 2b3e2ee939..2c43f95ba9 100644 --- a/contrib/libs/crcutil/interface.h +++ b/contrib/libs/crcutil/interface.h @@ -1,204 +1,204 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Example how to use CRC implementation via the interface which -// hides details of implementation. -// -// The raw implementation is not indended to be used in a project -// directly because: -// - Implementation lives in the header files because that is the -// only way to use templates efficiently. -// - Header files are quite "dirty" -- they define and use a -// lot of macros. Bringing these macros to all files in -// a project is not particularly good idea. -// - The code takes forever to compile with GCC (e.g. GCC -// 4.4.3 and 4.5.0 compile the unittest for about 30 seconds). -// -// Solution: -// - Create your own, clean interface. -// - Do not expose interface internals in a header file. -// - Proxy all calls to your interface to CRC implementation. -// - Keep only one copy of actual implementation. - -#ifndef CRCUTIL_INTERFACE_H_ -#define CRCUTIL_INTERFACE_H_ - -#include "std_headers.h" // size_t - -namespace crcutil_interface { - -// Many projects define their own uint64. Do it here. -typedef unsigned long long UINT64; - -class CRC { - public: - // Creates new instance of CRC class. - // If arguments are illegal (e.g. provided generating polynomial - // has more bits than provided degree), returns NULL. - // - // poly_* - generating polynomial (reversed bit format). - // degree - degree of generating polynomial. - // canonical - if true, input CRC value will be XOR'ed with - // (inverted) before and after CRC computation. - // roll_start_value - starting value of rolling CRC. - // roll_window_bytes - length of rolling CRC window in bytes. - // If roll_length is 0, roll_start_value - // shall be 0. - // use_sse4_2 - if true, use SSE4.2 crc32 instruction to compute - // CRC when generating polynomial is CRC32C (Castagnoli) - // allocated_memory - optional (may be NULL) address of a variable - // to store the address of actually allocated memory. - static CRC *Create(UINT64 poly_lo, - UINT64 poly_hi, - size_t degree, - bool canonical, - UINT64 roll_start_value_lo, - UINT64 roll_start_value_hi, - size_t roll_window_bytes, - bool use_sse4_2, - const void **allocated_memory); - - // Deletes the instance of CRC class. - virtual void Delete() = 0; - - // Returns true if SSE4.2 is available. - static bool IsSSE42Available(); - - // Returns generating polynomial. - virtual void GeneratingPolynomial(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const = 0; - - // Returns degree of generating polynomial. - virtual size_t Degree() const = 0; - - // Returns canonization constant used to XOR crc value - // before and after CRC computation. - virtual void CanonizeValue(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const = 0; - - // Returns rolling CRC starting value. - virtual void RollStartValue(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const = 0; - - // Returns length of rolling CRC window. - virtual size_t RollWindowBytes() const = 0; - - // Returns CRC of CRC tables to enable verification - // of integrity of CRC function itself by comparing - // the result with pre-computed value. - virtual void SelfCheckValue(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const = 0; - - // Given CRC value of previous chunk of data, - // extends it to new chunk, retuning the result in-place. - // - // If degree of CRC polynomial is 64 or less, - // (*hi) will not be touched. - virtual void Compute(const void *data, - size_t bytes, - /* INOUT */ UINT64 *lo, - /* INOUT */ UINT64 *hi = NULL) const = 0; - - // Starts rolling CRC by computing CRC of first - // "roll_length" bytes of "data", using "roll_start_value" - // as starting value (see Create()). - // Should not be called if the value of "roll_value" was 0. - virtual void RollStart(const void *data, - /* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const = 0; - - // Rolls CRC by 1 byte, given the bytes leaving and - // entering the window of "roll_length" bytes. - // RollStart() should be called before "Roll". - // Should not be called if the value of "roll_value" was 0. - virtual void Roll(size_t byte_out, - size_t byte_in, - /* INOUT */ UINT64 *lo, - /* INOUT */ UINT64 *hi = NULL) const = 0; - - // Computes CRC of sequence of zeroes -- without touching the data. - virtual void CrcOfZeroes(UINT64 bytes, - /* INOUT */ UINT64 *lo, - /* INOUT */ UINT64 *hi = NULL) const = 0; - - // Computes value of CRC(A, bytes, start_new) given known - // crc=CRC(A, bytes, start_old) -- without touching the data. - virtual void ChangeStartValue( - UINT64 start_old_lo, UINT64 start_old_hi, - UINT64 start_new_lo, UINT64 start_new_hi, - UINT64 bytes, - /* INOUT */ UINT64 *lo, - /* INOUT */ UINT64 *hi = NULL) const = 0; - - // Returns CRC of concatenation of blocks A and B when CRCs - // of blocks A and B are known -- without touching the data. - // - // To be precise, given CRC(A, |A|, startA) and CRC(B, |B|, 0), - // returns CRC(AB, |AB|, startA). - virtual void Concatenate(UINT64 crcB_lo, UINT64 crcB_hi, - UINT64 bytes_B, - /* INOUT */ UINT64* crcA_lo, - /* INOUT */ UINT64* crcA_hi = NULL) const = 0; - - // Given CRC of a message, stores extra (degree + 7)/8 bytes after - // the message so that CRC(message+extra, start) = result. - // Does not change CRC start value (use ChangeStartValue for that). - // Returns number of stored bytes. - virtual size_t StoreComplementaryCrc( - void *dst, - UINT64 message_crc_lo, UINT64 message_crc_hi, - UINT64 result_crc_lo, UINT64 result_crc_hi = 0) const = 0; - - // Stores given CRC of a message as (degree + 7)/8 bytes filled - // with 0s to the right. Returns number of stored bytes. - // CRC of the message and stored CRC is a constant value returned - // by CrcOfCrc() -- it does not depend on contents of the message. - virtual size_t StoreCrc(/* OUT */ void *dst, - UINT64 lo, - UINT64 hi = 0) const = 0; - - // Computes expected CRC value of CRC(Message,CRC(Message)) - // when CRC is stored after the message. This value is fixed - // and does not depend on the message or CRC start value. - virtual void CrcOfCrc(/* OUT */ UINT64 *lo, - /* OUT */ UINT64 *hi = NULL) const = 0; - - protected: - // CRC instance should be created only once (most of the time): - // - Creation and initializion is relatively expensive. - // - CRC is fully defined by its generating polynomials - // (well, and few more parameters). - // - CRC instances are pure constants. There is no - // reason to have 2 instances of the same CRC. - // - There are not too many generating polynomials that are - // used on practice. It is hard to imagine a project - // which uses 50 different generating polynomials. - // Thus, a handful of CRC instances is sufficient - // to cover the needs of even very large project. - // - Finally and most importantly, CRC tables should be - // aligned properly. No, the instances of CRC class - // are not created by blind "new" -- they use placement - // "new" and, in absense of placement "delete", - // should be deleted by calling explicit Delete() method. - virtual ~CRC(); - - // Cannot instantiate the class -- instances may be created - // by CRC::Create() only. - CRC(); -}; - -} // namespace crcutil_interface - - -#endif // CRCUTIL_INTERFACE_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Example how to use CRC implementation via the interface which +// hides details of implementation. +// +// The raw implementation is not indended to be used in a project +// directly because: +// - Implementation lives in the header files because that is the +// only way to use templates efficiently. +// - Header files are quite "dirty" -- they define and use a +// lot of macros. Bringing these macros to all files in +// a project is not particularly good idea. +// - The code takes forever to compile with GCC (e.g. GCC +// 4.4.3 and 4.5.0 compile the unittest for about 30 seconds). +// +// Solution: +// - Create your own, clean interface. +// - Do not expose interface internals in a header file. +// - Proxy all calls to your interface to CRC implementation. +// - Keep only one copy of actual implementation. + +#ifndef CRCUTIL_INTERFACE_H_ +#define CRCUTIL_INTERFACE_H_ + +#include "std_headers.h" // size_t + +namespace crcutil_interface { + +// Many projects define their own uint64. Do it here. +typedef unsigned long long UINT64; + +class CRC { + public: + // Creates new instance of CRC class. + // If arguments are illegal (e.g. provided generating polynomial + // has more bits than provided degree), returns NULL. + // + // poly_* - generating polynomial (reversed bit format). + // degree - degree of generating polynomial. + // canonical - if true, input CRC value will be XOR'ed with + // (inverted) before and after CRC computation. + // roll_start_value - starting value of rolling CRC. + // roll_window_bytes - length of rolling CRC window in bytes. + // If roll_length is 0, roll_start_value + // shall be 0. + // use_sse4_2 - if true, use SSE4.2 crc32 instruction to compute + // CRC when generating polynomial is CRC32C (Castagnoli) + // allocated_memory - optional (may be NULL) address of a variable + // to store the address of actually allocated memory. + static CRC *Create(UINT64 poly_lo, + UINT64 poly_hi, + size_t degree, + bool canonical, + UINT64 roll_start_value_lo, + UINT64 roll_start_value_hi, + size_t roll_window_bytes, + bool use_sse4_2, + const void **allocated_memory); + + // Deletes the instance of CRC class. + virtual void Delete() = 0; + + // Returns true if SSE4.2 is available. + static bool IsSSE42Available(); + + // Returns generating polynomial. + virtual void GeneratingPolynomial(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const = 0; + + // Returns degree of generating polynomial. + virtual size_t Degree() const = 0; + + // Returns canonization constant used to XOR crc value + // before and after CRC computation. + virtual void CanonizeValue(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const = 0; + + // Returns rolling CRC starting value. + virtual void RollStartValue(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const = 0; + + // Returns length of rolling CRC window. + virtual size_t RollWindowBytes() const = 0; + + // Returns CRC of CRC tables to enable verification + // of integrity of CRC function itself by comparing + // the result with pre-computed value. + virtual void SelfCheckValue(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const = 0; + + // Given CRC value of previous chunk of data, + // extends it to new chunk, retuning the result in-place. + // + // If degree of CRC polynomial is 64 or less, + // (*hi) will not be touched. + virtual void Compute(const void *data, + size_t bytes, + /* INOUT */ UINT64 *lo, + /* INOUT */ UINT64 *hi = NULL) const = 0; + + // Starts rolling CRC by computing CRC of first + // "roll_length" bytes of "data", using "roll_start_value" + // as starting value (see Create()). + // Should not be called if the value of "roll_value" was 0. + virtual void RollStart(const void *data, + /* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const = 0; + + // Rolls CRC by 1 byte, given the bytes leaving and + // entering the window of "roll_length" bytes. + // RollStart() should be called before "Roll". + // Should not be called if the value of "roll_value" was 0. + virtual void Roll(size_t byte_out, + size_t byte_in, + /* INOUT */ UINT64 *lo, + /* INOUT */ UINT64 *hi = NULL) const = 0; + + // Computes CRC of sequence of zeroes -- without touching the data. + virtual void CrcOfZeroes(UINT64 bytes, + /* INOUT */ UINT64 *lo, + /* INOUT */ UINT64 *hi = NULL) const = 0; + + // Computes value of CRC(A, bytes, start_new) given known + // crc=CRC(A, bytes, start_old) -- without touching the data. + virtual void ChangeStartValue( + UINT64 start_old_lo, UINT64 start_old_hi, + UINT64 start_new_lo, UINT64 start_new_hi, + UINT64 bytes, + /* INOUT */ UINT64 *lo, + /* INOUT */ UINT64 *hi = NULL) const = 0; + + // Returns CRC of concatenation of blocks A and B when CRCs + // of blocks A and B are known -- without touching the data. + // + // To be precise, given CRC(A, |A|, startA) and CRC(B, |B|, 0), + // returns CRC(AB, |AB|, startA). + virtual void Concatenate(UINT64 crcB_lo, UINT64 crcB_hi, + UINT64 bytes_B, + /* INOUT */ UINT64* crcA_lo, + /* INOUT */ UINT64* crcA_hi = NULL) const = 0; + + // Given CRC of a message, stores extra (degree + 7)/8 bytes after + // the message so that CRC(message+extra, start) = result. + // Does not change CRC start value (use ChangeStartValue for that). + // Returns number of stored bytes. + virtual size_t StoreComplementaryCrc( + void *dst, + UINT64 message_crc_lo, UINT64 message_crc_hi, + UINT64 result_crc_lo, UINT64 result_crc_hi = 0) const = 0; + + // Stores given CRC of a message as (degree + 7)/8 bytes filled + // with 0s to the right. Returns number of stored bytes. + // CRC of the message and stored CRC is a constant value returned + // by CrcOfCrc() -- it does not depend on contents of the message. + virtual size_t StoreCrc(/* OUT */ void *dst, + UINT64 lo, + UINT64 hi = 0) const = 0; + + // Computes expected CRC value of CRC(Message,CRC(Message)) + // when CRC is stored after the message. This value is fixed + // and does not depend on the message or CRC start value. + virtual void CrcOfCrc(/* OUT */ UINT64 *lo, + /* OUT */ UINT64 *hi = NULL) const = 0; + + protected: + // CRC instance should be created only once (most of the time): + // - Creation and initializion is relatively expensive. + // - CRC is fully defined by its generating polynomials + // (well, and few more parameters). + // - CRC instances are pure constants. There is no + // reason to have 2 instances of the same CRC. + // - There are not too many generating polynomials that are + // used on practice. It is hard to imagine a project + // which uses 50 different generating polynomials. + // Thus, a handful of CRC instances is sufficient + // to cover the needs of even very large project. + // - Finally and most importantly, CRC tables should be + // aligned properly. No, the instances of CRC class + // are not created by blind "new" -- they use placement + // "new" and, in absense of placement "delete", + // should be deleted by calling explicit Delete() method. + virtual ~CRC(); + + // Cannot instantiate the class -- instances may be created + // by CRC::Create() only. + CRC(); +}; + +} // namespace crcutil_interface + + +#endif // CRCUTIL_INTERFACE_H_ diff --git a/contrib/libs/crcutil/multiword_128_64_gcc_amd64_sse2.cc b/contrib/libs/crcutil/multiword_128_64_gcc_amd64_sse2.cc index f94fd1fa36..92713cabd5 100644 --- a/contrib/libs/crcutil/multiword_128_64_gcc_amd64_sse2.cc +++ b/contrib/libs/crcutil/multiword_128_64_gcc_amd64_sse2.cc @@ -1,291 +1,291 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements multiword CRC for GCC on i386. -// -// Small comment: the trick described in -// http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor -// (replace "movdqa dst, src" with "pshufd $0xE4, src, dst") -// did not work: execution time increased from -// 1.8 CPU cycles/byte to 2.1 CPU cycles/byte. -// So it may be good idea on P4 but it's not on newer CPUs. -// -// movaps/xorps vs. movdqa/pxor did not make any difference. - -#include "generic_crc.h" -#include "uint128_sse2.h" - -#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 && HAVE_SSE2 - -namespace crcutil { - -template<> uint128_sse2 -GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiwordGccAmd64Sse2( - const uint8 *src, const uint8 *end, const uint128_sse2 &start) const; - -template<> -uint128_sse2 GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword( - const void *data, size_t bytes, const uint128_sse2 &start) const { - const uint8 *src = static_cast<const uint8 *>(data); - uint128_sse2 crc = start ^ this->Base().Canonize(); - const uint8 *end = src + bytes; - if (bytes <= 7) { - for (; src < end; ++src) { - CRC_BYTE(this, crc, *src); - } - return (crc ^ this->Base().Canonize()); - } - - ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc, uint64); - if (src >= end) { - return (crc ^ this->Base().Canonize()); - } - - return CrcMultiwordGccAmd64Sse2(src, end, crc); -} - -#define CRC_WORD_ASM() \ - SSE2_MOVQ " %[crc0], %[tmp0]\n" \ - "xorq %[tmp0], %[buf0]\n" \ - "psrldq $8, %[crc0]\n" \ - "movzbq %b[buf0], %[tmp0]\n" \ - "shrq $8, %[buf0]\n" \ - "addq %[tmp0], %[tmp0]\n" \ - "pxor (%[table_word], %[tmp0], 8), %[crc0]\n" \ - "movzbq %b[buf0], %[tmp1]\n" \ - "shrq $8, %[buf0]\n" \ - "addq %[tmp1], %[tmp1]\n" \ - "pxor 1*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \ - "movzbq %b[buf0], %[tmp0]\n" \ - "shrq $8, %[buf0]\n" \ - "addq %[tmp0], %[tmp0]\n" \ - "pxor 2*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \ - "movzbq %b[buf0], %[tmp1]\n" \ - "shrq $8, %[buf0]\n" \ - "addq %[tmp1], %[tmp1]\n" \ - "pxor 3*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \ - "movzbq %b[buf0], %[tmp0]\n" \ - "shrq $8, %[buf0]\n" \ - "addq %[tmp0], %[tmp0]\n" \ - "pxor 4*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \ - "movzbq %b[buf0], %[tmp1]\n" \ - "shrq $8, %[buf0]\n" \ - "addq %[tmp1], %[tmp1]\n" \ - "pxor 5*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \ - "movzbq %b[buf0], %[tmp0]\n" \ - "shrq $8, %[buf0]\n" \ - "addq %[tmp0], %[tmp0]\n" \ - "pxor 6*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \ - "addq %[buf0], %[buf0]\n" \ - "pxor 7*256*16(%[table_word], %[buf0], 8), %[crc0]\n" - -template<> uint128_sse2 -GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiwordGccAmd64Sse2( - const uint8 *src, const uint8 *end, const uint128_sse2 &start) const { - __m128i crc0 = start; - __m128i crc1; - __m128i crc2; - __m128i crc3; - __m128i crc_carryover; - - uint64 buf0; - uint64 buf1; - uint64 buf2; - uint64 buf3; - - uint64 tmp0; - uint64 tmp1; - - asm( - "sub $2*4*8 - 1, %[end]\n" - "cmpq %[src], %[end]\n" - "jbe 2f\n" - - "pxor %[crc1], %[crc1]\n" - "pxor %[crc2], %[crc2]\n" - "pxor %[crc3], %[crc3]\n" - "pxor %[crc_carryover], %[crc_carryover]\n" - "movq (%[src]), %[buf0]\n" - "movq 1*8(%[src]), %[buf1]\n" - "movq 2*8(%[src]), %[buf2]\n" - "movq 3*8(%[src]), %[buf3]\n" - - "1:\n" -#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 - "prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n" -#endif -#if GCC_VERSION_AVAILABLE(4, 5) - // Bug in GCC 4.2.4? - "add $4*8, %[src]\n" -#else - "lea 4*8(%[src]), %[src]\n" -#endif - "pxor %[crc_carryover], %[crc0]\n" - - SSE2_MOVQ " %[crc0], %[tmp0]\n" - "psrldq $8, %[crc0]\n" - "xorq %[tmp0], %[buf0]\n" - "movzbq %b[buf0], %[tmp0]\n" - "pxor %[crc0], %[crc1]\n" - "addq %[tmp0], %[tmp0]\n" - "shrq $8, %[buf0]\n" - "movdqa (%[table], %[tmp0], 8), %[crc0]\n" - - SSE2_MOVQ " %[crc1], %[tmp1]\n" - "psrldq $8, %[crc1]\n" - "xorq %[tmp1], %[buf1]\n" - "movzbq %b[buf1], %[tmp1]\n" - "pxor %[crc1], %[crc2]\n" - "addq %[tmp1], %[tmp1]\n" - "shrq $8, %[buf1]\n" - "movdqa (%[table], %[tmp1], 8), %[crc1]\n" - - SSE2_MOVQ " %[crc2], %[tmp0]\n" - "psrldq $8, %[crc2]\n" - "xorq %[tmp0], %[buf2]\n" - "movzbq %b[buf2], %[tmp0]\n" - "pxor %[crc2], %[crc3]\n" - "addq %[tmp0], %[tmp0]\n" - "shrq $8, %[buf2]\n" - "movdqa (%[table], %[tmp0], 8), %[crc2]\n" - - SSE2_MOVQ " %[crc3], %[tmp1]\n" - "psrldq $8, %[crc3]\n" - "xorq %[tmp1], %[buf3]\n" - "movzbq %b[buf3], %[tmp1]\n" - "movdqa %[crc3], %[crc_carryover]\n" - "addq %[tmp1], %[tmp1]\n" - "shrq $8, %[buf3]\n" - "movdqa (%[table], %[tmp1], 8), %[crc3]\n" - -#define XOR(byte) \ - "movzbq %b[buf0], %[tmp0]\n" \ - "shrq $8, %[buf0]\n" \ - "addq %[tmp0], %[tmp0]\n" \ - "pxor " #byte "*256*16(%[table], %[tmp0], 8), %[crc0]\n" \ - "movzbq %b[buf1], %[tmp1]\n" \ - "shrq $8, %[buf1]\n" \ - "addq %[tmp1], %[tmp1]\n" \ - "pxor " #byte "*256*16(%[table], %[tmp1], 8), %[crc1]\n" \ - "movzbq %b[buf2], %[tmp0]\n" \ - "shrq $8, %[buf2]\n" \ - "addq %[tmp0], %[tmp0]\n" \ - "pxor " #byte "*256*16(%[table], %[tmp0], 8), %[crc2]\n" \ - "movzbq %b[buf3], %[tmp1]\n" \ - "shrq $8, %[buf3]\n" \ - "addq %[tmp1], %[tmp1]\n" \ - "pxor " #byte "*256*16(%[table], %[tmp1], 8), %[crc3]\n" - - XOR(1) - XOR(2) - XOR(3) - XOR(4) - XOR(5) - XOR(6) -#undef XOR - - "addq %[buf0], %[buf0]\n" - "pxor 7*256*16(%[table], %[buf0], 8), %[crc0]\n" - "movq (%[src]), %[buf0]\n" - - "addq %[buf1], %[buf1]\n" - "pxor 7*256*16(%[table], %[buf1], 8), %[crc1]\n" - "movq 1*8(%[src]), %[buf1]\n" - - "addq %[buf2], %[buf2]\n" - "pxor 7*256*16(%[table], %[buf2], 8), %[crc2]\n" - "movq 2*8(%[src]), %[buf2]\n" - - "addq %[buf3], %[buf3]\n" - "pxor 7*256*16(%[table], %[buf3], 8), %[crc3]\n" - "movq 3*8(%[src]), %[buf3]\n" - - "cmpq %[src], %[end]\n" - "ja 1b\n" - - "pxor %[crc_carryover], %[crc0]\n" - CRC_WORD_ASM() - - "pxor %[crc1], %[crc0]\n" - "movq %[buf1], %[buf0]\n" - CRC_WORD_ASM() - - "pxor %[crc2], %[crc0]\n" - "movq %[buf2], %[buf0]\n" - CRC_WORD_ASM() - - "pxor %[crc3], %[crc0]\n" - "movq %[buf3], %[buf0]\n" - CRC_WORD_ASM() - - "add $4*8, %[src]\n" - "2:\n" - - "add $2*4*8 - 8, %[end]\n" - - "cmpq %[src], %[end]\n" - "jbe 4f\n" - "3:\n" - "movq (%[src]), %[buf0]\n" - "addq $8, %[src]\n" - CRC_WORD_ASM() - "cmpq %[src], %[end]\n" - "ja 3b\n" - - "4:\n" - "add $7, %[end]\n" - - "cmpq %[src], %[end]\n" - "jbe 6f\n" - - "5:\n" - "movzbq (%[src]), %[buf0]\n" - "add $1, %[src]\n" - SSE2_MOVQ " %[crc0], %[tmp0]\n" - "movzx %b[tmp0], %[tmp0]\n" - "psrldq $1, %[crc0]\n" - "xor %[buf0], %[tmp0]\n" - "addq %[tmp0], %[tmp0]\n" - "pxor 7*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" - - "cmpq %[src], %[end]\n" - "ja 5b\n" - - "6:\n" - - : // outputs - [src] "+r" (src), - [end] "+r" (end), - [crc0] "+x" (crc0), - [crc1] "=&x" (crc1), - [crc2] "=&x" (crc2), - [crc3] "=&x" (crc3), - [crc_carryover] "=&x" (crc_carryover), - [buf0] "=&r" (buf0), - [buf1] "=&r" (buf1), - [buf2] "=&r" (buf2), - [buf3] "=&r" (buf3), - [tmp0] "=&r" (tmp0), - [tmp1] "=&r" (tmp1) - - : // inputs - [table_word] "r" (this->crc_word_), - [table] "r" (this->crc_word_interleaved_)); - - return (this->Base().Canonize() ^ crc0); -} - -} // namespace crcutil - -#endif // defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 && HAVE_SSE2 +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements multiword CRC for GCC on i386. +// +// Small comment: the trick described in +// http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor +// (replace "movdqa dst, src" with "pshufd $0xE4, src, dst") +// did not work: execution time increased from +// 1.8 CPU cycles/byte to 2.1 CPU cycles/byte. +// So it may be good idea on P4 but it's not on newer CPUs. +// +// movaps/xorps vs. movdqa/pxor did not make any difference. + +#include "generic_crc.h" +#include "uint128_sse2.h" + +#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 && HAVE_SSE2 + +namespace crcutil { + +template<> uint128_sse2 +GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiwordGccAmd64Sse2( + const uint8 *src, const uint8 *end, const uint128_sse2 &start) const; + +template<> +uint128_sse2 GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword( + const void *data, size_t bytes, const uint128_sse2 &start) const { + const uint8 *src = static_cast<const uint8 *>(data); + uint128_sse2 crc = start ^ this->Base().Canonize(); + const uint8 *end = src + bytes; + if (bytes <= 7) { + for (; src < end; ++src) { + CRC_BYTE(this, crc, *src); + } + return (crc ^ this->Base().Canonize()); + } + + ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc, uint64); + if (src >= end) { + return (crc ^ this->Base().Canonize()); + } + + return CrcMultiwordGccAmd64Sse2(src, end, crc); +} + +#define CRC_WORD_ASM() \ + SSE2_MOVQ " %[crc0], %[tmp0]\n" \ + "xorq %[tmp0], %[buf0]\n" \ + "psrldq $8, %[crc0]\n" \ + "movzbq %b[buf0], %[tmp0]\n" \ + "shrq $8, %[buf0]\n" \ + "addq %[tmp0], %[tmp0]\n" \ + "pxor (%[table_word], %[tmp0], 8), %[crc0]\n" \ + "movzbq %b[buf0], %[tmp1]\n" \ + "shrq $8, %[buf0]\n" \ + "addq %[tmp1], %[tmp1]\n" \ + "pxor 1*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \ + "movzbq %b[buf0], %[tmp0]\n" \ + "shrq $8, %[buf0]\n" \ + "addq %[tmp0], %[tmp0]\n" \ + "pxor 2*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \ + "movzbq %b[buf0], %[tmp1]\n" \ + "shrq $8, %[buf0]\n" \ + "addq %[tmp1], %[tmp1]\n" \ + "pxor 3*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \ + "movzbq %b[buf0], %[tmp0]\n" \ + "shrq $8, %[buf0]\n" \ + "addq %[tmp0], %[tmp0]\n" \ + "pxor 4*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \ + "movzbq %b[buf0], %[tmp1]\n" \ + "shrq $8, %[buf0]\n" \ + "addq %[tmp1], %[tmp1]\n" \ + "pxor 5*256*16(%[table_word], %[tmp1], 8), %[crc0]\n" \ + "movzbq %b[buf0], %[tmp0]\n" \ + "shrq $8, %[buf0]\n" \ + "addq %[tmp0], %[tmp0]\n" \ + "pxor 6*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" \ + "addq %[buf0], %[buf0]\n" \ + "pxor 7*256*16(%[table_word], %[buf0], 8), %[crc0]\n" + +template<> uint128_sse2 +GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiwordGccAmd64Sse2( + const uint8 *src, const uint8 *end, const uint128_sse2 &start) const { + __m128i crc0 = start; + __m128i crc1; + __m128i crc2; + __m128i crc3; + __m128i crc_carryover; + + uint64 buf0; + uint64 buf1; + uint64 buf2; + uint64 buf3; + + uint64 tmp0; + uint64 tmp1; + + asm( + "sub $2*4*8 - 1, %[end]\n" + "cmpq %[src], %[end]\n" + "jbe 2f\n" + + "pxor %[crc1], %[crc1]\n" + "pxor %[crc2], %[crc2]\n" + "pxor %[crc3], %[crc3]\n" + "pxor %[crc_carryover], %[crc_carryover]\n" + "movq (%[src]), %[buf0]\n" + "movq 1*8(%[src]), %[buf1]\n" + "movq 2*8(%[src]), %[buf2]\n" + "movq 3*8(%[src]), %[buf3]\n" + + "1:\n" +#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 + "prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n" +#endif +#if GCC_VERSION_AVAILABLE(4, 5) + // Bug in GCC 4.2.4? + "add $4*8, %[src]\n" +#else + "lea 4*8(%[src]), %[src]\n" +#endif + "pxor %[crc_carryover], %[crc0]\n" + + SSE2_MOVQ " %[crc0], %[tmp0]\n" + "psrldq $8, %[crc0]\n" + "xorq %[tmp0], %[buf0]\n" + "movzbq %b[buf0], %[tmp0]\n" + "pxor %[crc0], %[crc1]\n" + "addq %[tmp0], %[tmp0]\n" + "shrq $8, %[buf0]\n" + "movdqa (%[table], %[tmp0], 8), %[crc0]\n" + + SSE2_MOVQ " %[crc1], %[tmp1]\n" + "psrldq $8, %[crc1]\n" + "xorq %[tmp1], %[buf1]\n" + "movzbq %b[buf1], %[tmp1]\n" + "pxor %[crc1], %[crc2]\n" + "addq %[tmp1], %[tmp1]\n" + "shrq $8, %[buf1]\n" + "movdqa (%[table], %[tmp1], 8), %[crc1]\n" + + SSE2_MOVQ " %[crc2], %[tmp0]\n" + "psrldq $8, %[crc2]\n" + "xorq %[tmp0], %[buf2]\n" + "movzbq %b[buf2], %[tmp0]\n" + "pxor %[crc2], %[crc3]\n" + "addq %[tmp0], %[tmp0]\n" + "shrq $8, %[buf2]\n" + "movdqa (%[table], %[tmp0], 8), %[crc2]\n" + + SSE2_MOVQ " %[crc3], %[tmp1]\n" + "psrldq $8, %[crc3]\n" + "xorq %[tmp1], %[buf3]\n" + "movzbq %b[buf3], %[tmp1]\n" + "movdqa %[crc3], %[crc_carryover]\n" + "addq %[tmp1], %[tmp1]\n" + "shrq $8, %[buf3]\n" + "movdqa (%[table], %[tmp1], 8), %[crc3]\n" + +#define XOR(byte) \ + "movzbq %b[buf0], %[tmp0]\n" \ + "shrq $8, %[buf0]\n" \ + "addq %[tmp0], %[tmp0]\n" \ + "pxor " #byte "*256*16(%[table], %[tmp0], 8), %[crc0]\n" \ + "movzbq %b[buf1], %[tmp1]\n" \ + "shrq $8, %[buf1]\n" \ + "addq %[tmp1], %[tmp1]\n" \ + "pxor " #byte "*256*16(%[table], %[tmp1], 8), %[crc1]\n" \ + "movzbq %b[buf2], %[tmp0]\n" \ + "shrq $8, %[buf2]\n" \ + "addq %[tmp0], %[tmp0]\n" \ + "pxor " #byte "*256*16(%[table], %[tmp0], 8), %[crc2]\n" \ + "movzbq %b[buf3], %[tmp1]\n" \ + "shrq $8, %[buf3]\n" \ + "addq %[tmp1], %[tmp1]\n" \ + "pxor " #byte "*256*16(%[table], %[tmp1], 8), %[crc3]\n" + + XOR(1) + XOR(2) + XOR(3) + XOR(4) + XOR(5) + XOR(6) +#undef XOR + + "addq %[buf0], %[buf0]\n" + "pxor 7*256*16(%[table], %[buf0], 8), %[crc0]\n" + "movq (%[src]), %[buf0]\n" + + "addq %[buf1], %[buf1]\n" + "pxor 7*256*16(%[table], %[buf1], 8), %[crc1]\n" + "movq 1*8(%[src]), %[buf1]\n" + + "addq %[buf2], %[buf2]\n" + "pxor 7*256*16(%[table], %[buf2], 8), %[crc2]\n" + "movq 2*8(%[src]), %[buf2]\n" + + "addq %[buf3], %[buf3]\n" + "pxor 7*256*16(%[table], %[buf3], 8), %[crc3]\n" + "movq 3*8(%[src]), %[buf3]\n" + + "cmpq %[src], %[end]\n" + "ja 1b\n" + + "pxor %[crc_carryover], %[crc0]\n" + CRC_WORD_ASM() + + "pxor %[crc1], %[crc0]\n" + "movq %[buf1], %[buf0]\n" + CRC_WORD_ASM() + + "pxor %[crc2], %[crc0]\n" + "movq %[buf2], %[buf0]\n" + CRC_WORD_ASM() + + "pxor %[crc3], %[crc0]\n" + "movq %[buf3], %[buf0]\n" + CRC_WORD_ASM() + + "add $4*8, %[src]\n" + "2:\n" + + "add $2*4*8 - 8, %[end]\n" + + "cmpq %[src], %[end]\n" + "jbe 4f\n" + "3:\n" + "movq (%[src]), %[buf0]\n" + "addq $8, %[src]\n" + CRC_WORD_ASM() + "cmpq %[src], %[end]\n" + "ja 3b\n" + + "4:\n" + "add $7, %[end]\n" + + "cmpq %[src], %[end]\n" + "jbe 6f\n" + + "5:\n" + "movzbq (%[src]), %[buf0]\n" + "add $1, %[src]\n" + SSE2_MOVQ " %[crc0], %[tmp0]\n" + "movzx %b[tmp0], %[tmp0]\n" + "psrldq $1, %[crc0]\n" + "xor %[buf0], %[tmp0]\n" + "addq %[tmp0], %[tmp0]\n" + "pxor 7*256*16(%[table_word], %[tmp0], 8), %[crc0]\n" + + "cmpq %[src], %[end]\n" + "ja 5b\n" + + "6:\n" + + : // outputs + [src] "+r" (src), + [end] "+r" (end), + [crc0] "+x" (crc0), + [crc1] "=&x" (crc1), + [crc2] "=&x" (crc2), + [crc3] "=&x" (crc3), + [crc_carryover] "=&x" (crc_carryover), + [buf0] "=&r" (buf0), + [buf1] "=&r" (buf1), + [buf2] "=&r" (buf2), + [buf3] "=&r" (buf3), + [tmp0] "=&r" (tmp0), + [tmp1] "=&r" (tmp1) + + : // inputs + [table_word] "r" (this->crc_word_), + [table] "r" (this->crc_word_interleaved_)); + + return (this->Base().Canonize() ^ crc0); +} + +} // namespace crcutil + +#endif // defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 && HAVE_SSE2 diff --git a/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc b/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc index af7352aa46..4706d9c005 100644 --- a/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc +++ b/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc @@ -1,304 +1,304 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements 64-bit multiword CRC for Microsoft and Intel compilers -// using MMX instructions (i386). - -#include "generic_crc.h" - -#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER) - -namespace crcutil { - -#define CRC_WORD_MMX() \ - __asm pxor BUF0, CRC0 \ - __asm movd TMP0, BUF0 \ - __asm psrlq BUF0, 32 \ - __asm movzx TEMP, TMP0L \ - __asm shr TMP0, 8 \ - __asm movq CRC0, [TABLE + TEMP * 8] \ - __asm movzx TEMP, TMP0L \ - __asm shr TMP0, 8 \ - __asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \ - __asm movzx TEMP, TMP0L \ - __asm shr TMP0, 8 \ - __asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \ - __asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \ - __asm movd TMP0, BUF0 \ - __asm movzx TEMP, TMP0L \ - __asm shr TMP0, 8 \ - __asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \ - __asm movzx TEMP, TMP0L \ - __asm shr TMP0, 8 \ - __asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \ - __asm movzx TEMP, TMP0L \ - __asm shr TMP0, 8 \ - __asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \ - __asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8] - -// frame pointer register 'ebp' modified by inline assembly code -#pragma warning(disable: 4731) - -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( - const void *data, - size_t bytes, - const uint64 &start) const { - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - uint64 crc0 = start ^ this->Base().Canonize(); - - ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64); - if (src >= end) { - return (crc0 ^ this->Base().Canonize()); - } - -#define CRC0 mm0 -#define CRC1 mm1 -#define CRC2 mm2 -#define CRC3 mm3 -#define BUF0 mm4 -#define BUF1 mm5 -#define BUF2 mm6 -#define BUF3 mm7 -#define TMP0 eax -#define TMP0L al -#define TMP0H ah -#define TMP1 ebx -#define TMP1L bl -#define TMP1H bh -#define TMP2 ecx -#define TMP2L cl -#define TMP2H ch -#define TMP3 edx -#define TMP3L dl -#define TMP3H dh -#define TEMP edi -#define SRC esi -#define END [esp] -#define TABLE ebp - - - const uint64 *interleaved_table_address = - &this->crc_word_interleaved_[0][0]; - const uint64 *word_table_address = &this->crc_word_[0][0]; - - __asm { - push ebp - - mov TMP0, interleaved_table_address - - movq CRC0, crc0 - mov SRC, src - mov TMP1, end - sub TMP1, 2*4*8 - 1 - cmp SRC, TMP1 - mov TABLE, word_table_address - jae end_main_loop - - push TABLE - mov TABLE, TMP0 - push TMP1 - - pxor CRC1, CRC1 - pxor CRC2, CRC2 - pxor CRC3, CRC3 - - movq BUF0, [SRC] - movq BUF1, [SRC + 1 * 8] - movq BUF2, [SRC + 2 * 8] - movq BUF3, [SRC + 3 * 8] - - main_loop: -#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 - prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH] -#endif - add SRC, 32 - pxor BUF0, CRC0 - pxor BUF1, CRC1 - pxor BUF2, CRC2 - pxor BUF3, CRC3 - - movd TMP0, BUF0 - psrlq BUF0, 32 - movd TMP1, BUF1 - psrlq BUF1, 32 - movd TMP2, BUF2 - psrlq BUF2, 32 - movd TMP3, BUF3 - psrlq BUF3, 32 - - movzx TEMP, TMP0L - movq CRC0, [TABLE + TEMP * 8] - movzx TEMP, TMP1L - movq CRC1, [TABLE + TEMP * 8] - movzx TEMP, TMP2L - movq CRC2, [TABLE + TEMP * 8] - movzx TEMP, TMP3L - movq CRC3, [TABLE + TEMP * 8] - - movzx TEMP, TMP0H - shr TMP0, 16 - pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] - movzx TEMP, TMP1H - shr TMP1, 16 - pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8] - movzx TEMP, TMP2H - shr TMP2, 16 - pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8] - movzx TEMP, TMP3H - shr TMP3, 16 - pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8] - - movzx TEMP, TMP0L - shr TMP0, 8 - pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] - movzx TEMP, TMP1L - shr TMP1, 8 - pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8] - movzx TEMP, TMP2L - shr TMP2, 8 - pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8] - movzx TEMP, TMP3L - shr TMP3, 8 - pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8] - - pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] - movd TMP0, BUF0 - pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8] - movd TMP1, BUF1 - pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8] - movd TMP2, BUF2 - pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8] - movd TMP3, BUF3 - - movzx TEMP, TMP0L - pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] - movzx TEMP, TMP1L - pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8] - movzx TEMP, TMP2L - pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8] - movzx TEMP, TMP3L - pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8] - - movzx TEMP, TMP0H - shr TMP0, 16 - pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] - movzx TEMP, TMP1H - shr TMP1, 16 - pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8] - movzx TEMP, TMP2H - shr TMP2, 16 - pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8] - movzx TEMP, TMP3H - shr TMP3, 16 - pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8] - - movzx TEMP, TMP0L - shr TMP0, 8 - pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] - movzx TEMP, TMP1L - shr TMP1, 8 - pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8] - movzx TEMP, TMP2L - shr TMP2, 8 - pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8] - movzx TEMP, TMP3L - shr TMP3, 8 - pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8] - - pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8] - movq BUF0, [SRC] - pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8] - movq BUF1, [SRC + 1 * 8] - pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8] - movq BUF2, [SRC + 2 * 8] - pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8] - movq BUF3, [SRC + 3 * 8] - - cmp END, SRC - ja main_loop - -#undef END -#define END TMP1 - pop END - pop TABLE - add SRC, 32 - - CRC_WORD_MMX() - - pxor BUF1, CRC1 - movq BUF0, BUF1 - CRC_WORD_MMX() - - pxor BUF2, CRC2 - movq BUF0, BUF2 - CRC_WORD_MMX() - - pxor BUF3, CRC3 - movq BUF0, BUF3 - CRC_WORD_MMX() - - end_main_loop: - add END, 2*4*8 - 8 - cmp SRC, END - jae end_word_loop - - word_loop: - movq BUF0, [SRC] - add SRC, 8 - CRC_WORD_MMX() - cmp END, SRC - ja word_loop - end_word_loop: - -#if 0 // Plain C version is faster? - add END, 7 - cmp SRC, END - jae end_byte_loop - - byte_loop: - movd TMP0, CRC0 - movzx TEMP, byte ptr [SRC] - movzx TMP0, TMP0L - psrlq CRC0, 8 - xor TEMP, TMP0 - add SRC, 1 - pxor CRC0, [TABLE + TEMP*8 + 7*256*8] - cmp END, SRC - ja byte_loop - end_byte_loop: -#endif - - pop ebp - - mov src, SRC - movq crc0, CRC0 - - emms - } - -#if 1 - // Compute CRC of remaining bytes. - for (;src < end; ++src) { - CRC_BYTE(this, crc0, *src); - } -#endif - - return (crc0 ^ this->Base().Canonize()); -} - - -} // namespace crcutil - -#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER) +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements 64-bit multiword CRC for Microsoft and Intel compilers +// using MMX instructions (i386). + +#include "generic_crc.h" + +#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER) + +namespace crcutil { + +#define CRC_WORD_MMX() \ + __asm pxor BUF0, CRC0 \ + __asm movd TMP0, BUF0 \ + __asm psrlq BUF0, 32 \ + __asm movzx TEMP, TMP0L \ + __asm shr TMP0, 8 \ + __asm movq CRC0, [TABLE + TEMP * 8] \ + __asm movzx TEMP, TMP0L \ + __asm shr TMP0, 8 \ + __asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \ + __asm movzx TEMP, TMP0L \ + __asm shr TMP0, 8 \ + __asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \ + __asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \ + __asm movd TMP0, BUF0 \ + __asm movzx TEMP, TMP0L \ + __asm shr TMP0, 8 \ + __asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \ + __asm movzx TEMP, TMP0L \ + __asm shr TMP0, 8 \ + __asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \ + __asm movzx TEMP, TMP0L \ + __asm shr TMP0, 8 \ + __asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \ + __asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8] + +// frame pointer register 'ebp' modified by inline assembly code +#pragma warning(disable: 4731) + +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( + const void *data, + size_t bytes, + const uint64 &start) const { + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + uint64 crc0 = start ^ this->Base().Canonize(); + + ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64); + if (src >= end) { + return (crc0 ^ this->Base().Canonize()); + } + +#define CRC0 mm0 +#define CRC1 mm1 +#define CRC2 mm2 +#define CRC3 mm3 +#define BUF0 mm4 +#define BUF1 mm5 +#define BUF2 mm6 +#define BUF3 mm7 +#define TMP0 eax +#define TMP0L al +#define TMP0H ah +#define TMP1 ebx +#define TMP1L bl +#define TMP1H bh +#define TMP2 ecx +#define TMP2L cl +#define TMP2H ch +#define TMP3 edx +#define TMP3L dl +#define TMP3H dh +#define TEMP edi +#define SRC esi +#define END [esp] +#define TABLE ebp + + + const uint64 *interleaved_table_address = + &this->crc_word_interleaved_[0][0]; + const uint64 *word_table_address = &this->crc_word_[0][0]; + + __asm { + push ebp + + mov TMP0, interleaved_table_address + + movq CRC0, crc0 + mov SRC, src + mov TMP1, end + sub TMP1, 2*4*8 - 1 + cmp SRC, TMP1 + mov TABLE, word_table_address + jae end_main_loop + + push TABLE + mov TABLE, TMP0 + push TMP1 + + pxor CRC1, CRC1 + pxor CRC2, CRC2 + pxor CRC3, CRC3 + + movq BUF0, [SRC] + movq BUF1, [SRC + 1 * 8] + movq BUF2, [SRC + 2 * 8] + movq BUF3, [SRC + 3 * 8] + + main_loop: +#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 + prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH] +#endif + add SRC, 32 + pxor BUF0, CRC0 + pxor BUF1, CRC1 + pxor BUF2, CRC2 + pxor BUF3, CRC3 + + movd TMP0, BUF0 + psrlq BUF0, 32 + movd TMP1, BUF1 + psrlq BUF1, 32 + movd TMP2, BUF2 + psrlq BUF2, 32 + movd TMP3, BUF3 + psrlq BUF3, 32 + + movzx TEMP, TMP0L + movq CRC0, [TABLE + TEMP * 8] + movzx TEMP, TMP1L + movq CRC1, [TABLE + TEMP * 8] + movzx TEMP, TMP2L + movq CRC2, [TABLE + TEMP * 8] + movzx TEMP, TMP3L + movq CRC3, [TABLE + TEMP * 8] + + movzx TEMP, TMP0H + shr TMP0, 16 + pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] + movzx TEMP, TMP1H + shr TMP1, 16 + pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8] + movzx TEMP, TMP2H + shr TMP2, 16 + pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8] + movzx TEMP, TMP3H + shr TMP3, 16 + pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8] + + movzx TEMP, TMP0L + shr TMP0, 8 + pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] + movzx TEMP, TMP1L + shr TMP1, 8 + pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8] + movzx TEMP, TMP2L + shr TMP2, 8 + pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8] + movzx TEMP, TMP3L + shr TMP3, 8 + pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8] + + pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] + movd TMP0, BUF0 + pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8] + movd TMP1, BUF1 + pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8] + movd TMP2, BUF2 + pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8] + movd TMP3, BUF3 + + movzx TEMP, TMP0L + pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] + movzx TEMP, TMP1L + pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8] + movzx TEMP, TMP2L + pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8] + movzx TEMP, TMP3L + pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8] + + movzx TEMP, TMP0H + shr TMP0, 16 + pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] + movzx TEMP, TMP1H + shr TMP1, 16 + pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8] + movzx TEMP, TMP2H + shr TMP2, 16 + pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8] + movzx TEMP, TMP3H + shr TMP3, 16 + pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8] + + movzx TEMP, TMP0L + shr TMP0, 8 + pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] + movzx TEMP, TMP1L + shr TMP1, 8 + pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8] + movzx TEMP, TMP2L + shr TMP2, 8 + pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8] + movzx TEMP, TMP3L + shr TMP3, 8 + pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8] + + pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8] + movq BUF0, [SRC] + pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8] + movq BUF1, [SRC + 1 * 8] + pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8] + movq BUF2, [SRC + 2 * 8] + pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8] + movq BUF3, [SRC + 3 * 8] + + cmp END, SRC + ja main_loop + +#undef END +#define END TMP1 + pop END + pop TABLE + add SRC, 32 + + CRC_WORD_MMX() + + pxor BUF1, CRC1 + movq BUF0, BUF1 + CRC_WORD_MMX() + + pxor BUF2, CRC2 + movq BUF0, BUF2 + CRC_WORD_MMX() + + pxor BUF3, CRC3 + movq BUF0, BUF3 + CRC_WORD_MMX() + + end_main_loop: + add END, 2*4*8 - 8 + cmp SRC, END + jae end_word_loop + + word_loop: + movq BUF0, [SRC] + add SRC, 8 + CRC_WORD_MMX() + cmp END, SRC + ja word_loop + end_word_loop: + +#if 0 // Plain C version is faster? + add END, 7 + cmp SRC, END + jae end_byte_loop + + byte_loop: + movd TMP0, CRC0 + movzx TEMP, byte ptr [SRC] + movzx TMP0, TMP0L + psrlq CRC0, 8 + xor TEMP, TMP0 + add SRC, 1 + pxor CRC0, [TABLE + TEMP*8 + 7*256*8] + cmp END, SRC + ja byte_loop + end_byte_loop: +#endif + + pop ebp + + mov src, SRC + movq crc0, CRC0 + + emms + } + +#if 1 + // Compute CRC of remaining bytes. + for (;src < end; ++src) { + CRC_BYTE(this, crc0, *src); + } +#endif + + return (crc0 ^ this->Base().Canonize()); +} + + +} // namespace crcutil + +#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER) diff --git a/contrib/libs/crcutil/multiword_64_64_gcc_amd64_asm.cc b/contrib/libs/crcutil/multiword_64_64_gcc_amd64_asm.cc index 36630fd1ac..1a64b85c5e 100644 --- a/contrib/libs/crcutil/multiword_64_64_gcc_amd64_asm.cc +++ b/contrib/libs/crcutil/multiword_64_64_gcc_amd64_asm.cc @@ -1,298 +1,298 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements multiword CRC for GCC on AMD64. -// -// Accoding to "Software Optimization Guide for AMD Family 10h Processors" -// http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/40546.pdf -// instead of -// movzbq %al, %rsi -// shrq $8, %rax -// [use %rsi] -// movzbq %al, %rsi -// shrq $8, %rax -// [use %rsi] -// it is better to use 32-bit registers -// (high 32 bits will be cleared on assignment), i.e. -// movzbl %al, %esi -// [use %rsi] -// movzbl %ah, %esi -// shrq $16, %rax -// [use %rsi] -// Makes instructions shorter and removes one shift -// (the latter is not such a big deal as it's execution time -// is nicely masked by [use %rsi] instruction). -// -// Performance difference: -// About 10% degradation on bytes = 8 .. 16 -// (clobbering registers that should be saved) -// Break even at 32 bytes. -// 3% improvement starting from 64 bytes. - -#include "generic_crc.h" - -#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 - -namespace crcutil { - -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordGccAmd64( - const void *data, size_t bytes, const uint64 &start) const; - -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( - const void *data, - size_t bytes, - const uint64 &start) const { - if (bytes <= 6 * sizeof(Word) - 1) { - const uint8 *src = static_cast<const uint8 *>(data); - uint64 crc = start ^ this->Base().Canonize(); - const uint8 *end = src + bytes; -#define PROCESS_ONE_WORD() do { \ - Word buf = reinterpret_cast<const Word *>(src)[0]; \ - CRC_WORD(this, crc, buf); \ - src += sizeof(Word); \ -} while (0) - if (bytes >= 1 * sizeof(Word)) { - PROCESS_ONE_WORD(); - if (bytes >= 2 * sizeof(Word)) { - PROCESS_ONE_WORD(); - if (bytes >= 3 * sizeof(Word)) { - PROCESS_ONE_WORD(); - if (bytes >= 4 * sizeof(Word)) { - PROCESS_ONE_WORD(); - if (bytes >= 5 * sizeof(Word)) { - PROCESS_ONE_WORD(); - } - } - } - } - } - for (; src < end; ++src) { - CRC_BYTE(this, crc, *src); - } - return (crc ^ this->Base().Canonize()); - } - return this->CrcMultiwordGccAmd64(data, bytes, start); -} - -#define TMP0 "%%rsi" -#define TMP0W "%%esi" - -#define BUF0 "%%rax" -#define BUF0L "%%al" -#define BUF0H "%%ah" - -#define BUF1 "%%rbx" -#define BUF1L "%%bl" -#define BUF1H "%%bh" - -#define BUF2 "%%rcx" -#define BUF2L "%%cl" -#define BUF2H "%%ch" - -#define BUF3 "%%rdx" -#define BUF3L "%%dl" -#define BUF3H "%%dh" - -#define CRC_WORD_ASM() \ - "xorq %[crc0], " BUF0 "\n" \ - "movzbq " BUF0L ", " TMP0 "\n" \ - "movq (%[table_word], " TMP0 ", 8), %[crc0]\n" \ - "movzbl " BUF0H ", " TMP0W "\n" \ - "shrq $16, " BUF0 "\n" \ - "xorq 1*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ - "movzbq " BUF0L ", " TMP0 "\n" \ - "xorq 2*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ - "movzbl " BUF0H ", " TMP0W "\n" \ - "shrq $16, " BUF0 "\n" \ - "xorq 3*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ - "movzbq " BUF0L ", " TMP0 "\n" \ - "xorq 4*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ - "movzbl " BUF0H ", " TMP0W "\n" \ - "shrq $16, " BUF0 "\n" \ - "xorq 5*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ - "movzbq " BUF0L ", " TMP0 "\n" \ - "xorq 6*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ - "movzbl " BUF0H ", " TMP0W "\n" \ - "xorq 7*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" - -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordGccAmd64( - const void *data, size_t bytes, const uint64 &start) const { - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - uint64 crc0 = start ^ this->Base().Canonize(); - - ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64); - if (src >= end) { - return (crc0 ^ this->Base().Canonize()); - } - - uint64 crc1; - uint64 crc2; - uint64 crc3; - - asm( - "sub $2*4*8 - 1, %[end]\n" - "cmpq %[src], %[end]\n" - "jbe 2f\n" - "xorq %[crc1], %[crc1]\n" - "movq (%[src]), " BUF0 "\n" - "movq 1*8(%[src]), " BUF1 "\n" - "movq 2*8(%[src]), " BUF2 "\n" - "movq 3*8(%[src]), " BUF3 "\n" - "movq %[crc1], %[crc2]\n" - "movq %[crc1], %[crc3]\n" - - "1:\n" -#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 - "prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n" -#endif // HAVE_SSE - "add $4*8, %[src]\n" - - // Set buffer data. - "xorq %[crc0], " BUF0 "\n" - "xorq %[crc1], " BUF1 "\n" - "xorq %[crc2], " BUF2 "\n" - "xorq %[crc3], " BUF3 "\n" - - // LOAD crc of byte 0 and shift buffers. - "movzbl " BUF0L ", " TMP0W "\n" - "movq (%[table], " TMP0 ", 8), %[crc0]\n" - "movzbl " BUF1L ", " TMP0W "\n" - "movq (%[table], " TMP0 ", 8), %[crc1]\n" - "movzbl " BUF2L ", " TMP0W "\n" - "movq (%[table], " TMP0 ", 8), %[crc2]\n" - "movzbl " BUF3L ", " TMP0W "\n" - "movq (%[table], " TMP0 ", 8), %[crc3]\n" - -#define XOR1(byte1) \ - "movzbl " BUF0L ", " TMP0W "\n" \ - "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc0]\n" \ - "movzbl " BUF1L ", " TMP0W "\n" \ - "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc1]\n" \ - "movzbl " BUF2L ", " TMP0W "\n" \ - "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc2]\n" \ - "movzbl " BUF3L ", " TMP0W "\n" \ - "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc3]\n" - -#define XOR2(byte2) \ - "movzbl " BUF0H ", " TMP0W "\n" \ - "shrq $16, " BUF0 "\n" \ - "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc0]\n" \ - "movzbl " BUF1H ", " TMP0W "\n" \ - "shrq $16, " BUF1 "\n" \ - "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc1]\n" \ - "movzbl " BUF2H ", " TMP0W "\n" \ - "shrq $16, " BUF2 "\n" \ - "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc2]\n" \ - "movzbl " BUF3H ", " TMP0W "\n" \ - "shrq $16, " BUF3 "\n" \ - "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc3]\n" - - XOR2(1) - XOR1(2) - XOR2(3) - XOR1(4) - XOR2(5) - XOR1(6) - - // Update CRC registers and load buffers. - "movzbl " BUF0H ", " TMP0W "\n" - "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc0]\n" - "movq (%[src]), " BUF0 "\n" - "movzbl " BUF1H ", " TMP0W "\n" - "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc1]\n" - "movq 1*8(%[src]), " BUF1 "\n" - "movzbl " BUF2H ", " TMP0W "\n" - "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc2]\n" - "movq 2*8(%[src]), " BUF2 "\n" - "movzbl " BUF3H ", " TMP0W "\n" - "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc3]\n" - "movq 3*8(%[src]), " BUF3 "\n" - - "cmpq %[src], %[end]\n" - "ja 1b\n" - - CRC_WORD_ASM() - - "xorq %[crc1], " BUF1 "\n" - "movq " BUF1 ", " BUF0 "\n" - CRC_WORD_ASM() - - "xorq %[crc2], " BUF2 "\n" - "movq " BUF2 ", " BUF0 "\n" - CRC_WORD_ASM() - - "xorq %[crc3], " BUF3 "\n" - "movq " BUF3 ", " BUF0 "\n" - CRC_WORD_ASM() - - "add $4*8, %[src]\n" - - "2:\n" - "add $2*4*8 - 8, %[end]\n" - "cmpq %[src], %[end]\n" - "jbe 4f\n" - - "3:\n" - "movq (%[src]), " BUF0 "\n" - "add $8, %[src]\n" - CRC_WORD_ASM() - "cmpq %[src], %[end]\n" - "ja 3b\n" - - "4:\n" - "add $7, %[end]\n" - - "cmpq %[src], %[end]\n" - "jbe 6f\n" - - "5:\n" - "movzbq (%[src]), " BUF0 "\n" - "movzbq %b[crc0], " TMP0 "\n" - "shrq $8, %[crc0]\n" - "xorq " BUF0 ", " TMP0 "\n" - "add $1, %[src]\n" - "xorq 7*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" - "cmpq %[src], %[end]\n" - "ja 5b\n" - - "6:\n" - - - : // outputs - [src] "+r" (src), - [end] "+r" (end), - [crc0] "+r" (crc0), - [crc1] "=&r" (crc1), - [crc2] "=&r" (crc2), - [crc3] "=&r" (crc3) - - : // inputs - [table] "r" (&this->crc_word_interleaved_[0][0]), - [table_word] "r" (&this->crc_word_[0][0]) - - : // clobbers - "%rax", // BUF0 - "%rbx", // BUF1 - "%rcx", // BUF2 - "%rdx", // BUF3 - "%rsi" // TMP0 - ); - - return (crc0 ^ this->Base().Canonize()); -} - -} // namespace crcutil - -#endif // defined(__GNUC__) && HAVE_AMD64 && CRCUTIL_USE_ASM +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements multiword CRC for GCC on AMD64. +// +// Accoding to "Software Optimization Guide for AMD Family 10h Processors" +// http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/40546.pdf +// instead of +// movzbq %al, %rsi +// shrq $8, %rax +// [use %rsi] +// movzbq %al, %rsi +// shrq $8, %rax +// [use %rsi] +// it is better to use 32-bit registers +// (high 32 bits will be cleared on assignment), i.e. +// movzbl %al, %esi +// [use %rsi] +// movzbl %ah, %esi +// shrq $16, %rax +// [use %rsi] +// Makes instructions shorter and removes one shift +// (the latter is not such a big deal as it's execution time +// is nicely masked by [use %rsi] instruction). +// +// Performance difference: +// About 10% degradation on bytes = 8 .. 16 +// (clobbering registers that should be saved) +// Break even at 32 bytes. +// 3% improvement starting from 64 bytes. + +#include "generic_crc.h" + +#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_AMD64 + +namespace crcutil { + +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordGccAmd64( + const void *data, size_t bytes, const uint64 &start) const; + +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( + const void *data, + size_t bytes, + const uint64 &start) const { + if (bytes <= 6 * sizeof(Word) - 1) { + const uint8 *src = static_cast<const uint8 *>(data); + uint64 crc = start ^ this->Base().Canonize(); + const uint8 *end = src + bytes; +#define PROCESS_ONE_WORD() do { \ + Word buf = reinterpret_cast<const Word *>(src)[0]; \ + CRC_WORD(this, crc, buf); \ + src += sizeof(Word); \ +} while (0) + if (bytes >= 1 * sizeof(Word)) { + PROCESS_ONE_WORD(); + if (bytes >= 2 * sizeof(Word)) { + PROCESS_ONE_WORD(); + if (bytes >= 3 * sizeof(Word)) { + PROCESS_ONE_WORD(); + if (bytes >= 4 * sizeof(Word)) { + PROCESS_ONE_WORD(); + if (bytes >= 5 * sizeof(Word)) { + PROCESS_ONE_WORD(); + } + } + } + } + } + for (; src < end; ++src) { + CRC_BYTE(this, crc, *src); + } + return (crc ^ this->Base().Canonize()); + } + return this->CrcMultiwordGccAmd64(data, bytes, start); +} + +#define TMP0 "%%rsi" +#define TMP0W "%%esi" + +#define BUF0 "%%rax" +#define BUF0L "%%al" +#define BUF0H "%%ah" + +#define BUF1 "%%rbx" +#define BUF1L "%%bl" +#define BUF1H "%%bh" + +#define BUF2 "%%rcx" +#define BUF2L "%%cl" +#define BUF2H "%%ch" + +#define BUF3 "%%rdx" +#define BUF3L "%%dl" +#define BUF3H "%%dh" + +#define CRC_WORD_ASM() \ + "xorq %[crc0], " BUF0 "\n" \ + "movzbq " BUF0L ", " TMP0 "\n" \ + "movq (%[table_word], " TMP0 ", 8), %[crc0]\n" \ + "movzbl " BUF0H ", " TMP0W "\n" \ + "shrq $16, " BUF0 "\n" \ + "xorq 1*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ + "movzbq " BUF0L ", " TMP0 "\n" \ + "xorq 2*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ + "movzbl " BUF0H ", " TMP0W "\n" \ + "shrq $16, " BUF0 "\n" \ + "xorq 3*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ + "movzbq " BUF0L ", " TMP0 "\n" \ + "xorq 4*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ + "movzbl " BUF0H ", " TMP0W "\n" \ + "shrq $16, " BUF0 "\n" \ + "xorq 5*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ + "movzbq " BUF0L ", " TMP0 "\n" \ + "xorq 6*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" \ + "movzbl " BUF0H ", " TMP0W "\n" \ + "xorq 7*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" + +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordGccAmd64( + const void *data, size_t bytes, const uint64 &start) const { + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + uint64 crc0 = start ^ this->Base().Canonize(); + + ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64); + if (src >= end) { + return (crc0 ^ this->Base().Canonize()); + } + + uint64 crc1; + uint64 crc2; + uint64 crc3; + + asm( + "sub $2*4*8 - 1, %[end]\n" + "cmpq %[src], %[end]\n" + "jbe 2f\n" + "xorq %[crc1], %[crc1]\n" + "movq (%[src]), " BUF0 "\n" + "movq 1*8(%[src]), " BUF1 "\n" + "movq 2*8(%[src]), " BUF2 "\n" + "movq 3*8(%[src]), " BUF3 "\n" + "movq %[crc1], %[crc2]\n" + "movq %[crc1], %[crc3]\n" + + "1:\n" +#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 + "prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n" +#endif // HAVE_SSE + "add $4*8, %[src]\n" + + // Set buffer data. + "xorq %[crc0], " BUF0 "\n" + "xorq %[crc1], " BUF1 "\n" + "xorq %[crc2], " BUF2 "\n" + "xorq %[crc3], " BUF3 "\n" + + // LOAD crc of byte 0 and shift buffers. + "movzbl " BUF0L ", " TMP0W "\n" + "movq (%[table], " TMP0 ", 8), %[crc0]\n" + "movzbl " BUF1L ", " TMP0W "\n" + "movq (%[table], " TMP0 ", 8), %[crc1]\n" + "movzbl " BUF2L ", " TMP0W "\n" + "movq (%[table], " TMP0 ", 8), %[crc2]\n" + "movzbl " BUF3L ", " TMP0W "\n" + "movq (%[table], " TMP0 ", 8), %[crc3]\n" + +#define XOR1(byte1) \ + "movzbl " BUF0L ", " TMP0W "\n" \ + "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc0]\n" \ + "movzbl " BUF1L ", " TMP0W "\n" \ + "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc1]\n" \ + "movzbl " BUF2L ", " TMP0W "\n" \ + "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc2]\n" \ + "movzbl " BUF3L ", " TMP0W "\n" \ + "xorq " #byte1 "*256*8(%[table], " TMP0 ", 8), %[crc3]\n" + +#define XOR2(byte2) \ + "movzbl " BUF0H ", " TMP0W "\n" \ + "shrq $16, " BUF0 "\n" \ + "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc0]\n" \ + "movzbl " BUF1H ", " TMP0W "\n" \ + "shrq $16, " BUF1 "\n" \ + "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc1]\n" \ + "movzbl " BUF2H ", " TMP0W "\n" \ + "shrq $16, " BUF2 "\n" \ + "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc2]\n" \ + "movzbl " BUF3H ", " TMP0W "\n" \ + "shrq $16, " BUF3 "\n" \ + "xorq " #byte2 "*256*8(%[table], " TMP0 ", 8), %[crc3]\n" + + XOR2(1) + XOR1(2) + XOR2(3) + XOR1(4) + XOR2(5) + XOR1(6) + + // Update CRC registers and load buffers. + "movzbl " BUF0H ", " TMP0W "\n" + "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc0]\n" + "movq (%[src]), " BUF0 "\n" + "movzbl " BUF1H ", " TMP0W "\n" + "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc1]\n" + "movq 1*8(%[src]), " BUF1 "\n" + "movzbl " BUF2H ", " TMP0W "\n" + "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc2]\n" + "movq 2*8(%[src]), " BUF2 "\n" + "movzbl " BUF3H ", " TMP0W "\n" + "xorq 7*256*8(%[table], " TMP0 ", 8), %[crc3]\n" + "movq 3*8(%[src]), " BUF3 "\n" + + "cmpq %[src], %[end]\n" + "ja 1b\n" + + CRC_WORD_ASM() + + "xorq %[crc1], " BUF1 "\n" + "movq " BUF1 ", " BUF0 "\n" + CRC_WORD_ASM() + + "xorq %[crc2], " BUF2 "\n" + "movq " BUF2 ", " BUF0 "\n" + CRC_WORD_ASM() + + "xorq %[crc3], " BUF3 "\n" + "movq " BUF3 ", " BUF0 "\n" + CRC_WORD_ASM() + + "add $4*8, %[src]\n" + + "2:\n" + "add $2*4*8 - 8, %[end]\n" + "cmpq %[src], %[end]\n" + "jbe 4f\n" + + "3:\n" + "movq (%[src]), " BUF0 "\n" + "add $8, %[src]\n" + CRC_WORD_ASM() + "cmpq %[src], %[end]\n" + "ja 3b\n" + + "4:\n" + "add $7, %[end]\n" + + "cmpq %[src], %[end]\n" + "jbe 6f\n" + + "5:\n" + "movzbq (%[src]), " BUF0 "\n" + "movzbq %b[crc0], " TMP0 "\n" + "shrq $8, %[crc0]\n" + "xorq " BUF0 ", " TMP0 "\n" + "add $1, %[src]\n" + "xorq 7*256*8(%[table_word], " TMP0 ", 8), %[crc0]\n" + "cmpq %[src], %[end]\n" + "ja 5b\n" + + "6:\n" + + + : // outputs + [src] "+r" (src), + [end] "+r" (end), + [crc0] "+r" (crc0), + [crc1] "=&r" (crc1), + [crc2] "=&r" (crc2), + [crc3] "=&r" (crc3) + + : // inputs + [table] "r" (&this->crc_word_interleaved_[0][0]), + [table_word] "r" (&this->crc_word_[0][0]) + + : // clobbers + "%rax", // BUF0 + "%rbx", // BUF1 + "%rcx", // BUF2 + "%rdx", // BUF3 + "%rsi" // TMP0 + ); + + return (crc0 ^ this->Base().Canonize()); +} + +} // namespace crcutil + +#endif // defined(__GNUC__) && HAVE_AMD64 && CRCUTIL_USE_ASM diff --git a/contrib/libs/crcutil/multiword_64_64_gcc_i386_mmx.cc b/contrib/libs/crcutil/multiword_64_64_gcc_i386_mmx.cc index 6bd4ead5bd..2bb6e59f3b 100644 --- a/contrib/libs/crcutil/multiword_64_64_gcc_i386_mmx.cc +++ b/contrib/libs/crcutil/multiword_64_64_gcc_i386_mmx.cc @@ -1,261 +1,261 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements multiword CRC for GCC on i386. - -#include "generic_crc.h" - -#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX - -namespace crcutil { - -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( - const void *data, size_t bytes, const uint64 &start) - const GCC_OMIT_FRAME_POINTER; - -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( - const void *data, size_t bytes, const uint64 &start) const { - if (bytes <= 7) { - const uint8 *src = static_cast<const uint8 *>(data); - uint64 crc = start ^ this->Base().Canonize(); - for (const uint8 *end = src + bytes; src < end; ++src) { - CRC_BYTE(this, crc, *src); - } - return (crc ^ this->Base().Canonize()); - } - return CrcMultiwordI386Mmx(data, bytes, start); -} - -#define CRC_WORD_MMX() \ - "pxor %[crc0], %[buf0]\n" \ - "movd %[buf0], %[tmp0]\n" \ - "psrlq $32, %[buf0]\n" \ - "movzbl %b[tmp0], %[temp]\n" \ - "shrl $8, %[tmp0]\n" \ - "movq (%[table], %[temp], 8), %[crc0]\n" \ - "movzbl %b[tmp0], %[temp]\n" \ - "shrl $8, %[tmp0]\n" \ - "pxor 1*256*8(%[table], %[temp], 8), %[crc0]\n" \ - "movzbl %b[tmp0], %[temp]\n" \ - "shrl $8, %[tmp0]\n" \ - "pxor 2*256*8(%[table], %[temp], 8), %[crc0]\n" \ - "pxor 3*256*8(%[table], %[tmp0], 8), %[crc0]\n" \ - "movd %[buf0], %[tmp0]\n" \ - "movzbl %b[tmp0], %[temp]\n" \ - "shrl $8, %[tmp0]\n" \ - "pxor 4*256*8(%[table], %[temp], 8), %[crc0]\n" \ - "movzbl %b[tmp0], %[temp]\n" \ - "shrl $8, %[tmp0]\n" \ - "pxor 5*256*8(%[table], %[temp], 8), %[crc0]\n" \ - "movzbl %b[tmp0], %[temp]\n" \ - "shrl $8, %[tmp0]\n" \ - "pxor 6*256*8(%[table], %[temp], 8), %[crc0]\n" \ - "pxor 7*256*8(%[table], %[tmp0], 8), %[crc0]\n" - -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( - const void *data, size_t bytes, const uint64 &start) const { - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - uint64 crc0 = start ^ this->Base().Canonize(); - - ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64); - if (src >= end) { - return (crc0 ^ this->Base().Canonize()); - } - - uint64 crc1; - uint64 crc2; - uint64 crc3; - - uint64 buf0; - uint64 buf1; - uint64 buf2; - uint64 buf3; - - uint32 tmp0; - uint32 tmp1; - uint32 tmp2; - uint32 tmp3; - - uint32 temp; - - void *table_ptr; - const uint64 *table_interleaved = &this->crc_word_interleaved_[0][0]; - const uint64 *table_word = &this->crc_word_[0][0]; - - asm( +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements multiword CRC for GCC on i386. + +#include "generic_crc.h" + +#if defined(__GNUC__) && CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX + +namespace crcutil { + +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( + const void *data, size_t bytes, const uint64 &start) + const GCC_OMIT_FRAME_POINTER; + +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( + const void *data, size_t bytes, const uint64 &start) const { + if (bytes <= 7) { + const uint8 *src = static_cast<const uint8 *>(data); + uint64 crc = start ^ this->Base().Canonize(); + for (const uint8 *end = src + bytes; src < end; ++src) { + CRC_BYTE(this, crc, *src); + } + return (crc ^ this->Base().Canonize()); + } + return CrcMultiwordI386Mmx(data, bytes, start); +} + +#define CRC_WORD_MMX() \ + "pxor %[crc0], %[buf0]\n" \ + "movd %[buf0], %[tmp0]\n" \ + "psrlq $32, %[buf0]\n" \ + "movzbl %b[tmp0], %[temp]\n" \ + "shrl $8, %[tmp0]\n" \ + "movq (%[table], %[temp], 8), %[crc0]\n" \ + "movzbl %b[tmp0], %[temp]\n" \ + "shrl $8, %[tmp0]\n" \ + "pxor 1*256*8(%[table], %[temp], 8), %[crc0]\n" \ + "movzbl %b[tmp0], %[temp]\n" \ + "shrl $8, %[tmp0]\n" \ + "pxor 2*256*8(%[table], %[temp], 8), %[crc0]\n" \ + "pxor 3*256*8(%[table], %[tmp0], 8), %[crc0]\n" \ + "movd %[buf0], %[tmp0]\n" \ + "movzbl %b[tmp0], %[temp]\n" \ + "shrl $8, %[tmp0]\n" \ + "pxor 4*256*8(%[table], %[temp], 8), %[crc0]\n" \ + "movzbl %b[tmp0], %[temp]\n" \ + "shrl $8, %[tmp0]\n" \ + "pxor 5*256*8(%[table], %[temp], 8), %[crc0]\n" \ + "movzbl %b[tmp0], %[temp]\n" \ + "shrl $8, %[tmp0]\n" \ + "pxor 6*256*8(%[table], %[temp], 8), %[crc0]\n" \ + "pxor 7*256*8(%[table], %[tmp0], 8), %[crc0]\n" + +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( + const void *data, size_t bytes, const uint64 &start) const { + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + uint64 crc0 = start ^ this->Base().Canonize(); + + ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64); + if (src >= end) { + return (crc0 ^ this->Base().Canonize()); + } + + uint64 crc1; + uint64 crc2; + uint64 crc3; + + uint64 buf0; + uint64 buf1; + uint64 buf2; + uint64 buf3; + + uint32 tmp0; + uint32 tmp1; + uint32 tmp2; + uint32 tmp3; + + uint32 temp; + + void *table_ptr; + const uint64 *table_interleaved = &this->crc_word_interleaved_[0][0]; + const uint64 *table_word = &this->crc_word_[0][0]; + + asm( "subl $2*4*8 - 1, %[end]\n" - "cmpl %[src], %[end]\n" - "jbe 2f\n" - - "pxor %[crc1], %[crc1]\n" - "pxor %[crc2], %[crc2]\n" - "pxor %[crc3], %[crc3]\n" - "movq (%[src]), %[buf0]\n" - "movq 1*8(%[src]), %[buf1]\n" - "movq 2*8(%[src]), %[buf2]\n" - "movq 3*8(%[src]), %[buf3]\n" - - "movl %[table_interleaved], %[table]\n" - "1:\n" -#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 - "prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n" -#endif - "addl $0x20, %[src]\n" - "pxor %[crc0], %[buf0]\n" - "pxor %[crc1], %[buf1]\n" - "pxor %[crc2], %[buf2]\n" - "pxor %[crc3], %[buf3]\n" - - "movd %[buf0], %[tmp0]\n" - "psrlq $32, %[buf0]\n" - "movd %[buf1], %[tmp1]\n" - "psrlq $32, %[buf1]\n" - "movd %[buf2], %[tmp2]\n" - "psrlq $32, %[buf2]\n" - "movd %[buf3], %[tmp3]\n" - "psrlq $32, %[buf3]\n" - - "movzbl %b[tmp0], %[temp]\n" - "shrl $8, %[tmp0]\n" - "movq (%[table], %[temp], 8), %[crc0]\n" - "movzbl %b[tmp1], %[temp]\n" - "shrl $8, %[tmp1]\n" - "movq (%[table], %[temp], 8), %[crc1]\n" - "movzbl %b[tmp2], %[temp]\n" - "shrl $8, %[tmp2]\n" - "movq (%[table], %[temp], 8), %[crc2]\n" - "movzbl %b[tmp3], %[temp]\n" - "shrl $8, %[tmp3]\n" - "movq (%[table], %[temp], 8), %[crc3]\n" - -#define XOR(byte) \ - "movzbl %b[tmp0], %[temp]\n" \ - "shrl $8, %[tmp0]\n" \ - "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc0]\n" \ - "movzbl %b[tmp1], %[temp]\n" \ - "shrl $8, %[tmp1]\n" \ - "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc1]\n" \ - "movzbl %b[tmp2], %[temp]\n" \ - "shrl $8, %[tmp2]\n" \ - "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc2]\n" \ - "movzbl %b[tmp3], %[temp]\n" \ - "shrl $8, %[tmp3]\n" \ - "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc3]\n" - - XOR(1) - XOR(2) - - "pxor 3*256*8(%[table], %[tmp0], 8), %[crc0]\n" - "movd %[buf0], %[tmp0]\n" - "pxor 3*256*8(%[table], %[tmp1], 8), %[crc1]\n" - "movd %[buf1], %[tmp1]\n" - "pxor 3*256*8(%[table], %[tmp2], 8), %[crc2]\n" - "movd %[buf2], %[tmp2]\n" - "pxor 3*256*8(%[table], %[tmp3], 8), %[crc3]\n" - "movd %[buf3], %[tmp3]\n" - - XOR(4) - XOR(5) - XOR(6) - - "pxor 7*256*8(%[table], %[tmp0], 8), %[crc0]\n" - "movq (%[src]), %[buf0]\n" - "pxor 7*256*8(%[table], %[tmp1], 8), %[crc1]\n" - "movq 1*8(%[src]), %[buf1]\n" - "pxor 7*256*8(%[table], %[tmp2], 8), %[crc2]\n" - "movq 2*8(%[src]), %[buf2]\n" - "pxor 7*256*8(%[table], %[tmp3], 8), %[crc3]\n" - "movq 3*8(%[src]), %[buf3]\n" - "cmpl %[src], %[end]\n" - "ja 1b\n" -#undef XOR - - "movl %[table_word], %[table]\n" - CRC_WORD_MMX() - - "pxor %[crc1], %[buf1]\n" - "movq %[buf1], %[buf0]\n" - CRC_WORD_MMX() - - "pxor %[crc2], %[buf2]\n" - "movq %[buf2], %[buf0]\n" - CRC_WORD_MMX() - - "pxor %[crc3], %[buf3]\n" - "movq %[buf3], %[buf0]\n" - CRC_WORD_MMX() - - "add $4*8, %[src]\n" - "2:\n" - "movl %[table_word], %[table]\n" - + "cmpl %[src], %[end]\n" + "jbe 2f\n" + + "pxor %[crc1], %[crc1]\n" + "pxor %[crc2], %[crc2]\n" + "pxor %[crc3], %[crc3]\n" + "movq (%[src]), %[buf0]\n" + "movq 1*8(%[src]), %[buf1]\n" + "movq 2*8(%[src]), %[buf2]\n" + "movq 3*8(%[src]), %[buf3]\n" + + "movl %[table_interleaved], %[table]\n" + "1:\n" +#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 + "prefetcht0 " TO_STRING(CRCUTIL_PREFETCH_WIDTH) "(%[src])\n" +#endif + "addl $0x20, %[src]\n" + "pxor %[crc0], %[buf0]\n" + "pxor %[crc1], %[buf1]\n" + "pxor %[crc2], %[buf2]\n" + "pxor %[crc3], %[buf3]\n" + + "movd %[buf0], %[tmp0]\n" + "psrlq $32, %[buf0]\n" + "movd %[buf1], %[tmp1]\n" + "psrlq $32, %[buf1]\n" + "movd %[buf2], %[tmp2]\n" + "psrlq $32, %[buf2]\n" + "movd %[buf3], %[tmp3]\n" + "psrlq $32, %[buf3]\n" + + "movzbl %b[tmp0], %[temp]\n" + "shrl $8, %[tmp0]\n" + "movq (%[table], %[temp], 8), %[crc0]\n" + "movzbl %b[tmp1], %[temp]\n" + "shrl $8, %[tmp1]\n" + "movq (%[table], %[temp], 8), %[crc1]\n" + "movzbl %b[tmp2], %[temp]\n" + "shrl $8, %[tmp2]\n" + "movq (%[table], %[temp], 8), %[crc2]\n" + "movzbl %b[tmp3], %[temp]\n" + "shrl $8, %[tmp3]\n" + "movq (%[table], %[temp], 8), %[crc3]\n" + +#define XOR(byte) \ + "movzbl %b[tmp0], %[temp]\n" \ + "shrl $8, %[tmp0]\n" \ + "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc0]\n" \ + "movzbl %b[tmp1], %[temp]\n" \ + "shrl $8, %[tmp1]\n" \ + "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc1]\n" \ + "movzbl %b[tmp2], %[temp]\n" \ + "shrl $8, %[tmp2]\n" \ + "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc2]\n" \ + "movzbl %b[tmp3], %[temp]\n" \ + "shrl $8, %[tmp3]\n" \ + "pxor " #byte "*256*8(%[table], %[temp], 8), %[crc3]\n" + + XOR(1) + XOR(2) + + "pxor 3*256*8(%[table], %[tmp0], 8), %[crc0]\n" + "movd %[buf0], %[tmp0]\n" + "pxor 3*256*8(%[table], %[tmp1], 8), %[crc1]\n" + "movd %[buf1], %[tmp1]\n" + "pxor 3*256*8(%[table], %[tmp2], 8), %[crc2]\n" + "movd %[buf2], %[tmp2]\n" + "pxor 3*256*8(%[table], %[tmp3], 8), %[crc3]\n" + "movd %[buf3], %[tmp3]\n" + + XOR(4) + XOR(5) + XOR(6) + + "pxor 7*256*8(%[table], %[tmp0], 8), %[crc0]\n" + "movq (%[src]), %[buf0]\n" + "pxor 7*256*8(%[table], %[tmp1], 8), %[crc1]\n" + "movq 1*8(%[src]), %[buf1]\n" + "pxor 7*256*8(%[table], %[tmp2], 8), %[crc2]\n" + "movq 2*8(%[src]), %[buf2]\n" + "pxor 7*256*8(%[table], %[tmp3], 8), %[crc3]\n" + "movq 3*8(%[src]), %[buf3]\n" + "cmpl %[src], %[end]\n" + "ja 1b\n" +#undef XOR + + "movl %[table_word], %[table]\n" + CRC_WORD_MMX() + + "pxor %[crc1], %[buf1]\n" + "movq %[buf1], %[buf0]\n" + CRC_WORD_MMX() + + "pxor %[crc2], %[buf2]\n" + "movq %[buf2], %[buf0]\n" + CRC_WORD_MMX() + + "pxor %[crc3], %[buf3]\n" + "movq %[buf3], %[buf0]\n" + CRC_WORD_MMX() + + "add $4*8, %[src]\n" + "2:\n" + "movl %[table_word], %[table]\n" + "addl $2*4*8 - 8, %[end]\n" - "cmpl %[src], %[end]\n" - "jbe 4f\n" - "3:\n" - "movq (%[src]), %[buf0]\n" - "addl $0x8, %[src]\n" - CRC_WORD_MMX() - "cmpl %[src], %[end]\n" - "ja 3b\n" - "4:\n" + "cmpl %[src], %[end]\n" + "jbe 4f\n" + "3:\n" + "movq (%[src]), %[buf0]\n" + "addl $0x8, %[src]\n" + CRC_WORD_MMX() + "cmpl %[src], %[end]\n" + "ja 3b\n" + "4:\n" "addl $7, %[end]\n" - - "cmpl %[src], %[end]\n" - "jbe 6f\n" - - "5:\n" - "movd %[crc0], %[tmp0]\n" - "movzbl (%[src]), %[temp]\n" - "movzbl %b[tmp0], %[tmp0]\n" - "psrlq $8, %[crc0]\n" - "xorl %[tmp0], %[temp]\n" - "add $1, %[src]\n" - "pxor 7*256*8(%[table], %[temp], 8), %[crc0]\n" - "cmpl %[src], %[end]\n" - "ja 5b\n" - - "6:\n" - - : // outputs - [src] "+r" (src), - [end] "+m" (end), - [crc0] "+y" (crc0), - [crc1] "=&y" (crc1), - [crc2] "=&y" (crc2), - [crc3] "=&y" (crc3), - [buf0] "=&y" (buf0), - [buf1] "=&y" (buf1), - [buf2] "=&y" (buf2), - [buf3] "=&y" (buf3), - [tmp0] "=&q" (tmp0), - [tmp1] "=&q" (tmp1), - [tmp2] "=&q" (tmp2), - [tmp3] "=&q" (tmp3), - [temp] "=&r" (temp), - [table] "=&r" (table_ptr) - - : // inputs - [table_interleaved] "m" (table_interleaved), - [table_word] "m" (table_word)); - - asm volatile("emms"); - - return (crc0 ^ this->Base().Canonize()); -} - -} // namespace crcutil - -#endif // defined(__GNUC__) && HAVE_AMD64 && CRCUTIL_USE_ASM + + "cmpl %[src], %[end]\n" + "jbe 6f\n" + + "5:\n" + "movd %[crc0], %[tmp0]\n" + "movzbl (%[src]), %[temp]\n" + "movzbl %b[tmp0], %[tmp0]\n" + "psrlq $8, %[crc0]\n" + "xorl %[tmp0], %[temp]\n" + "add $1, %[src]\n" + "pxor 7*256*8(%[table], %[temp], 8), %[crc0]\n" + "cmpl %[src], %[end]\n" + "ja 5b\n" + + "6:\n" + + : // outputs + [src] "+r" (src), + [end] "+m" (end), + [crc0] "+y" (crc0), + [crc1] "=&y" (crc1), + [crc2] "=&y" (crc2), + [crc3] "=&y" (crc3), + [buf0] "=&y" (buf0), + [buf1] "=&y" (buf1), + [buf2] "=&y" (buf2), + [buf3] "=&y" (buf3), + [tmp0] "=&q" (tmp0), + [tmp1] "=&q" (tmp1), + [tmp2] "=&q" (tmp2), + [tmp3] "=&q" (tmp3), + [temp] "=&r" (temp), + [table] "=&r" (table_ptr) + + : // inputs + [table_interleaved] "m" (table_interleaved), + [table_word] "m" (table_word)); + + asm volatile("emms"); + + return (crc0 ^ this->Base().Canonize()); +} + +} // namespace crcutil + +#endif // defined(__GNUC__) && HAVE_AMD64 && CRCUTIL_USE_ASM diff --git a/contrib/libs/crcutil/multiword_64_64_intrinsic_i386_mmx.cc b/contrib/libs/crcutil/multiword_64_64_intrinsic_i386_mmx.cc index 302af5a5a4..a37d4d251b 100644 --- a/contrib/libs/crcutil/multiword_64_64_intrinsic_i386_mmx.cc +++ b/contrib/libs/crcutil/multiword_64_64_intrinsic_i386_mmx.cc @@ -1,243 +1,243 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements 64-bit multiword CRC using MMX built-in functions. - -#include "generic_crc.h" - -#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX - -namespace crcutil { - -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( - const void *data, size_t bytes, const uint64 &start) - const GCC_OMIT_FRAME_POINTER; - -#if !defined(_MSC_VER) -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( - const void *data, - size_t bytes, - const uint64 &start) const { - if (bytes <= 7) { - const uint8 *src = static_cast<const uint8 *>(data); - uint64 crc = start ^ Base().Canonize(); - for (const uint8 *end = src + bytes; src < end; ++src) { - CRC_BYTE(this, crc, *src); - } - return (crc ^ Base().Canonize()); - } - return CrcMultiwordI386Mmx(data, bytes, start); -} -#else -#pragma warning(push) -// CL: uninitialized local variable 'crc1' used -// Wrong: crc1 = XOR(crc1, crc1) sets it to 0. -#pragma warning(disable: 4700) - -#pragma warning(disable: 4619) // there is no warning number '592' - -// ICL: variable "crc1" is used before its value is set -// Wrong: crc1 = XOR(crc1, crc1) sets it to 0. -#pragma warning(disable: 592) -#endif // !defined(_MSC_VER) - -#define MM64(adr) reinterpret_cast<const __m64 *>(adr) -#define MM64_TABLE(byte) MM64(crc_word_interleaved_[byte]) - -#define CRC_WORD_MMX(this, crc, buf) do { \ - buf = _mm_xor_si64(buf, crc); \ - uint32 tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \ - buf = _mm_srli_si64(buf, 32); \ - crc = MM64(crc_word_[0])[TO_BYTE(tmp)]; \ - tmp >>= 8; \ - crc = _mm_xor_si64(crc, MM64(crc_word_[1])[TO_BYTE(tmp)]); \ - tmp >>= 8; \ - crc = _mm_xor_si64(crc, MM64(crc_word_[2])[TO_BYTE(tmp)]); \ - tmp >>= 8; \ - crc = _mm_xor_si64(crc, MM64(crc_word_[3])[tmp]); \ - tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \ - crc = _mm_xor_si64(crc, MM64(crc_word_[4])[TO_BYTE(tmp)]); \ - tmp >>= 8; \ - crc = _mm_xor_si64(crc, MM64(crc_word_[5])[TO_BYTE(tmp)]); \ - tmp >>= 8; \ - crc = _mm_xor_si64(crc, MM64(crc_word_[6])[TO_BYTE(tmp)]); \ - tmp >>= 8; \ - crc = _mm_xor_si64(crc, MM64(crc_word_[7])[tmp]); \ -} while (0) - -template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( - const void *data, size_t bytes, const uint64 &start) const { - const uint8 *src = static_cast<const uint8 *>(data); - const uint8 *end = src + bytes; - uint64 crc = start ^ Base().Canonize(); - - ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc, uint64); - if (src >= end) { - return (crc ^ Base().Canonize()); - } - - // Process 4 registers of sizeof(uint64) bytes at once. - bytes = static_cast<size_t>(end - src) & ~(4*8 - 1); - if (bytes > 4*8) { - const uint8 *stop = src + bytes - 4*8; - union { - __m64 m64; - uint64 u64; - } temp; - __m64 crc0; - __m64 crc1; - __m64 crc2; - __m64 crc3; - __m64 buf0 = MM64(src)[0]; - __m64 buf1 = MM64(src)[1]; - __m64 buf2 = MM64(src)[2]; - __m64 buf3 = MM64(src)[3]; - - temp.u64 = crc; - crc0 = temp.m64; -#if defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 4) - // There is no way to suppress a warning in GCC; - // generate extra assignments. - temp.u64 = 0; - crc1 = temp.m64; - crc2 = temp.m64; - crc3 = temp.m64; -#else - crc1 = _mm_xor_si64(crc1, crc1); - crc2 = _mm_xor_si64(crc2, crc2); - crc3 = _mm_xor_si64(crc3, crc3); -#endif // defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 4) - - do { - PREFETCH(src); - src += 4*8; - - buf0 = _mm_xor_si64(buf0, crc0); - buf1 = _mm_xor_si64(buf1, crc1); - buf2 = _mm_xor_si64(buf2, crc2); - buf3 = _mm_xor_si64(buf3, crc3); - - uint32 tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0)); - uint32 tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1)); - uint32 tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2)); - uint32 tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3)); - - buf0 = _mm_srli_si64(buf0, 32); - buf1 = _mm_srli_si64(buf1, 32); - buf2 = _mm_srli_si64(buf2, 32); - buf3 = _mm_srli_si64(buf3, 32); - - crc0 = MM64_TABLE(0)[TO_BYTE(tmp0)]; - tmp0 >>= 8; - crc1 = MM64_TABLE(0)[TO_BYTE(tmp1)]; - tmp1 >>= 8; - crc2 = MM64_TABLE(0)[TO_BYTE(tmp2)]; - tmp2 >>= 8; - crc3 = MM64_TABLE(0)[TO_BYTE(tmp3)]; - tmp3 >>= 8; - -#define XOR(byte) do { \ - crc0 = _mm_xor_si64(crc0, MM64_TABLE(byte)[TO_BYTE(tmp0)]); \ - tmp0 >>= 8; \ - crc1 = _mm_xor_si64(crc1, MM64_TABLE(byte)[TO_BYTE(tmp1)]); \ - tmp1 >>= 8; \ - crc2 = _mm_xor_si64(crc2, MM64_TABLE(byte)[TO_BYTE(tmp2)]); \ - tmp2 >>= 8; \ - crc3 = _mm_xor_si64(crc3, MM64_TABLE(byte)[TO_BYTE(tmp3)]); \ - tmp3 >>= 8; \ -} while (0) - - XOR(1); - XOR(2); - - crc0 = _mm_xor_si64(crc0, MM64_TABLE(3)[tmp0]); - tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0)); - crc1 = _mm_xor_si64(crc1, MM64_TABLE(3)[tmp1]); - tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1)); - crc2 = _mm_xor_si64(crc2, MM64_TABLE(3)[tmp2]); - tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2)); - crc3 = _mm_xor_si64(crc3, MM64_TABLE(3)[tmp3]); - tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3)); - - XOR(4); - XOR(5); - XOR(6); - -#undef XOR - - crc0 = _mm_xor_si64(crc0, MM64_TABLE(sizeof(uint64) - 1)[tmp0]); - buf0 = MM64(src)[0]; - crc1 = _mm_xor_si64(crc1, MM64_TABLE(sizeof(uint64) - 1)[tmp1]); - buf1 = MM64(src)[1]; - crc2 = _mm_xor_si64(crc2, MM64_TABLE(sizeof(uint64) - 1)[tmp2]); - buf2 = MM64(src)[2]; - crc3 = _mm_xor_si64(crc3, MM64_TABLE(sizeof(uint64) - 1)[tmp3]); - buf3 = MM64(src)[3]; - } - while (src < stop); - - CRC_WORD_MMX(this, crc0, buf0); - buf1 = _mm_xor_si64(buf1, crc1); - CRC_WORD_MMX(this, crc0, buf1); - buf2 = _mm_xor_si64(buf2, crc2); - CRC_WORD_MMX(this, crc0, buf2); - buf3 = _mm_xor_si64(buf3, crc3); - CRC_WORD_MMX(this, crc0, buf3); - - temp.m64 = crc0; - crc = temp.u64; - - _mm_empty(); - - src += 4*8; - } - - // Process sizeof(uint64) bytes at once. - bytes = static_cast<size_t>(end - src) & ~(sizeof(uint64) - 1); - if (bytes > 0) { - union { - __m64 m64; - uint64 u64; - } temp; - __m64 crc0; - - temp.u64 = crc; - crc0 = temp.m64; - - for (const uint8 *stop = src + bytes; src < stop; src += sizeof(uint64)) { - __m64 buf0 = MM64(src)[0]; - CRC_WORD_MMX(this, crc0, buf0); - } - - temp.m64 = crc0; - crc = temp.u64; - - _mm_empty(); - } - - // Compute CRC of remaining bytes. - for (;src < end; ++src) { - CRC_BYTE(this, crc, *src); - } - - return (crc ^ Base().Canonize()); -} - -#if defined(_MSC_VER) -#pragma warning(pop) -#endif // defined(_MSC_VER) - -} // namespace crcutil - -#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements 64-bit multiword CRC using MMX built-in functions. + +#include "generic_crc.h" + +#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX + +namespace crcutil { + +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( + const void *data, size_t bytes, const uint64 &start) + const GCC_OMIT_FRAME_POINTER; + +#if !defined(_MSC_VER) +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( + const void *data, + size_t bytes, + const uint64 &start) const { + if (bytes <= 7) { + const uint8 *src = static_cast<const uint8 *>(data); + uint64 crc = start ^ Base().Canonize(); + for (const uint8 *end = src + bytes; src < end; ++src) { + CRC_BYTE(this, crc, *src); + } + return (crc ^ Base().Canonize()); + } + return CrcMultiwordI386Mmx(data, bytes, start); +} +#else +#pragma warning(push) +// CL: uninitialized local variable 'crc1' used +// Wrong: crc1 = XOR(crc1, crc1) sets it to 0. +#pragma warning(disable: 4700) + +#pragma warning(disable: 4619) // there is no warning number '592' + +// ICL: variable "crc1" is used before its value is set +// Wrong: crc1 = XOR(crc1, crc1) sets it to 0. +#pragma warning(disable: 592) +#endif // !defined(_MSC_VER) + +#define MM64(adr) reinterpret_cast<const __m64 *>(adr) +#define MM64_TABLE(byte) MM64(crc_word_interleaved_[byte]) + +#define CRC_WORD_MMX(this, crc, buf) do { \ + buf = _mm_xor_si64(buf, crc); \ + uint32 tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \ + buf = _mm_srli_si64(buf, 32); \ + crc = MM64(crc_word_[0])[TO_BYTE(tmp)]; \ + tmp >>= 8; \ + crc = _mm_xor_si64(crc, MM64(crc_word_[1])[TO_BYTE(tmp)]); \ + tmp >>= 8; \ + crc = _mm_xor_si64(crc, MM64(crc_word_[2])[TO_BYTE(tmp)]); \ + tmp >>= 8; \ + crc = _mm_xor_si64(crc, MM64(crc_word_[3])[tmp]); \ + tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \ + crc = _mm_xor_si64(crc, MM64(crc_word_[4])[TO_BYTE(tmp)]); \ + tmp >>= 8; \ + crc = _mm_xor_si64(crc, MM64(crc_word_[5])[TO_BYTE(tmp)]); \ + tmp >>= 8; \ + crc = _mm_xor_si64(crc, MM64(crc_word_[6])[TO_BYTE(tmp)]); \ + tmp >>= 8; \ + crc = _mm_xor_si64(crc, MM64(crc_word_[7])[tmp]); \ +} while (0) + +template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( + const void *data, size_t bytes, const uint64 &start) const { + const uint8 *src = static_cast<const uint8 *>(data); + const uint8 *end = src + bytes; + uint64 crc = start ^ Base().Canonize(); + + ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc, uint64); + if (src >= end) { + return (crc ^ Base().Canonize()); + } + + // Process 4 registers of sizeof(uint64) bytes at once. + bytes = static_cast<size_t>(end - src) & ~(4*8 - 1); + if (bytes > 4*8) { + const uint8 *stop = src + bytes - 4*8; + union { + __m64 m64; + uint64 u64; + } temp; + __m64 crc0; + __m64 crc1; + __m64 crc2; + __m64 crc3; + __m64 buf0 = MM64(src)[0]; + __m64 buf1 = MM64(src)[1]; + __m64 buf2 = MM64(src)[2]; + __m64 buf3 = MM64(src)[3]; + + temp.u64 = crc; + crc0 = temp.m64; +#if defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 4) + // There is no way to suppress a warning in GCC; + // generate extra assignments. + temp.u64 = 0; + crc1 = temp.m64; + crc2 = temp.m64; + crc3 = temp.m64; +#else + crc1 = _mm_xor_si64(crc1, crc1); + crc2 = _mm_xor_si64(crc2, crc2); + crc3 = _mm_xor_si64(crc3, crc3); +#endif // defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 4) + + do { + PREFETCH(src); + src += 4*8; + + buf0 = _mm_xor_si64(buf0, crc0); + buf1 = _mm_xor_si64(buf1, crc1); + buf2 = _mm_xor_si64(buf2, crc2); + buf3 = _mm_xor_si64(buf3, crc3); + + uint32 tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0)); + uint32 tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1)); + uint32 tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2)); + uint32 tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3)); + + buf0 = _mm_srli_si64(buf0, 32); + buf1 = _mm_srli_si64(buf1, 32); + buf2 = _mm_srli_si64(buf2, 32); + buf3 = _mm_srli_si64(buf3, 32); + + crc0 = MM64_TABLE(0)[TO_BYTE(tmp0)]; + tmp0 >>= 8; + crc1 = MM64_TABLE(0)[TO_BYTE(tmp1)]; + tmp1 >>= 8; + crc2 = MM64_TABLE(0)[TO_BYTE(tmp2)]; + tmp2 >>= 8; + crc3 = MM64_TABLE(0)[TO_BYTE(tmp3)]; + tmp3 >>= 8; + +#define XOR(byte) do { \ + crc0 = _mm_xor_si64(crc0, MM64_TABLE(byte)[TO_BYTE(tmp0)]); \ + tmp0 >>= 8; \ + crc1 = _mm_xor_si64(crc1, MM64_TABLE(byte)[TO_BYTE(tmp1)]); \ + tmp1 >>= 8; \ + crc2 = _mm_xor_si64(crc2, MM64_TABLE(byte)[TO_BYTE(tmp2)]); \ + tmp2 >>= 8; \ + crc3 = _mm_xor_si64(crc3, MM64_TABLE(byte)[TO_BYTE(tmp3)]); \ + tmp3 >>= 8; \ +} while (0) + + XOR(1); + XOR(2); + + crc0 = _mm_xor_si64(crc0, MM64_TABLE(3)[tmp0]); + tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0)); + crc1 = _mm_xor_si64(crc1, MM64_TABLE(3)[tmp1]); + tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1)); + crc2 = _mm_xor_si64(crc2, MM64_TABLE(3)[tmp2]); + tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2)); + crc3 = _mm_xor_si64(crc3, MM64_TABLE(3)[tmp3]); + tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3)); + + XOR(4); + XOR(5); + XOR(6); + +#undef XOR + + crc0 = _mm_xor_si64(crc0, MM64_TABLE(sizeof(uint64) - 1)[tmp0]); + buf0 = MM64(src)[0]; + crc1 = _mm_xor_si64(crc1, MM64_TABLE(sizeof(uint64) - 1)[tmp1]); + buf1 = MM64(src)[1]; + crc2 = _mm_xor_si64(crc2, MM64_TABLE(sizeof(uint64) - 1)[tmp2]); + buf2 = MM64(src)[2]; + crc3 = _mm_xor_si64(crc3, MM64_TABLE(sizeof(uint64) - 1)[tmp3]); + buf3 = MM64(src)[3]; + } + while (src < stop); + + CRC_WORD_MMX(this, crc0, buf0); + buf1 = _mm_xor_si64(buf1, crc1); + CRC_WORD_MMX(this, crc0, buf1); + buf2 = _mm_xor_si64(buf2, crc2); + CRC_WORD_MMX(this, crc0, buf2); + buf3 = _mm_xor_si64(buf3, crc3); + CRC_WORD_MMX(this, crc0, buf3); + + temp.m64 = crc0; + crc = temp.u64; + + _mm_empty(); + + src += 4*8; + } + + // Process sizeof(uint64) bytes at once. + bytes = static_cast<size_t>(end - src) & ~(sizeof(uint64) - 1); + if (bytes > 0) { + union { + __m64 m64; + uint64 u64; + } temp; + __m64 crc0; + + temp.u64 = crc; + crc0 = temp.m64; + + for (const uint8 *stop = src + bytes; src < stop; src += sizeof(uint64)) { + __m64 buf0 = MM64(src)[0]; + CRC_WORD_MMX(this, crc0, buf0); + } + + temp.m64 = crc0; + crc = temp.u64; + + _mm_empty(); + } + + // Compute CRC of remaining bytes. + for (;src < end; ++src) { + CRC_BYTE(this, crc, *src); + } + + return (crc ^ Base().Canonize()); +} + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif // defined(_MSC_VER) + +} // namespace crcutil + +#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX diff --git a/contrib/libs/crcutil/platform.h b/contrib/libs/crcutil/platform.h index 936cf7d331..7a19ff2679 100644 --- a/contrib/libs/crcutil/platform.h +++ b/contrib/libs/crcutil/platform.h @@ -1,245 +1,245 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Detects configuration and defines compiler-specific macros. -// Also, sets user-defined CRUTIL_USE_* macros to default values. - -#ifndef CRCUTIL_PLATFORM_H_ -#define CRCUTIL_PLATFORM_H_ - -// Permanently disable some annoying warnings generated -// by Microsoft CL when compiling Microsoft's headers. -#include "std_headers.h" - -// Use inline asm version of the code? -#if !defined(CRCUTIL_USE_ASM) -#define CRCUTIL_USE_ASM 1 -#endif // !defined(CRCUTIL_USE_ASM) - - -#if !defined(HAVE_I386) -#if defined(__i386__) || defined(_M_IX86) -#define HAVE_I386 1 -#else -#define HAVE_I386 0 -#endif // defined(__i386__) || defined(_M_IX86) -#endif // defined(HAVE_I386) - - -#if !defined(HAVE_AMD64) -#if defined(__amd64__) || defined(_M_AMD64) -#define HAVE_AMD64 1 -#else -#define HAVE_AMD64 0 -#endif // defined(__amd64__) || defined(_M_AMD64) -#endif // defined(HAVE_AMD64) - - -#if HAVE_AMD64 || HAVE_I386 -#if defined(_MSC_VER) -#pragma warning(push) -// '_M_IX86' is not defined as a preprocessor macro -#pragma warning(disable: 4668) -#include <intrin.h> -#pragma warning(pop) -#endif // defined(_MSC_VER) - - -#if !defined(HAVE_MMX) -#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__MMX__)) -#define HAVE_MMX 1 -#else -#define HAVE_MMX 0 -#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__MMX__)) -#endif // !defined(HAVE_MMX) - - -#if !defined(HAVE_SSE) -#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE__)) -#include <xmmintrin.h> -#define HAVE_SSE 1 -#else -#define HAVE_SSE 0 -#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE__)) -#endif // !defined(HAVE_SSE) - - -#if !defined(HAVE_SSE2) -#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE2__)) -#include <emmintrin.h> -#define HAVE_SSE2 1 -#else -#define HAVE_SSE2 0 -#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE2__)) -#endif // !defined(HAVE_SSE2) - -#else - -#if !defined(HAVE_MMX) -#define HAVE_MMX 0 -#endif // !defined(HAVE_MMX) - -#if !defined(HAVE_SSE) -#define HAVE_SSE 0 -#endif // !defined(HAVE_SSE) - -#if !defined(HAVE_SSE2) -#define HAVE_SSE2 0 -#endif // !defined(HAVE_SSE2) - -#endif // HAVE_AMD64 || HAVE_I386 - -// Error checking -#if HAVE_SSE && !HAVE_MMX -#error SSE is available but not MMX? -#endif // HAVE_SSE && !HAVE_MMX - -#if HAVE_SSE2 && (!HAVE_SSE || !HAVE_MMX) -#error SSE2 is available but not SSE or MMX? -#endif // HAVE_SSE2 && (!HAVE_SSE || !HAVE_MMX) - - -#if !defined(CRCUTIL_PREFETCH_WIDTH) -// On newer X5550 CPU, heavily optimized CrcMultiword is 3% faster without -// prefetch for inputs smaller than 8MB and less than 1% slower for 8MB and -// larger blocks. On older Q9650 CPU, the code is 2-3% faster for inputs -// smaller than 8MB, 4-5% slower when length >= 8MB. -// Tested with prefetch length 256, 512, and 4096. -// -// At this moment there is no compelling reason to use prefetching. -// -#define CRCUTIL_PREFETCH_WIDTH 0 -#endif // !defined(CRCUTIL_PREFETCH_WIDTH) - - -#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 -#define PREFETCH(src) \ - _mm_prefetch(reinterpret_cast<const char *>(src) + CRCUTIL_PREFETCH_WIDTH, \ - _MM_HINT_T0) -#else -#define PREFETCH(src) -#endif // HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 - - -// If block size exceeds CRCUTIL_MIN_ALIGN_SIZE, align the data -// before accessing it at word boundary. See generic_crc.cc, -// ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() macro. -#if !defined(CRCUTIL_MIN_ALIGN_SIZE) -#if HAVE_AMD64 || HAVE_I386 -#define CRCUTIL_MIN_ALIGN_SIZE (1024) -#else -#define CRCUTIL_MIN_ALIGN_SIZE 0 -#endif // HAVE_AMD64 || HAVE_I386 -#endif // !defined(CRCUTIL_MIN_ALIGN_SIZE) - - -// Use _mm_crc32_u64/32/8 intrinics? -// If not, they will be implemented in software. -#if !HAVE_I386 && !HAVE_AMD64 - -#undef CRCUTIL_USE_MM_CRC32 -#define CRCUTIL_USE_MM_CRC32 0 - -#else - -#if !defined(CRCUTIL_USE_MM_CRC32) -#if defined(_MSC_VER) || defined(__GNUC__) -#define CRCUTIL_USE_MM_CRC32 1 -#else -#define CRCUTIL_USE_MM_CRC32 0 -#endif // defined(_MSC_VER) || defined(__GNUC__) -#endif // !defined(CRCUTIL_USE_MM_CRC32) - -#endif // !HAVE_I386 && !HAVE_AMD64 - - -// Stringize -- always handy. -#define TO_STRING_VALUE(arg) #arg -#define TO_STRING(arg) TO_STRING_VALUE(arg) - - -// Compilers give "right shift count >= width of type" warning even -// though the shift happens only under appropriate "if". -#define SHIFT_RIGHT_NO_WARNING(value, bits) \ - ((value) >> (((bits) < (8 * sizeof(value))) ? (bits) : 0)) -#define SHIFT_RIGHT_SAFE(value, bits) \ - ((bits) < (8 * sizeof(value)) ? SHIFT_RIGHT_NO_WARNING(value, bits) : 0) - -// The same for left shifts. -#define SHIFT_LEFT_NO_WARNING(value, bits) \ - ((value) << (((bits) < (8 * sizeof(value))) ? (bits) : 0)) -#define SHIFT_LEFT_SAFE(value, bits) \ - ((bits) < (8 * sizeof(value)) ? SHIFT_LEFT_NO_WARNING(value, bits) : 0) - -// GCC-specific macros. -// -#define GCC_VERSION_AVAILABLE(major, minor) \ - (defined(__GNUC__) && \ - (__GNUC__ > (major) || \ - (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) - - -#if defined(__GNUC__) - -// The GenericCrc tables must be properly aligned. -// Penalty for misalignment? 50% performance degradation. -// For 128-bit SSE2, the penalty is access violation. -#define GCC_ALIGN_ATTRIBUTE(n) __attribute__((aligned(n))) - -#if GCC_VERSION_AVAILABLE(4, 4) -// If not marked as "omit frame pointer", -// GCC won't be able to find enough registers. -#define GCC_OMIT_FRAME_POINTER \ - __attribute__((__optimize__(2, "omit-frame-pointer"))) -#endif // GCC_VERSION_AVAILABLE(4, 4) - -#if !defined(__forceinline) -#define __forceinline __attribute__((__always_inline__)) inline -#endif // !defined(__forceinline) - -#if defined(__APPLE_CC__) -// The version of GCC used by Max OS X xCode v 5664 does not understand -// "movq xmm, r64" instruction and requires the use of "movd" (probably -// because of the bug in GCC which treats "movq/movd xmm,r64 or r64,xmm" -// the same). -// -// Leaving common sense aside, let's peek into Intel's instruction -// reference manual. That's what description of MOVD command says: -// MOVD xmm, r/m32 (opcode 66 0F 6E /r) -// MOVD r/m32, xmm (opcode 66 0F 7E /r) -// MOVQ xmm, r/m64 (opcode 66 REX.W 0F 6E /r) -// MOVQ r/m64, xmm (opcode 66 REX.W 0F 7E /r) -#define SSE2_MOVQ "movd" -#else -#define SSE2_MOVQ "movq" -#endif // defined(__APPLE_CC__) - -#endif // defined(__GNUC__) - - -// Define compiler-specific macros that were not set yet. -#if !defined(_MSC_VER) && !defined(__forceinline) -#define __forceinline inline -#endif // !defined(_MSC_VER) && !defined(__forceinline) - -#if !defined(GCC_OMIT_FRAME_POINTER) -#define GCC_OMIT_FRAME_POINTER -#endif // !defined(GCC_OMIT_FRAME_POINTER) - -#if !defined(GCC_ALIGN_ATTRIBUTE) -#define GCC_ALIGN_ATTRIBUTE(n) -#endif // !defined(GCC_ALIGN_ATTRIBUTE) - - -#endif // CRCUTIL_PLATFORM_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Detects configuration and defines compiler-specific macros. +// Also, sets user-defined CRUTIL_USE_* macros to default values. + +#ifndef CRCUTIL_PLATFORM_H_ +#define CRCUTIL_PLATFORM_H_ + +// Permanently disable some annoying warnings generated +// by Microsoft CL when compiling Microsoft's headers. +#include "std_headers.h" + +// Use inline asm version of the code? +#if !defined(CRCUTIL_USE_ASM) +#define CRCUTIL_USE_ASM 1 +#endif // !defined(CRCUTIL_USE_ASM) + + +#if !defined(HAVE_I386) +#if defined(__i386__) || defined(_M_IX86) +#define HAVE_I386 1 +#else +#define HAVE_I386 0 +#endif // defined(__i386__) || defined(_M_IX86) +#endif // defined(HAVE_I386) + + +#if !defined(HAVE_AMD64) +#if defined(__amd64__) || defined(_M_AMD64) +#define HAVE_AMD64 1 +#else +#define HAVE_AMD64 0 +#endif // defined(__amd64__) || defined(_M_AMD64) +#endif // defined(HAVE_AMD64) + + +#if HAVE_AMD64 || HAVE_I386 +#if defined(_MSC_VER) +#pragma warning(push) +// '_M_IX86' is not defined as a preprocessor macro +#pragma warning(disable: 4668) +#include <intrin.h> +#pragma warning(pop) +#endif // defined(_MSC_VER) + + +#if !defined(HAVE_MMX) +#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__MMX__)) +#define HAVE_MMX 1 +#else +#define HAVE_MMX 0 +#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__MMX__)) +#endif // !defined(HAVE_MMX) + + +#if !defined(HAVE_SSE) +#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE__)) +#include <xmmintrin.h> +#define HAVE_SSE 1 +#else +#define HAVE_SSE 0 +#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE__)) +#endif // !defined(HAVE_SSE) + + +#if !defined(HAVE_SSE2) +#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE2__)) +#include <emmintrin.h> +#define HAVE_SSE2 1 +#else +#define HAVE_SSE2 0 +#endif // defined(_MSC_VER) || (defined(__GNUC__) && defined(__SSE2__)) +#endif // !defined(HAVE_SSE2) + +#else + +#if !defined(HAVE_MMX) +#define HAVE_MMX 0 +#endif // !defined(HAVE_MMX) + +#if !defined(HAVE_SSE) +#define HAVE_SSE 0 +#endif // !defined(HAVE_SSE) + +#if !defined(HAVE_SSE2) +#define HAVE_SSE2 0 +#endif // !defined(HAVE_SSE2) + +#endif // HAVE_AMD64 || HAVE_I386 + +// Error checking +#if HAVE_SSE && !HAVE_MMX +#error SSE is available but not MMX? +#endif // HAVE_SSE && !HAVE_MMX + +#if HAVE_SSE2 && (!HAVE_SSE || !HAVE_MMX) +#error SSE2 is available but not SSE or MMX? +#endif // HAVE_SSE2 && (!HAVE_SSE || !HAVE_MMX) + + +#if !defined(CRCUTIL_PREFETCH_WIDTH) +// On newer X5550 CPU, heavily optimized CrcMultiword is 3% faster without +// prefetch for inputs smaller than 8MB and less than 1% slower for 8MB and +// larger blocks. On older Q9650 CPU, the code is 2-3% faster for inputs +// smaller than 8MB, 4-5% slower when length >= 8MB. +// Tested with prefetch length 256, 512, and 4096. +// +// At this moment there is no compelling reason to use prefetching. +// +#define CRCUTIL_PREFETCH_WIDTH 0 +#endif // !defined(CRCUTIL_PREFETCH_WIDTH) + + +#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 +#define PREFETCH(src) \ + _mm_prefetch(reinterpret_cast<const char *>(src) + CRCUTIL_PREFETCH_WIDTH, \ + _MM_HINT_T0) +#else +#define PREFETCH(src) +#endif // HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0 + + +// If block size exceeds CRCUTIL_MIN_ALIGN_SIZE, align the data +// before accessing it at word boundary. See generic_crc.cc, +// ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() macro. +#if !defined(CRCUTIL_MIN_ALIGN_SIZE) +#if HAVE_AMD64 || HAVE_I386 +#define CRCUTIL_MIN_ALIGN_SIZE (1024) +#else +#define CRCUTIL_MIN_ALIGN_SIZE 0 +#endif // HAVE_AMD64 || HAVE_I386 +#endif // !defined(CRCUTIL_MIN_ALIGN_SIZE) + + +// Use _mm_crc32_u64/32/8 intrinics? +// If not, they will be implemented in software. +#if !HAVE_I386 && !HAVE_AMD64 + +#undef CRCUTIL_USE_MM_CRC32 +#define CRCUTIL_USE_MM_CRC32 0 + +#else + +#if !defined(CRCUTIL_USE_MM_CRC32) +#if defined(_MSC_VER) || defined(__GNUC__) +#define CRCUTIL_USE_MM_CRC32 1 +#else +#define CRCUTIL_USE_MM_CRC32 0 +#endif // defined(_MSC_VER) || defined(__GNUC__) +#endif // !defined(CRCUTIL_USE_MM_CRC32) + +#endif // !HAVE_I386 && !HAVE_AMD64 + + +// Stringize -- always handy. +#define TO_STRING_VALUE(arg) #arg +#define TO_STRING(arg) TO_STRING_VALUE(arg) + + +// Compilers give "right shift count >= width of type" warning even +// though the shift happens only under appropriate "if". +#define SHIFT_RIGHT_NO_WARNING(value, bits) \ + ((value) >> (((bits) < (8 * sizeof(value))) ? (bits) : 0)) +#define SHIFT_RIGHT_SAFE(value, bits) \ + ((bits) < (8 * sizeof(value)) ? SHIFT_RIGHT_NO_WARNING(value, bits) : 0) + +// The same for left shifts. +#define SHIFT_LEFT_NO_WARNING(value, bits) \ + ((value) << (((bits) < (8 * sizeof(value))) ? (bits) : 0)) +#define SHIFT_LEFT_SAFE(value, bits) \ + ((bits) < (8 * sizeof(value)) ? SHIFT_LEFT_NO_WARNING(value, bits) : 0) + +// GCC-specific macros. +// +#define GCC_VERSION_AVAILABLE(major, minor) \ + (defined(__GNUC__) && \ + (__GNUC__ > (major) || \ + (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) + + +#if defined(__GNUC__) + +// The GenericCrc tables must be properly aligned. +// Penalty for misalignment? 50% performance degradation. +// For 128-bit SSE2, the penalty is access violation. +#define GCC_ALIGN_ATTRIBUTE(n) __attribute__((aligned(n))) + +#if GCC_VERSION_AVAILABLE(4, 4) +// If not marked as "omit frame pointer", +// GCC won't be able to find enough registers. +#define GCC_OMIT_FRAME_POINTER \ + __attribute__((__optimize__(2, "omit-frame-pointer"))) +#endif // GCC_VERSION_AVAILABLE(4, 4) + +#if !defined(__forceinline) +#define __forceinline __attribute__((__always_inline__)) inline +#endif // !defined(__forceinline) + +#if defined(__APPLE_CC__) +// The version of GCC used by Max OS X xCode v 5664 does not understand +// "movq xmm, r64" instruction and requires the use of "movd" (probably +// because of the bug in GCC which treats "movq/movd xmm,r64 or r64,xmm" +// the same). +// +// Leaving common sense aside, let's peek into Intel's instruction +// reference manual. That's what description of MOVD command says: +// MOVD xmm, r/m32 (opcode 66 0F 6E /r) +// MOVD r/m32, xmm (opcode 66 0F 7E /r) +// MOVQ xmm, r/m64 (opcode 66 REX.W 0F 6E /r) +// MOVQ r/m64, xmm (opcode 66 REX.W 0F 7E /r) +#define SSE2_MOVQ "movd" +#else +#define SSE2_MOVQ "movq" +#endif // defined(__APPLE_CC__) + +#endif // defined(__GNUC__) + + +// Define compiler-specific macros that were not set yet. +#if !defined(_MSC_VER) && !defined(__forceinline) +#define __forceinline inline +#endif // !defined(_MSC_VER) && !defined(__forceinline) + +#if !defined(GCC_OMIT_FRAME_POINTER) +#define GCC_OMIT_FRAME_POINTER +#endif // !defined(GCC_OMIT_FRAME_POINTER) + +#if !defined(GCC_ALIGN_ATTRIBUTE) +#define GCC_ALIGN_ATTRIBUTE(n) +#endif // !defined(GCC_ALIGN_ATTRIBUTE) + + +#endif // CRCUTIL_PLATFORM_H_ diff --git a/contrib/libs/crcutil/protected_crc.h b/contrib/libs/crcutil/protected_crc.h index 762fceda9b..81a7051bc8 100644 --- a/contrib/libs/crcutil/protected_crc.h +++ b/contrib/libs/crcutil/protected_crc.h @@ -1,61 +1,61 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Protects CRC tables with its own CRC. -// CRC tables get corrupted too, and if corruption is -// not caught, data poisoning becomes a reality. - -#ifndef CRCUTIL_PROTECTED_CRC_H_ -#define CRCUTIL_PROTECTED_CRC_H_ - -namespace crcutil { - -#pragma pack(push, 16) - -// Class CrcImplementation should not have virtual functions: -// vptr is stored as the very first field, vptr value is defined -// at runtime, so it is impossible to CRC(*this) once and -// guarantee that this value will not change from run to run. -// -template<typename CrcImplementation> class ProtectedCrc - : public CrcImplementation { - public: - typedef typename CrcImplementation::Crc Crc; - - // Returns check value that the caller should compare - // against pre-computed, trusted constant. - // - // Computing SelfCheckValue() after CRC initialization, - // storing it in memory, and periodically checking against - // stored value may not work: if CRC tables were initialized - // incorrectly and/or had been corrupted during initialization, - // CheckValue() will return garbage. Garbage in, garbage out. - // Consequitive checks will not detect a problem, the application - // will happily produce and save the data with corrupt CRC. - // - // The application should call SelfCheckValue() regularly: - // 1. First and foremost, on every CRC mismatch. - // 2. After CRC'ing the data but before sending it out or writing it. - // 3. Worst case, every Nth CRC'ed byte or every Nth call to CRC. - // - Crc SelfCheckValue() const { - return CrcDefault(this, sizeof(*this), 0); - } -} GCC_ALIGN_ATTRIBUTE(16); - -#pragma pack(pop) - -} // namespace crcutil - -#endif // CRCUTIL_PROTECTED_CRC_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Protects CRC tables with its own CRC. +// CRC tables get corrupted too, and if corruption is +// not caught, data poisoning becomes a reality. + +#ifndef CRCUTIL_PROTECTED_CRC_H_ +#define CRCUTIL_PROTECTED_CRC_H_ + +namespace crcutil { + +#pragma pack(push, 16) + +// Class CrcImplementation should not have virtual functions: +// vptr is stored as the very first field, vptr value is defined +// at runtime, so it is impossible to CRC(*this) once and +// guarantee that this value will not change from run to run. +// +template<typename CrcImplementation> class ProtectedCrc + : public CrcImplementation { + public: + typedef typename CrcImplementation::Crc Crc; + + // Returns check value that the caller should compare + // against pre-computed, trusted constant. + // + // Computing SelfCheckValue() after CRC initialization, + // storing it in memory, and periodically checking against + // stored value may not work: if CRC tables were initialized + // incorrectly and/or had been corrupted during initialization, + // CheckValue() will return garbage. Garbage in, garbage out. + // Consequitive checks will not detect a problem, the application + // will happily produce and save the data with corrupt CRC. + // + // The application should call SelfCheckValue() regularly: + // 1. First and foremost, on every CRC mismatch. + // 2. After CRC'ing the data but before sending it out or writing it. + // 3. Worst case, every Nth CRC'ed byte or every Nth call to CRC. + // + Crc SelfCheckValue() const { + return CrcDefault(this, sizeof(*this), 0); + } +} GCC_ALIGN_ATTRIBUTE(16); + +#pragma pack(pop) + +} // namespace crcutil + +#endif // CRCUTIL_PROTECTED_CRC_H_ diff --git a/contrib/libs/crcutil/rdtsc.h b/contrib/libs/crcutil/rdtsc.h index 57aea1f1fb..ead586f842 100644 --- a/contrib/libs/crcutil/rdtsc.h +++ b/contrib/libs/crcutil/rdtsc.h @@ -1,59 +1,59 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Reads CPU cycle counter on AMD64 and I386 (for performance measurements). -// Thanks to __rdtsc() intrinsic, it's easy with Microsoft and Intel -// compilers, but real pain with GCC. - -#ifndef CRCUTIL_RDTSC_H_ -#define CRCUTIL_RDTSC_H_ - -#include "platform.h" - -namespace crcutil { - -struct Rdtsc { - static inline uint64 Get() { -#if defined(_MSC_VER) && (HAVE_AMD64 || HAVE_I386) - return __rdtsc(); -#elif defined(__GNUC__) && HAVE_AMD64 - int64 result; - __asm__ volatile( - "rdtsc\n" - : "=a" (result)); - return result; -#elif defined(__GNUC__) && HAVE_I386 - // If "low" and "high" are defined as "uint64" to - // avoid explicit cast to uint64, GCC 4.5.0 in "-m32" mode - // fails with "impossible register constraint" error - // (no, it is not because one cannot use 64-bit value as argument - // for 32-bit register, but because its register allocator - // could not resolve a conflict under high register pressure). - uint32 low; - uint32 high; - __asm__ volatile( - "rdtsc\n" - : "=a" (low), "=d" (high)); - return ((static_cast<uint64>(high) << 32) | low); -#else - // It is hard to find low overhead timer with - // sub-millisecond resolution and granularity. - return 0; -#endif - } -}; - -} // namespace crcutil - -#endif // CRCUTIL_RDTSC_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Reads CPU cycle counter on AMD64 and I386 (for performance measurements). +// Thanks to __rdtsc() intrinsic, it's easy with Microsoft and Intel +// compilers, but real pain with GCC. + +#ifndef CRCUTIL_RDTSC_H_ +#define CRCUTIL_RDTSC_H_ + +#include "platform.h" + +namespace crcutil { + +struct Rdtsc { + static inline uint64 Get() { +#if defined(_MSC_VER) && (HAVE_AMD64 || HAVE_I386) + return __rdtsc(); +#elif defined(__GNUC__) && HAVE_AMD64 + int64 result; + __asm__ volatile( + "rdtsc\n" + : "=a" (result)); + return result; +#elif defined(__GNUC__) && HAVE_I386 + // If "low" and "high" are defined as "uint64" to + // avoid explicit cast to uint64, GCC 4.5.0 in "-m32" mode + // fails with "impossible register constraint" error + // (no, it is not because one cannot use 64-bit value as argument + // for 32-bit register, but because its register allocator + // could not resolve a conflict under high register pressure). + uint32 low; + uint32 high; + __asm__ volatile( + "rdtsc\n" + : "=a" (low), "=d" (high)); + return ((static_cast<uint64>(high) << 32) | low); +#else + // It is hard to find low overhead timer with + // sub-millisecond resolution and granularity. + return 0; +#endif + } +}; + +} // namespace crcutil + +#endif // CRCUTIL_RDTSC_H_ diff --git a/contrib/libs/crcutil/rolling_crc.h b/contrib/libs/crcutil/rolling_crc.h index ad4a947e49..0de69d2bd8 100644 --- a/contrib/libs/crcutil/rolling_crc.h +++ b/contrib/libs/crcutil/rolling_crc.h @@ -1,106 +1,106 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements rolling CRC (e.g. for Rabin fingerprinting). - -#ifndef CRCUTIL_ROLLING_CRC_H_ -#define CRCUTIL_ROLLING_CRC_H_ - -#include "base_types.h" // size_t, uint8 -#include "crc_casts.h" // TO_BYTE - -namespace crcutil { - -#pragma pack(push, 16) - -// CrcImplementation should provide: -// - typename Crc -// - typename TableEntry -// - typename Word -// - Crc CrcDefault(const void *data, size_t bytes, const Crc &start) -// - const GfUtil<Crc> &Base() const -template<typename CrcImplementation> class RollingCrc { - public: - typedef typename CrcImplementation::Crc Crc; - typedef typename CrcImplementation::TableEntry TableEntry; - typedef typename CrcImplementation::Word Word; - - RollingCrc() {} - - // Initializes internal data structures. - // Retains reference to "crc" instance -- it is used by Start(). - RollingCrc(const CrcImplementation &crc, - size_t roll_window_bytes, - const Crc &start_value) { - Init(crc, roll_window_bytes, start_value); - } - - // Computes crc of "roll_window_bytes" using - // "start_value" of "crc" (see Init()). - Crc Start(const void *data) const { - return crc_->CrcDefault(data, roll_window_bytes_, start_value_); - } - - // Computes CRC of "roll_window_bytes" starting in next position. - Crc Roll(const Crc &old_crc, size_t byte_out, size_t byte_in) const { - return (old_crc >> 8) ^ in_[TO_BYTE(old_crc) ^ byte_in] ^ out_[byte_out]; - } - - // Initializes internal data structures. - // Retains reference to "crc" instance -- it is used by Start(). - void Init(const CrcImplementation &crc, - size_t roll_window_bytes, - const Crc &start_value) { - crc_ = &crc; - roll_window_bytes_ = roll_window_bytes; - start_value_ = start_value; - - Crc add = crc.Base().Canonize() ^ start_value; - add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes)); - add ^= crc.Base().Canonize(); - Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1); - add = crc.Base().Multiply(add, mul); - - mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree()); - for (size_t i = 0; i < 256; ++i) { - out_[i] = static_cast<TableEntry>( - crc.Base().MultiplyUnnormalized( - static_cast<Crc>(i), 8, mul) ^ add); - } - for (size_t i = 0; i < 256; ++i) { - in_[i] = crc.crc_word_[sizeof(Word) - 1][i]; - } - } - - // Returns start value. - Crc StartValue() const { return start_value_; } - - // Returns length of roll window. - size_t WindowBytes() const { return roll_window_bytes_; } - - protected: - TableEntry in_[256]; - TableEntry out_[256]; - - // Used only by Start(). - Crc start_value_; - const CrcImplementation *crc_; - size_t roll_window_bytes_; -} GCC_ALIGN_ATTRIBUTE(16); - -#pragma pack(pop) - -} // namespace crcutil - -#endif // CRCUTIL_ROLLING_CRC_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements rolling CRC (e.g. for Rabin fingerprinting). + +#ifndef CRCUTIL_ROLLING_CRC_H_ +#define CRCUTIL_ROLLING_CRC_H_ + +#include "base_types.h" // size_t, uint8 +#include "crc_casts.h" // TO_BYTE + +namespace crcutil { + +#pragma pack(push, 16) + +// CrcImplementation should provide: +// - typename Crc +// - typename TableEntry +// - typename Word +// - Crc CrcDefault(const void *data, size_t bytes, const Crc &start) +// - const GfUtil<Crc> &Base() const +template<typename CrcImplementation> class RollingCrc { + public: + typedef typename CrcImplementation::Crc Crc; + typedef typename CrcImplementation::TableEntry TableEntry; + typedef typename CrcImplementation::Word Word; + + RollingCrc() {} + + // Initializes internal data structures. + // Retains reference to "crc" instance -- it is used by Start(). + RollingCrc(const CrcImplementation &crc, + size_t roll_window_bytes, + const Crc &start_value) { + Init(crc, roll_window_bytes, start_value); + } + + // Computes crc of "roll_window_bytes" using + // "start_value" of "crc" (see Init()). + Crc Start(const void *data) const { + return crc_->CrcDefault(data, roll_window_bytes_, start_value_); + } + + // Computes CRC of "roll_window_bytes" starting in next position. + Crc Roll(const Crc &old_crc, size_t byte_out, size_t byte_in) const { + return (old_crc >> 8) ^ in_[TO_BYTE(old_crc) ^ byte_in] ^ out_[byte_out]; + } + + // Initializes internal data structures. + // Retains reference to "crc" instance -- it is used by Start(). + void Init(const CrcImplementation &crc, + size_t roll_window_bytes, + const Crc &start_value) { + crc_ = &crc; + roll_window_bytes_ = roll_window_bytes; + start_value_ = start_value; + + Crc add = crc.Base().Canonize() ^ start_value; + add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes)); + add ^= crc.Base().Canonize(); + Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1); + add = crc.Base().Multiply(add, mul); + + mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree()); + for (size_t i = 0; i < 256; ++i) { + out_[i] = static_cast<TableEntry>( + crc.Base().MultiplyUnnormalized( + static_cast<Crc>(i), 8, mul) ^ add); + } + for (size_t i = 0; i < 256; ++i) { + in_[i] = crc.crc_word_[sizeof(Word) - 1][i]; + } + } + + // Returns start value. + Crc StartValue() const { return start_value_; } + + // Returns length of roll window. + size_t WindowBytes() const { return roll_window_bytes_; } + + protected: + TableEntry in_[256]; + TableEntry out_[256]; + + // Used only by Start(). + Crc start_value_; + const CrcImplementation *crc_; + size_t roll_window_bytes_; +} GCC_ALIGN_ATTRIBUTE(16); + +#pragma pack(pop) + +} // namespace crcutil + +#endif // CRCUTIL_ROLLING_CRC_H_ diff --git a/contrib/libs/crcutil/std_headers.h b/contrib/libs/crcutil/std_headers.h index 0d7e4ecb56..55746fb8be 100644 --- a/contrib/libs/crcutil/std_headers.h +++ b/contrib/libs/crcutil/std_headers.h @@ -1,53 +1,53 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Includes some standard C headers for size_t, memset, etc. -// -// Also, permanently disables a number of warnings produced -// by Microsoft's compiler when it includes standard headers -// (surprisingly, also by Microsoft). - -#ifndef CRCUTIL_STD_HEADERS_H_ -#define CRCUTIL_STD_HEADERS_H_ - -#if defined(_MSC_VER) -// '4' bytes padding added after data member ... -#pragma warning(disable:4820) - -// unreferenced inline function has been removed ... -#pragma warning(disable:4514) - -// conditional expression is constant -#pragma warning(disable: 4127) - -// function ... not inlined -#pragma warning(disable: 4710) - -// function ... selected for automatic inline expansion -#pragma warning(disable: 4711) - +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Includes some standard C headers for size_t, memset, etc. +// +// Also, permanently disables a number of warnings produced +// by Microsoft's compiler when it includes standard headers +// (surprisingly, also by Microsoft). + +#ifndef CRCUTIL_STD_HEADERS_H_ +#define CRCUTIL_STD_HEADERS_H_ + +#if defined(_MSC_VER) +// '4' bytes padding added after data member ... +#pragma warning(disable:4820) + +// unreferenced inline function has been removed ... +#pragma warning(disable:4514) + +// conditional expression is constant +#pragma warning(disable: 4127) + +// function ... not inlined +#pragma warning(disable: 4710) + +// function ... selected for automatic inline expansion +#pragma warning(disable: 4711) + #ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS #endif - -#endif // defined(_MSC_VER) - -// #define _CSTDLIB_ -#include <stdio.h> // always handy -#include <string.h> // memset -#include <stdlib.h> // size_t, _rotl/_rotl64(MSC) -#include <stddef.h> // ptrdiff_t (GNUC) -#include <stdarg.h> // va_list - -#endif // CRCUTIL_STD_HEADERS_H_ + +#endif // defined(_MSC_VER) + +// #define _CSTDLIB_ +#include <stdio.h> // always handy +#include <string.h> // memset +#include <stdlib.h> // size_t, _rotl/_rotl64(MSC) +#include <stddef.h> // ptrdiff_t (GNUC) +#include <stdarg.h> // va_list + +#endif // CRCUTIL_STD_HEADERS_H_ diff --git a/contrib/libs/crcutil/uint128_sse2.h b/contrib/libs/crcutil/uint128_sse2.h index 24b4072658..0fcc7255ed 100644 --- a/contrib/libs/crcutil/uint128_sse2.h +++ b/contrib/libs/crcutil/uint128_sse2.h @@ -1,310 +1,310 @@ -// Copyright 2010 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implements a limited set of 128-bit arithmetic operations -// (the ones that are used by CRC) using SSE2 intrinsics. - -#ifndef CRCUTIL_UINT128_SSE2_H_ -#define CRCUTIL_UINT128_SSE2_H_ - -#include "base_types.h" -#include "crc_casts.h" // Downcast, CrcFromUint64, Uint64FromCrc -#include "platform.h" - -#if HAVE_SSE2 - -namespace crcutil { - -// Specialized functions handling __m128i. -template<> __forceinline uint64 Downcast(const __m128i &value) { -#if HAVE_AMD64 && defined(__GNUC__) - // GCC 4.4.x is too smart and, instead of MOVQ, generates SSE4 PEXTRQ - // instruction when the code is compiled with -mmsse4. - // Fixed in 4.5 which generates conversion through memory (why?). - // And -- yes, it makes quite measurable difference. - uint64 temp; - asm(SSE2_MOVQ " %[i128], %[u64]\n" : [u64] "=r" (temp) : [i128] "x" (value)); - return temp; -#elif HAVE_AMD64 && (!defined(_MSC_FULL_VER) || _MSC_FULL_VER > 150030729) - return static_cast<uint64>(_mm_cvtsi128_si64(value)); -#else - // 64-bit CL 15.00.30729.1 -O2 generates incorrect code (tests fail). - // _mm_cvtsi128_si64() is not available on i386. - uint64 temp; - _mm_storel_epi64(reinterpret_cast<__m128i *>(&temp), value); - return temp; -#endif -} - - -class uint128_sse2 { - public: - uint128_sse2() {} - ~uint128_sse2() {} - - // Default casts to uint128_sse2 and assignment operator. - __forceinline void operator =(uint64 value) { -#if HAVE_AMD64 && defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 5) - // Prevent generation of SSE4 pinsrq insruction when - // compiling with GCC 4.4.x with -msse4 flag. - asm(SSE2_MOVQ " %[u64], %[i128]\n" : [i128] "=x" (x_) : [u64] "r" (value)); -#elif HAVE_AMD64 - x_ = _mm_cvtsi64_si128(static_cast<int64>(value)); -#else - x_ = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&value)); -#endif - } - __forceinline uint128_sse2(uint64 x) { - *this = x; - } - __forceinline uint128_sse2(const __m128i x) : x_(x) { - } - __forceinline operator __m128i() const { - return x_; - } - __forceinline void operator =(const uint128_sse2 &x) { - x_ = x.x_; - } - - // Extracts 64 less significant bits. - __forceinline uint64 to_uint64() const { - return Downcast<__m128i, uint64>(x_); - } - - // Comparisons. - __forceinline bool operator ==(const uint128_sse2 &y) const { - union { - __m128i i128; - uint64 u64[2]; - } t; - t.i128 = _mm_xor_si128(x_, y.x_); - return (t.u64[0] | t.u64[1]) == 0; - } - __forceinline bool operator ==(uint64 value) const { - union { - __m128i i128; - uint64 u64[2]; - } t; - t.i128 = x_; - return (t.u64[0] == value && t.u64[1] == 0); - } - __forceinline bool operator !=(const uint128_sse2 &y) const { - union { - __m128i i128; - uint64 u64[2]; - } t; - t.i128 = _mm_xor_si128(x_, y.x_); - return (t.u64[0] | t.u64[1]) != 0; - } - __forceinline bool operator !=(uint64 value) const { - union { - __m128i i128; - uint64 u64[2]; - } t; - t.i128 = x_; - return (t.u64[0] != value || t.u64[1] != 0); - } - - __forceinline bool operator <(const uint128_sse2 &y) const { - union { - __m128i i128; - uint64 u64[2]; - } xx, yy; - xx.i128 = x_; - yy.i128 = y.x_; - return (xx.u64[0] < yy.u64[0] || - (xx.u64[0] == yy.u64[0] && xx.u64[1] < yy.u64[1])); - } - - // Bitwise logic operators. - __forceinline uint128_sse2 operator ^(const uint128_sse2 &y) const { - return _mm_xor_si128(x_, y.x_); - } - __forceinline uint128_sse2 operator &(const uint128_sse2 &y) const { - return _mm_and_si128(x_, y.x_); - } - __forceinline uint128_sse2 operator |(const uint128_sse2 &y) const { - return _mm_or_si128(x_, y.x_); - } - - __forceinline void operator ^=(const uint128_sse2 &y) { - *this = *this ^ y.x_; - } - __forceinline void operator &=(const uint128_sse2 &y) { - *this = *this & y.x_; - } - __forceinline void operator |=(const uint128_sse2 &y) { - *this = *this | y.x_; - } - - // Arithmetic operators. - __forceinline uint128_sse2 operator +(uint64 y) const { - union { - __m128i i128; - uint64 u64[2]; - } temp; - temp.i128 = x_; - // a + b >= 2**64 iff - // a + b > (2**64 - 1) iff - // a > (2**64 - 1) - b iff - // a > ~b - if (temp.u64[0] > ~y) { - temp.u64[1] += 1; - } - temp.u64[0] += y; - return temp.i128; - } - __forceinline void operator +=(uint64 x) { - *this = *this + x; - } - __forceinline uint128_sse2 operator -(uint64 y) const { - union { - __m128i i128; - uint64 u64[2]; - } temp; - temp.i128 = x_; - if (temp.u64[0] < y) { - temp.u64[1] -= 1; - } - temp.u64[0] -= y; - return temp.i128; - } - __forceinline void operator -=(uint64 x) { - *this = *this - x; - } - - // Bitwise logical shifts. - __forceinline uint128_sse2 operator >>(const int bits) const { - if (bits == 8) { - return _mm_srli_si128(x_, 1); - } else if (bits == 16) { - return _mm_srli_si128(x_, 2); - } else if (bits == 32) { - return _mm_srli_si128(x_, 4); - } else if (bits == 64) { - return _mm_srli_si128(x_, 8); - } else { - return long_shift_right(bits); - } - } - __forceinline uint128_sse2 operator >>(const size_t bits) const { - return *this >> static_cast<int>(bits); - } - __forceinline void operator >>=(const int bits) { - *this = *this >> bits; - } - __forceinline void operator >>=(const size_t bits) { - *this = *this >> static_cast<int>(bits); - } - - __forceinline uint128_sse2 operator <<(int bits) const { - if (bits == 8) { - return _mm_slli_si128(x_, 1); - } else if (bits == 16) { - return _mm_slli_si128(x_, 2); - } else if (bits == 32) { - return _mm_slli_si128(x_, 4); - } else if (bits == 64) { - return _mm_slli_si128(x_, 8); - } else { - return long_shift_left(bits); - } - } - __forceinline uint128_sse2 operator <<(size_t bits) const { - return *this << static_cast<int>(bits); - } - __forceinline void operator <<=(int bits) { - *this = *this << bits; - } - __forceinline void operator <<=(size_t bits) { - *this = *this << static_cast<int>(bits); - } - - protected: - __forceinline uint128_sse2 long_shift_right(int bits) const { - union { - __m128i i128; - uint64 u64[2]; - } x; - x.i128 = x_; - for (; bits > 0; --bits) { - x.u64[0] >>= 1; - if (x.u64[1] & 1) { - x.u64[0] |= static_cast<uint64>(1) << 63; - } - x.u64[1] >>= 1; - } - return x.i128; - } - - __forceinline uint128_sse2 long_shift_left(int bits) const { - union { - __m128i i128; - int64 i64[2]; - } x; - x.i128 = x_; - for (; bits > 0; --bits) { - x.i64[1] <<= 1; - if (x.i64[0] < 0) { - x.i64[1] |= 1; - } - x.i64[0] <<= 1; - } - return x.i128; - } - - __m128i x_; -} GCC_ALIGN_ATTRIBUTE(16); - - -// Specialized versions. -template<> __forceinline uint64 Downcast(const uint128_sse2 &x) { - return x.to_uint64(); -} -template<> __forceinline uint32 Downcast(const uint128_sse2 &x) { - return static_cast<uint32>(x.to_uint64()); -} -template<> __forceinline uint16 Downcast(const uint128_sse2 &x) { - return static_cast<uint16>(x.to_uint64()); -} -template<> __forceinline uint8 Downcast(const uint128_sse2 &x) { - return static_cast<uint8>(x.to_uint64()); -} - -template<> __forceinline uint128_sse2 CrcFromUint64(uint64 lo, uint64 hi) { - union { - __m128i i128; - uint64 u64[2]; - } temp; - temp.u64[0] = lo; - temp.u64[1] = hi; - return temp.i128; -} - -template<> __forceinline void Uint64FromCrc(const uint128_sse2 &crc, - uint64 *lo, uint64 *hi) { - union { - __m128i i128; - uint64 u64[2]; - } temp; - temp.i128 = crc; - *lo = temp.u64[0]; - *hi = temp.u64[1]; -} - -} // namespace crcutil - -#endif // HAVE_SSE2 - -#endif // CRCUTIL_UINT128_SSE2_H_ +// Copyright 2010 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements a limited set of 128-bit arithmetic operations +// (the ones that are used by CRC) using SSE2 intrinsics. + +#ifndef CRCUTIL_UINT128_SSE2_H_ +#define CRCUTIL_UINT128_SSE2_H_ + +#include "base_types.h" +#include "crc_casts.h" // Downcast, CrcFromUint64, Uint64FromCrc +#include "platform.h" + +#if HAVE_SSE2 + +namespace crcutil { + +// Specialized functions handling __m128i. +template<> __forceinline uint64 Downcast(const __m128i &value) { +#if HAVE_AMD64 && defined(__GNUC__) + // GCC 4.4.x is too smart and, instead of MOVQ, generates SSE4 PEXTRQ + // instruction when the code is compiled with -mmsse4. + // Fixed in 4.5 which generates conversion through memory (why?). + // And -- yes, it makes quite measurable difference. + uint64 temp; + asm(SSE2_MOVQ " %[i128], %[u64]\n" : [u64] "=r" (temp) : [i128] "x" (value)); + return temp; +#elif HAVE_AMD64 && (!defined(_MSC_FULL_VER) || _MSC_FULL_VER > 150030729) + return static_cast<uint64>(_mm_cvtsi128_si64(value)); +#else + // 64-bit CL 15.00.30729.1 -O2 generates incorrect code (tests fail). + // _mm_cvtsi128_si64() is not available on i386. + uint64 temp; + _mm_storel_epi64(reinterpret_cast<__m128i *>(&temp), value); + return temp; +#endif +} + + +class uint128_sse2 { + public: + uint128_sse2() {} + ~uint128_sse2() {} + + // Default casts to uint128_sse2 and assignment operator. + __forceinline void operator =(uint64 value) { +#if HAVE_AMD64 && defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 5) + // Prevent generation of SSE4 pinsrq insruction when + // compiling with GCC 4.4.x with -msse4 flag. + asm(SSE2_MOVQ " %[u64], %[i128]\n" : [i128] "=x" (x_) : [u64] "r" (value)); +#elif HAVE_AMD64 + x_ = _mm_cvtsi64_si128(static_cast<int64>(value)); +#else + x_ = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&value)); +#endif + } + __forceinline uint128_sse2(uint64 x) { + *this = x; + } + __forceinline uint128_sse2(const __m128i x) : x_(x) { + } + __forceinline operator __m128i() const { + return x_; + } + __forceinline void operator =(const uint128_sse2 &x) { + x_ = x.x_; + } + + // Extracts 64 less significant bits. + __forceinline uint64 to_uint64() const { + return Downcast<__m128i, uint64>(x_); + } + + // Comparisons. + __forceinline bool operator ==(const uint128_sse2 &y) const { + union { + __m128i i128; + uint64 u64[2]; + } t; + t.i128 = _mm_xor_si128(x_, y.x_); + return (t.u64[0] | t.u64[1]) == 0; + } + __forceinline bool operator ==(uint64 value) const { + union { + __m128i i128; + uint64 u64[2]; + } t; + t.i128 = x_; + return (t.u64[0] == value && t.u64[1] == 0); + } + __forceinline bool operator !=(const uint128_sse2 &y) const { + union { + __m128i i128; + uint64 u64[2]; + } t; + t.i128 = _mm_xor_si128(x_, y.x_); + return (t.u64[0] | t.u64[1]) != 0; + } + __forceinline bool operator !=(uint64 value) const { + union { + __m128i i128; + uint64 u64[2]; + } t; + t.i128 = x_; + return (t.u64[0] != value || t.u64[1] != 0); + } + + __forceinline bool operator <(const uint128_sse2 &y) const { + union { + __m128i i128; + uint64 u64[2]; + } xx, yy; + xx.i128 = x_; + yy.i128 = y.x_; + return (xx.u64[0] < yy.u64[0] || + (xx.u64[0] == yy.u64[0] && xx.u64[1] < yy.u64[1])); + } + + // Bitwise logic operators. + __forceinline uint128_sse2 operator ^(const uint128_sse2 &y) const { + return _mm_xor_si128(x_, y.x_); + } + __forceinline uint128_sse2 operator &(const uint128_sse2 &y) const { + return _mm_and_si128(x_, y.x_); + } + __forceinline uint128_sse2 operator |(const uint128_sse2 &y) const { + return _mm_or_si128(x_, y.x_); + } + + __forceinline void operator ^=(const uint128_sse2 &y) { + *this = *this ^ y.x_; + } + __forceinline void operator &=(const uint128_sse2 &y) { + *this = *this & y.x_; + } + __forceinline void operator |=(const uint128_sse2 &y) { + *this = *this | y.x_; + } + + // Arithmetic operators. + __forceinline uint128_sse2 operator +(uint64 y) const { + union { + __m128i i128; + uint64 u64[2]; + } temp; + temp.i128 = x_; + // a + b >= 2**64 iff + // a + b > (2**64 - 1) iff + // a > (2**64 - 1) - b iff + // a > ~b + if (temp.u64[0] > ~y) { + temp.u64[1] += 1; + } + temp.u64[0] += y; + return temp.i128; + } + __forceinline void operator +=(uint64 x) { + *this = *this + x; + } + __forceinline uint128_sse2 operator -(uint64 y) const { + union { + __m128i i128; + uint64 u64[2]; + } temp; + temp.i128 = x_; + if (temp.u64[0] < y) { + temp.u64[1] -= 1; + } + temp.u64[0] -= y; + return temp.i128; + } + __forceinline void operator -=(uint64 x) { + *this = *this - x; + } + + // Bitwise logical shifts. + __forceinline uint128_sse2 operator >>(const int bits) const { + if (bits == 8) { + return _mm_srli_si128(x_, 1); + } else if (bits == 16) { + return _mm_srli_si128(x_, 2); + } else if (bits == 32) { + return _mm_srli_si128(x_, 4); + } else if (bits == 64) { + return _mm_srli_si128(x_, 8); + } else { + return long_shift_right(bits); + } + } + __forceinline uint128_sse2 operator >>(const size_t bits) const { + return *this >> static_cast<int>(bits); + } + __forceinline void operator >>=(const int bits) { + *this = *this >> bits; + } + __forceinline void operator >>=(const size_t bits) { + *this = *this >> static_cast<int>(bits); + } + + __forceinline uint128_sse2 operator <<(int bits) const { + if (bits == 8) { + return _mm_slli_si128(x_, 1); + } else if (bits == 16) { + return _mm_slli_si128(x_, 2); + } else if (bits == 32) { + return _mm_slli_si128(x_, 4); + } else if (bits == 64) { + return _mm_slli_si128(x_, 8); + } else { + return long_shift_left(bits); + } + } + __forceinline uint128_sse2 operator <<(size_t bits) const { + return *this << static_cast<int>(bits); + } + __forceinline void operator <<=(int bits) { + *this = *this << bits; + } + __forceinline void operator <<=(size_t bits) { + *this = *this << static_cast<int>(bits); + } + + protected: + __forceinline uint128_sse2 long_shift_right(int bits) const { + union { + __m128i i128; + uint64 u64[2]; + } x; + x.i128 = x_; + for (; bits > 0; --bits) { + x.u64[0] >>= 1; + if (x.u64[1] & 1) { + x.u64[0] |= static_cast<uint64>(1) << 63; + } + x.u64[1] >>= 1; + } + return x.i128; + } + + __forceinline uint128_sse2 long_shift_left(int bits) const { + union { + __m128i i128; + int64 i64[2]; + } x; + x.i128 = x_; + for (; bits > 0; --bits) { + x.i64[1] <<= 1; + if (x.i64[0] < 0) { + x.i64[1] |= 1; + } + x.i64[0] <<= 1; + } + return x.i128; + } + + __m128i x_; +} GCC_ALIGN_ATTRIBUTE(16); + + +// Specialized versions. +template<> __forceinline uint64 Downcast(const uint128_sse2 &x) { + return x.to_uint64(); +} +template<> __forceinline uint32 Downcast(const uint128_sse2 &x) { + return static_cast<uint32>(x.to_uint64()); +} +template<> __forceinline uint16 Downcast(const uint128_sse2 &x) { + return static_cast<uint16>(x.to_uint64()); +} +template<> __forceinline uint8 Downcast(const uint128_sse2 &x) { + return static_cast<uint8>(x.to_uint64()); +} + +template<> __forceinline uint128_sse2 CrcFromUint64(uint64 lo, uint64 hi) { + union { + __m128i i128; + uint64 u64[2]; + } temp; + temp.u64[0] = lo; + temp.u64[1] = hi; + return temp.i128; +} + +template<> __forceinline void Uint64FromCrc(const uint128_sse2 &crc, + uint64 *lo, uint64 *hi) { + union { + __m128i i128; + uint64 u64[2]; + } temp; + temp.i128 = crc; + *lo = temp.u64[0]; + *hi = temp.u64[1]; +} + +} // namespace crcutil + +#endif // HAVE_SSE2 + +#endif // CRCUTIL_UINT128_SSE2_H_ diff --git a/contrib/libs/crcutil/ya.make b/contrib/libs/crcutil/ya.make index 2da8ef940f..90340a33be 100644 --- a/contrib/libs/crcutil/ya.make +++ b/contrib/libs/crcutil/ya.make @@ -1,5 +1,5 @@ -LIBRARY() - +LIBRARY() + LICENSE(Apache-2.0) VERSION(1.0) @@ -13,10 +13,10 @@ OWNER( NO_UTIL() -NO_COMPILER_WARNINGS() +NO_COMPILER_WARNINGS() NO_JOIN_SRC() - + IF (GCC AND USE_LTO) CFLAGS(-DCRCUTIL_FORCE_ASM_CRC32C=1) ENDIF() @@ -61,10 +61,10 @@ IF (ARCH_I386 OR ARCH_X86_64) SRC_CPP_SSE4(crc32c_sse4.cc) ENDIF() ENDIF() - -SRCS( - interface.cc - multiword_64_64_intrinsic_i386_mmx.cc -) - -END() + +SRCS( + interface.cc + multiword_64_64_intrinsic_i386_mmx.cc +) + +END() |