Restoring authorship annotation for <f0b0s@yandex-team.ru>. Commit 2 of 2.

author: f0b0s <f0b0s@yandex-team.ru> 2022-02-10 16:46:51 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:46:51 +0300
commit: cdae02d225fb5b3afbb28990e79a7ac6c9125327 (patch)
tree: 49e222ea1c5804306084bb3ae065bb702625360f /contrib/libs/crcutil/generic_crc.h
parent: deabc5260ac2e17b8f5152ee060bec1740613540 (diff)
download: ydb-cdae02d225fb5b3afbb28990e79a7ac6c9125327.tar.gz
1 files changed, 687 insertions, 687 deletions
diff --git a/contrib/libs/crcutil/generic_crc.h b/contrib/libs/crcutil/generic_crc.h
index 4832005776..06af21c925 100644
--- a/contrib/libs/crcutil/generic_crc.h
+++ b/contrib/libs/crcutil/generic_crc.h
@@ -1,687 +1,687 @@
-// Copyright 2010 Google Inc.  All rights reserved. 
-// 
-// Licensed under the Apache License, Version 2.0 (the "License"); 
-// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at 
-// 
-//      http://www.apache.org/licenses/LICENSE-2.0 
-// 
-// Unless required by applicable law or agreed to in writing, software 
-// distributed under the License is distributed on an "AS IS" BASIS, 
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and 
-// limitations under the License. 
- 
-// Defines GenericCrc class which implements arbitrary CRCs. 
-// 
-// Please read crc.pdf to understand how it all works. 
- 
-#ifndef CRCUTIL_GENERIC_CRC_H_ 
-#define CRCUTIL_GENERIC_CRC_H_ 
- 
-#include "base_types.h"     // uint8 
-#include "crc_casts.h"      // TO_BYTE(), Downcast<>. 
-#include "gf_util.h"        // GfUtil<Crc> class. 
-#include "platform.h"       // GCC_ALIGN_ATTRIBUTE(16) 
-#include "uint128_sse2.h"   // uint128_sse2 type (if necessary) 
- 
-namespace crcutil { 
- 
-#pragma pack(push, 16) 
- 
-// Extends CRC by one byte. 
-// Technically, if degree of a polynomial does not exceed 8, 
-// right shift by 8 bits is not required, but who cares about CRC-8? 
-#define CRC_BYTE(table, crc, byte) do { \ 
-  crc = ((sizeof(crc) > 1) ? SHIFT_RIGHT_SAFE(crc, 8) : 0) ^ \ 
-        table->crc_word_[sizeof(Word) - 1][TO_BYTE(crc) ^ (byte)]; \ 
-} while (0) 
- 
-#define TABLE_ENTRY(table, byte, buf) \ 
-  table[byte][Downcast<Word, uint8>(buf)] 
- 
-#define TABLE_ENTRY_LAST(table, buf) \ 
-  table[sizeof(Word) - 1][buf] 
- 
-// Extends CRC by one word. 
-#define CRC_WORD(table, crc, buf) do { \ 
-  buf ^= Downcast<Crc, Word>(crc); \ 
-  if (sizeof(crc) > sizeof(buf)) { \ 
-    crc = SHIFT_RIGHT_SAFE(crc, sizeof(buf) * 8); \ 
-    crc ^= TABLE_ENTRY(table->crc_word_, 0, buf); \ 
-  } else { \ 
-    crc = TABLE_ENTRY(table->crc_word_, 0, buf); \ 
-  } \ 
-  buf >>= 8; \ 
-  for (size_t byte = 1; byte < sizeof(buf) - 1; ++byte) { \ 
-    crc ^= TABLE_ENTRY(table->crc_word_, byte, buf); \ 
-    buf >>= 8; \ 
-  } \ 
-  crc ^= TABLE_ENTRY_LAST(table->crc_word_, buf); \ 
-} while (0) 
- 
-// Process beginning of data block byte by byte until source pointer 
-// becomes perfectly aligned on Word boundary. 
-#define ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word) do { \ 
-  while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { \ 
-    if (src >= end) { \ 
-      return (crc ^ table->Base().Canonize()); \ 
-    } \ 
-    CRC_BYTE(table, crc, *src); \ 
-    src += 1; \ 
-  } \ 
-} while (0) 
- 
- 
-// On amd64, enforcing alignment is 2-4% slower on small (<= 64 bytes) blocks 
-// but 6-10% faster on larger blocks (>= 2KB). 
-// Break-even point (+-1%) is around 1KB (Q9650, E6600). 
-// 
-#define ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, table, src, end, crc, Word) \ 
-do { \ 
-  if (sizeof(Word) > 8 || (bytes) > CRCUTIL_MIN_ALIGN_SIZE) { \ 
-    ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word); \ 
-  } \ 
-} while (0) 
- 
-#if defined(_MSC_VER) 
-#pragma warning(push) 
-#pragma warning(disable: 4127)  // conditional expression is constant 
-#endif  // defined(_MSC_VER) 
- 
-// Forward declarations. 
-template<typename CrcImplementation> class RollingCrc; 
- 
-// Crc        is the type used internally and to return values of N-bit CRC. 
-//            It should be at least as large as "TableEntry" and "Word" but 
-//            may be larger (e.g. for 16-bit CRC, TableEntry and Word may be 
-//            set to uint16 but Crc may be set to uint32). 
-// 
-// TableEntry is the type of values stored in the tables. 
-//            To implement N-bit CRC, TableEntry should be large enough 
-//            to store N bits. 
-// 
-// Word       is the type used to read data sizeof(Word) at a time. 
-//            Ideally, it shoulde be "most suitable for given architecture" 
-//            integer type -- typically "size_t". 
-// 
-// kStride    is the number of words processed in interleaved manner by 
-//            CrcMultiword() and CrcWordblock(). Shall be either 3 or 4. 
-//            Optimal value depends on hardware architecture (AMD64, ARM, etc). 
-// 
-template<typename _Crc, typename _TableEntry, typename _Word, int kStride> 
-    class GenericCrc { 
- public: 
-  // Make Crc, TableEntry, and Word types visible (used by RollingCrc etc.) 
-  typedef _Crc Crc; 
-  typedef _TableEntry TableEntry; 
-  typedef _Word Word; 
- 
-  GenericCrc() {} 
- 
-  // Initializes the tables given generating polynomial of degree. 
-  // If "canonical" is true, crc value will be XOR'ed with (-1) before and 
-  // after actual CRC computation. 
-  GenericCrc(const Crc &generating_polynomial, size_t degree, bool canonical) { 
-    Init(generating_polynomial, degree, canonical); 
-  } 
-  void Init(const Crc &generating_polynomial, size_t degree, bool canonical) { 
-    base_.Init(generating_polynomial, degree, canonical); 
- 
-    // Instead of computing 
-    //    table[j][i] = MultiplyUnnormalized(i, 8, k), 
-    // for all i = 0...255, we may notice that 
-    // if i = 2**n then for all m = 1...(i-1) 
-    // MultiplyUnnormalized(i + m, 8, k) = 
-    //    MultiplyUnnormalized(i ^ m, 8, k) = 
-    //    MultiplyUnnormalized(i, 8, k) ^ MultiplyUnnormalized(m, 8, k) = 
-    //    MultiplyUnnormalized(i, 8, k) ^ crc_word_interleaved[j][m] = 
-    //    table[i] ^ table[m]. 
-#if 0 
-    for (size_t j = 0; j < sizeof(Word); ++j) { 
-      Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree); 
-      for (size_t i = 0; i < 256; ++i) { 
-        Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k); 
-        this->crc_word_interleaved_[j][i] = Downcast<Crc, TableEntry>(temp); 
-      } 
-    } 
-#else 
-    for (size_t j = 0; j < sizeof(Word); ++j) { 
-      Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree); 
-      TableEntry *table = this->crc_word_interleaved_[j]; 
-      table[0] = 0;  // Init 0s entry -- multiply 0 by anything yields 0. 
-      for (size_t i = 1; i < 256; i <<= 1) { 
-        TableEntry value = Downcast<Crc, TableEntry>( 
-            Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k)); 
-        table[i] = value; 
-        for (size_t m = 1; m < i; ++m) { 
-          table[i + m] = value ^ table[m]; 
-        } 
-      } 
-    } 
-#endif 
- 
-#if 0 
-    for (size_t j = 0; j < sizeof(Word); ++j) { 
-      Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree); 
-      for (size_t i = 0; i < 256; ++i) { 
-        Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k); 
-        this->crc_word_[j][i] = Downcast<Crc, TableEntry>(temp); 
-      } 
-    } 
-#else 
-    for (size_t j = 0; j < sizeof(Word); ++j) { 
-      Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree); 
-      TableEntry *table = this->crc_word_[j]; 
-      table[0] = 0;  // Init 0s entry -- multiply 0 by anything yields 0. 
-      for (size_t i = 1; i < 256; i <<= 1) { 
-        TableEntry value = Downcast<Crc, TableEntry>( 
-            Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k)); 
-        table[i] = value; 
-        for (size_t m = 1; m < i; ++m) { 
-          table[i + m] = value ^ table[m]; 
-        } 
-      } 
-    } 
-#endif 
-  } 
- 
-  // Default CRC implementation 
-  Crc CrcDefault(const void *data, size_t bytes, const Crc &start) const { 
-#if HAVE_AMD64 || HAVE_I386 
-    return CrcMultiword(data, bytes, start); 
-#else 
-    // Very few CPUs have multiple ALUs and speculative execution 
-    // (Itanium is an exception) so sophisticated algorithms will 
-    // not perform better than good old Sarwate algorithm. 
-    return CrcByteUnrolled(data, bytes, start); 
-#endif  // HAVE_AMD64 || HAVE_I386 
-  } 
- 
-  // Returns base class. 
-  const GfUtil<Crc> &Base() const { return base_; } 
- 
- protected: 
-  // Canonical, byte-by-byte CRC computation. 
-  Crc CrcByte(const void *data, size_t bytes, const Crc &start) const { 
-    const uint8 *src = static_cast<const uint8 *>(data); 
-    Crc crc = start ^ Base().Canonize(); 
-    for (const uint8 *end = src + bytes; src < end; ++src) { 
-      CRC_BYTE(this, crc, *src); 
-    } 
-    return (crc ^ Base().Canonize()); 
-  } 
- 
-  // Byte-by-byte CRC with main loop unrolled. 
-  Crc CrcByteUnrolled(const void *data, size_t bytes, const Crc &start) const { 
-    if (bytes == 0) { 
-      return start; 
-    } 
- 
-    const uint8 *src = static_cast<const uint8 *>(data); 
-    const uint8 *end = src + bytes; 
-    Crc crc = start ^ Base().Canonize(); 
- 
-    // Unroll loop 4 times. 
-    end -= 3; 
-    for (; src < end; src += 4) { 
-      PREFETCH(src); 
-      CRC_BYTE(this, crc, src[0]); 
-      CRC_BYTE(this, crc, src[1]); 
-      CRC_BYTE(this, crc, src[2]); 
-      CRC_BYTE(this, crc, src[3]); 
-    } 
-    end += 3; 
- 
-    // Compute CRC of remaining bytes. 
-    for (; src < end; ++src) { 
-      CRC_BYTE(this, crc, *src); 
-    } 
- 
-    return (crc ^ Base().Canonize()); 
-  } 
- 
-  // Canonical, byte-by-byte CRC computation. 
-  Crc CrcByteWord(const void *data, size_t bytes, const Crc &start) const { 
-    const uint8 *src = static_cast<const uint8 *>(data); 
-    const uint8 *end = src + bytes; 
-    Crc crc0 = start ^ Base().Canonize(); 
- 
-    ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Crc); 
-    if (src >= end) { 
-      return (crc0 ^ Base().Canonize()); 
-    } 
- 
-    // Process 4*sizeof(Crc) bytes at a time. 
-    end -= 4 * sizeof(Crc) - 1; 
-    for (; src < end; src += 4 * sizeof(Crc)) { 
-      for (size_t i = 0; i < 4; ++i) { 
-        crc0 ^= reinterpret_cast<const Crc *>(src)[i]; 
-        if (i == 0) { 
-          PREFETCH(src); 
-        } 
-        for (size_t byte = 0; byte < sizeof(crc0); ++byte) { 
-          CRC_BYTE(this, crc0, 0); 
-        } 
-      } 
-    } 
-    end += 4 * sizeof(Crc) - 1; 
- 
-    // Process sizeof(Crc) bytes at a time. 
-    end -= sizeof(Crc) - 1; 
-    for (; src < end; src += sizeof(Crc)) { 
-      crc0 ^= reinterpret_cast<const Crc *>(src)[0]; 
-      for (size_t byte = 0; byte < sizeof(crc0); ++byte) { 
-        CRC_BYTE(this, crc0, 0); 
-      } 
-    } 
-    end += sizeof(Crc) - 1; 
- 
-    // Compute CRC of remaining bytes. 
-    for (;src < end; ++src) { 
-      CRC_BYTE(this, crc0, *src); 
-    } 
- 
-    return (crc0 ^ Base().Canonize()); 
-  } 
- 
-  // Faster, word-by-word CRC. 
-  Crc CrcWord(const void *data, size_t bytes, const Crc &start) const { 
-    const uint8 *src = static_cast<const uint8 *>(data); 
-    const uint8 *end = src + bytes; 
-    Crc crc0 = start ^ Base().Canonize(); 
- 
-    ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); 
-    if (src >= end) { 
-      return (crc0 ^ Base().Canonize()); 
-    } 
- 
-    // Process 4 sizeof(Word) bytes at once. 
-    end -= 4 * sizeof(Word) - 1; 
-    for (; src < end; src += 4 * sizeof(Word)) { 
-      Word buf0 = reinterpret_cast<const Word *>(src)[0]; 
-      PREFETCH(src); 
-      CRC_WORD(this, crc0, buf0); 
-      buf0 = reinterpret_cast<const Word *>(src)[1]; 
-      CRC_WORD(this, crc0, buf0); 
-      buf0 = reinterpret_cast<const Word *>(src)[2]; 
-      CRC_WORD(this, crc0, buf0); 
-      buf0 = reinterpret_cast<const Word *>(src)[3]; 
-      CRC_WORD(this, crc0, buf0); 
-    } 
-    end += 4 * sizeof(Word) - 1; 
- 
-    // Process sizeof(Word) bytes at a time. 
-    end -= sizeof(Word) - 1; 
-    for (; src < end; src += sizeof(Word)) { 
-      Word buf0 = reinterpret_cast<const Word *>(src)[0]; 
-      CRC_WORD(this, crc0, buf0); 
-    } 
-    end += sizeof(Word) - 1; 
- 
-    // Compute CRC of remaining bytes. 
-    for (;src < end; ++src) { 
-      CRC_BYTE(this, crc0, *src); 
-    } 
- 
-    return (crc0 ^ Base().Canonize()); 
-  } 
- 
-#define REPEAT_FROM_1(macro) \ 
-  macro(1); \ 
-  macro(2); \ 
-  macro(3); \ 
-  macro(4); \ 
-  macro(5); \ 
-  macro(6); \ 
-  macro(7); 
- 
-#define REPEAT_FROM_0(macro) \ 
-  macro(0); \ 
-  REPEAT_FROM_1(macro) 
- 
-  // Faster, process adjusent blocks in parallel and concatenate CRCs. 
-  Crc CrcBlockword(const void *data, size_t bytes, const Crc &start) const { 
-    if (kStride < 2 || kStride > 8) { 
-      // Unsupported configuration; 
-      // fall back to something sensible. 
-      return CrcWord(data, bytes, start); 
-    } 
- 
-    const uint8 *src = static_cast<const uint8 *>(data); 
-    const uint8 *end = src + bytes; 
-    Crc crc0 = start ^ Base().Canonize(); 
-    enum { 
-      // Add 16 to avoid false L1 cache collisions. 
-      kStripe = (15*1024 + 16) & ~(sizeof(Word) - 1), 
-    }; 
- 
-    ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); 
-    if (src >= end) { 
-      return (crc0 ^ Base().Canonize()); 
-    } 
- 
-    end -= kStride * kStripe - 1; 
-    if (src < end) { 
-      Crc x_pow_8kStripe = Base().Xpow8N(kStripe); 
-      do { 
-        const uint8 *stripe_end = src + kStripe; 
- 
-#define INIT_CRC(reg) \ 
-        Crc crc##reg; \ 
-        if (kStride >= reg) { \ 
-          crc##reg = 0; \ 
-        } 
-        REPEAT_FROM_1(INIT_CRC); 
-#undef INIT_CRC 
- 
-        do { 
-#define FIRST(reg) \ 
-          Word buf##reg; \ 
-          if (kStride > reg) { \ 
-            buf##reg = reinterpret_cast<const Word *>(src + reg * kStripe)[0]; \ 
-            buf##reg ^= Downcast<Crc, Word>(crc##reg); \ 
-            if (sizeof(crc##reg) > sizeof(buf##reg)) { \ 
-              crc##reg = SHIFT_RIGHT_SAFE(crc##reg, sizeof(buf##reg) * 8); \ 
-              crc##reg ^= TABLE_ENTRY(this->crc_word_, 0, buf##reg); \ 
-            } else { \ 
-              crc##reg = TABLE_ENTRY(this->crc_word_, 0, buf##reg); \ 
-            } \ 
-            buf##reg >>= 8; \ 
-          } 
-          REPEAT_FROM_0(FIRST); 
-#undef FIRST 
- 
-          for (size_t byte = 1; byte < sizeof(buf0) - 1; ++byte) { 
-#define NEXT(reg) do { \ 
-            if (kStride > reg) { \ 
-              crc##reg ^= TABLE_ENTRY(this->crc_word_, byte, buf##reg); \ 
-              buf##reg >>= 8; \ 
-            } \ 
-} while (0) 
-            REPEAT_FROM_0(NEXT); 
-#undef NEXT 
-          } 
- 
-#define LAST(reg) do { \ 
-          if (kStride > reg) { \ 
-            crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_, buf##reg); \ 
-          } \ 
-} while (0) 
-          REPEAT_FROM_0(LAST); 
-#undef LAST 
- 
-          src += sizeof(Word); 
-        } while (src < stripe_end); 
- 
-#if 0 
-// The code is left for illustrational purposes only. 
-#define COMBINE(reg) do { \ 
-        if (reg > 0 && kStride > reg) { \ 
-          crc0 = Base().ChangeStartValue(crc##reg, kStripe, 0, crc0); \ 
-        } \ 
-} while (0) 
-#else 
-#define COMBINE(reg) do { \ 
-        if (reg > 0 && kStride > reg) { \ 
-          crc0 = crc##reg ^ Base().Multiply(crc0, x_pow_8kStripe); \ 
-        } \ 
-} while (0) 
-#endif 
-        REPEAT_FROM_0(COMBINE); 
-#undef COMBINE 
- 
-        src += (kStride - 1) * kStripe; 
-      } 
-      while (src < end); 
-    } 
-    end += kStride * kStripe - 1; 
- 
-    // Process sizeof(Word) bytes at a time. 
-    end -= sizeof(Word) - 1; 
-    for (; src < end; src += sizeof(Word)) { 
-      Word buf0 = reinterpret_cast<const Word *>(src)[0]; 
-      CRC_WORD(this, crc0, buf0); 
-    } 
-    end += sizeof(Word) - 1; 
- 
-    // Compute CRC of remaining bytes. 
-    for (;src < end; ++src) { 
-      CRC_BYTE(this, crc0, *src); 
-    } 
- 
-    return (crc0 ^ Base().Canonize()); 
-  } 
- 
-  // Fastest, interleaved multi-byte CRC. 
-  Crc CrcMultiword(const void *data, size_t bytes, const Crc &start) const { 
-    if (kStride < 2 || kStride > 8) { 
-      // Unsupported configuration; 
-      // fall back to something sensible. 
-      return CrcWord(data, bytes, start); 
-    } 
- 
-    const uint8 *src = static_cast<const uint8 *>(data); 
-    const uint8 *end = src + bytes; 
-    Crc crc0 = start ^ Base().Canonize(); 
- 
-    ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); 
-    if (src >= end) { 
-      return (crc0 ^ Base().Canonize()); 
-    } 
- 
-    // Process kStride Word registers at once; 
-    // should have have at least 2*kInterleaveBytes of data to start. 
-    end -= 2*kInterleaveBytes - 1; 
-    if (src < end) { 
-      Crc crc_carryover; 
-      if (sizeof(Crc) > sizeof(Word)) { 
-        // crc_carryover is used if and only if Crc is wider than Word. 
-        crc_carryover = 0; 
-      } 
-#define INIT_CRC(reg) \ 
-      Crc crc##reg; \ 
-      if (reg > 0 && kStride > reg) { \ 
-        crc##reg = 0; \ 
-      } 
-      REPEAT_FROM_1(INIT_CRC); 
-#undef INIT_CRC 
- 
-#define INIT_BUF(reg) \ 
-      Word buf##reg; \ 
-      if (kStride > reg) { \ 
-        buf##reg = reinterpret_cast<const Word *>(src)[reg]; \ 
-      } 
-      REPEAT_FROM_0(INIT_BUF); 
-#undef INIT_BUF 
- 
-      do { 
-        PREFETCH(src); 
-        src += kInterleaveBytes; 
- 
-        if (sizeof(Crc) > sizeof(Word)) { 
-          crc0 ^= crc_carryover; 
-        } 
- 
-#define FIRST(reg, next_reg) do { \ 
-        if (kStride > reg) { \ 
-          buf##reg ^= Downcast<Crc, Word>(crc##reg); \ 
-          if (sizeof(Crc) > sizeof(Word)) { \ 
-            if (reg < kStride - 1) { \ 
-              crc##next_reg ^= SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \ 
-            } else { \ 
-              crc_carryover = SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \ 
-            } \ 
-          } \ 
-          crc##reg = TABLE_ENTRY(this->crc_word_interleaved_, 0, buf##reg); \ 
-          buf##reg >>= 8; \ 
-        } \ 
-} while (0) 
-        FIRST(0, 1); 
-        FIRST(1, 2); 
-        FIRST(2, 3); 
-        FIRST(3, 4); 
-        FIRST(4, 5); 
-        FIRST(5, 6); 
-        FIRST(6, 7); 
-        FIRST(7, 0); 
-#undef FIRST 
- 
-        for (size_t byte = 1; byte < sizeof(Word) - 1; ++byte) { 
-#define NEXT(reg) do { \ 
-          if (kStride > reg) { \ 
-            crc##reg ^= \ 
-                TABLE_ENTRY(this->crc_word_interleaved_, byte, buf##reg); \ 
-            buf##reg >>= 8; \ 
-          } \ 
-} while(0) 
-          REPEAT_FROM_0(NEXT); 
-#undef NEXT 
-        } 
- 
-#define LAST(reg) do { \ 
-        if (kStride > reg) { \ 
-          crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_interleaved_, buf##reg); \ 
-          buf##reg = reinterpret_cast<const Word *>(src)[reg]; \ 
-        } \ 
-} while(0) 
-        REPEAT_FROM_0(LAST); 
-#undef LAST 
-      } 
-      while (src < end); 
- 
-      if (sizeof(Crc) > sizeof(Word)) { 
-        crc0 ^= crc_carryover; 
-      } 
- 
-#define COMBINE(reg) do { \ 
-      if (kStride > reg) { \ 
-        if (reg != 0) { \ 
-          crc0 ^= crc##reg; \ 
-        } \ 
-        CRC_WORD(this, crc0, buf##reg); \ 
-      } \ 
-} while (0) 
-      REPEAT_FROM_0(COMBINE); 
-#undef COMBINE 
- 
-      src += kInterleaveBytes; 
-    } 
-    end += 2*kInterleaveBytes - 1; 
- 
-    // Process sizeof(Word) bytes at once. 
-    end -= sizeof(Word) - 1; 
-    for (; src < end; src += sizeof(Word)) { 
-      Word buf0 = reinterpret_cast<const Word *>(src)[0]; 
-      CRC_WORD(this, crc0, buf0); 
-    } 
-    end += sizeof(Word) - 1; 
- 
-    // Compute CRC of remaining bytes. 
-    for (;src < end; ++src) { 
-      CRC_BYTE(this, crc0, *src); 
-    } 
- 
-    return (crc0 ^ Base().Canonize()); 
-  } 
- 
- protected: 
-  enum { 
-    kInterleaveBytes = sizeof(Word) * kStride, 
-  }; 
- 
-  // Multiplication tables used by CRCs. 
-  TableEntry crc_word_interleaved_[sizeof(Word)][256]; 
-  TableEntry crc_word_[sizeof(Word)][256]; 
- 
-  // Base class stored after CRC tables so that the most frequently 
-  // used table is at offset 0 and may be accessed faster. 
-  GfUtil<Crc> base_; 
- 
-  friend class RollingCrc< GenericCrc<Crc, TableEntry, Word, kStride> >; 
- 
- private: 
-  // CrcMultiword on amd64 may run at 1.2 CPU cycles per byte which is 
-  // noticeably faster than CrcWord (2.2-2.6 cycles/byte depending on 
-  // hardware and compiler). However, there are problems with compilers. 
-  // 
-  // Test system: P45 chipset, Intel Q9650 CPU, 800MHz 4-4-4-12 memory. 
-  // 
-  // 64-bit compiler, <= 64-bit CRC, 64-bit tables, 64-bit reads: 
-  // CL 15.00.307291.1  C++   >1.2< CPU cycles/byte 
-  // ICL 11.1.051 -O3   C++    1.5  CPU cycles/byte 
-  // GCC 4.5 -O3        C++    2.0  CPU cycles/byte 
-  // GCC 4.x -O3        ASM   >1.2< CPU cycles/byte 
-  // 
-  // 32-bit compiler, MMX used, <= 64-bit CRC, 64-bit tables, 64-bit reads 
-  // CL 15.00.307291.1  C++   2.0  CPU cycles/byte 
-  // GCC 4.5 -O3        C++   1.9  CPU cycles/byte 
-  // ICL 11.1.051 -S    C++   1.6  CPU cycles/byte 
-  // GCC 4.x -O3        ASM  >1.3< CPU cycles/byte 
-  // 
-  // So, use inline ASM code for GCC for both i386 and amd64. 
- 
-  Crc CrcMultiwordI386Mmx( 
-          const void *data, size_t bytes, const Crc &start) const; 
-  Crc CrcMultiwordGccAmd64( 
-          const void *data, size_t bytes, const Crc &start) const; 
-  Crc CrcMultiwordGccAmd64Sse2( 
-          const uint8 *src, const uint8 *end, const Crc &start) const; 
-} GCC_ALIGN_ATTRIBUTE(16); 
- 
-#undef REPEAT_FROM_0 
-#undef REPEAT_FROM_1 
- 
- 
-// Specialized variants. 
-#if CRCUTIL_USE_ASM 
- 
-#if (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX))) 
- 
-// Declare specialized functions. 
-template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( 
-    const void *data, size_t bytes, const uint64 &start) const; 
- 
-#if HAVE_AMD64 && HAVE_SSE2 
-template<> 
-uint128_sse2 
-GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword( 
-    const void *data, size_t bytes, const uint128_sse2 &start) const; 
-#endif  // HAVE_AMD64 && HAVE_SSE2 
- 
-#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER <= 150030729 && \ 
-      (HAVE_I386 && HAVE_MMX) 
- 
-// Work around bug in MSC (present at least in v. 15.00.30729.1) 
-template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx( 
-    const void *data, 
-    size_t bytes, 
-    const uint64 &start) const; 
-template<> __forceinline 
-uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword( 
-    const void *data, 
-    size_t bytes, 
-    const uint64 &start) const { 
-  typedef uint64 Word; 
-  typedef uint64 Crc; 
-  if (bytes <= 12) { 
-    const uint8 *src = static_cast<const uint8 *>(data); 
-    uint64 crc = start ^ Base().Canonize(); 
-    for (const uint8 *end = src + bytes; src < end; ++src) { 
-      CRC_BYTE(this, crc, *src); 
-    } 
-    return (crc ^ Base().Canonize()); 
-  } 
-  return CrcMultiwordI386Mmx(data, bytes, start); 
-} 
- 
-#endif  // (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX))) 
- 
-#endif  // CRCUTIL_USE_ASM 
- 
- 
-#pragma pack(pop) 
- 
-}  // namespace crcutil 
- 
-#endif  // CRCUTIL_GENERIC_CRC_H_ 
+// Copyright 2010 Google Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Defines GenericCrc class which implements arbitrary CRCs.
+//
+// Please read crc.pdf to understand how it all works.
+
+#ifndef CRCUTIL_GENERIC_CRC_H_
+#define CRCUTIL_GENERIC_CRC_H_
+
+#include "base_types.h"     // uint8
+#include "crc_casts.h"      // TO_BYTE(), Downcast<>.
+#include "gf_util.h"        // GfUtil<Crc> class.
+#include "platform.h"       // GCC_ALIGN_ATTRIBUTE(16)
+#include "uint128_sse2.h"   // uint128_sse2 type (if necessary)
+
+namespace crcutil {
+
+#pragma pack(push, 16)
+
+// Extends CRC by one byte.
+// Technically, if degree of a polynomial does not exceed 8,
+// right shift by 8 bits is not required, but who cares about CRC-8?
+#define CRC_BYTE(table, crc, byte) do { \
+  crc = ((sizeof(crc) > 1) ? SHIFT_RIGHT_SAFE(crc, 8) : 0) ^ \
+        table->crc_word_[sizeof(Word) - 1][TO_BYTE(crc) ^ (byte)]; \
+} while (0)
+
+#define TABLE_ENTRY(table, byte, buf) \
+  table[byte][Downcast<Word, uint8>(buf)]
+
+#define TABLE_ENTRY_LAST(table, buf) \
+  table[sizeof(Word) - 1][buf]
+
+// Extends CRC by one word.
+#define CRC_WORD(table, crc, buf) do { \
+  buf ^= Downcast<Crc, Word>(crc); \
+  if (sizeof(crc) > sizeof(buf)) { \
+    crc = SHIFT_RIGHT_SAFE(crc, sizeof(buf) * 8); \
+    crc ^= TABLE_ENTRY(table->crc_word_, 0, buf); \
+  } else { \
+    crc = TABLE_ENTRY(table->crc_word_, 0, buf); \
+  } \
+  buf >>= 8; \
+  for (size_t byte = 1; byte < sizeof(buf) - 1; ++byte) { \
+    crc ^= TABLE_ENTRY(table->crc_word_, byte, buf); \
+    buf >>= 8; \
+  } \
+  crc ^= TABLE_ENTRY_LAST(table->crc_word_, buf); \
+} while (0)
+
+// Process beginning of data block byte by byte until source pointer
+// becomes perfectly aligned on Word boundary.
+#define ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word) do { \
+  while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { \
+    if (src >= end) { \
+      return (crc ^ table->Base().Canonize()); \
+    } \
+    CRC_BYTE(table, crc, *src); \
+    src += 1; \
+  } \
+} while (0)
+
+
+// On amd64, enforcing alignment is 2-4% slower on small (<= 64 bytes) blocks
+// but 6-10% faster on larger blocks (>= 2KB).
+// Break-even point (+-1%) is around 1KB (Q9650, E6600).
+//
+#define ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, table, src, end, crc, Word) \
+do { \
+  if (sizeof(Word) > 8 || (bytes) > CRCUTIL_MIN_ALIGN_SIZE) { \
+    ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word); \
+  } \
+} while (0)
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable: 4127)  // conditional expression is constant
+#endif  // defined(_MSC_VER)
+
+// Forward declarations.
+template<typename CrcImplementation> class RollingCrc;
+
+// Crc        is the type used internally and to return values of N-bit CRC.
+//            It should be at least as large as "TableEntry" and "Word" but
+//            may be larger (e.g. for 16-bit CRC, TableEntry and Word may be
+//            set to uint16 but Crc may be set to uint32).
+//
+// TableEntry is the type of values stored in the tables.
+//            To implement N-bit CRC, TableEntry should be large enough
+//            to store N bits.
+//
+// Word       is the type used to read data sizeof(Word) at a time.
+//            Ideally, it should be "most suitable for given architecture"
+//            integer type -- typically "size_t".
+//
+// kStride    is the number of words processed in interleaved manner by
+//            CrcMultiword() and CrcBlockword(). Shall be either 3 or 4.
+//            Optimal value depends on hardware architecture (AMD64, ARM, etc).
+//
+template<typename _Crc, typename _TableEntry, typename _Word, int kStride>
+    class GenericCrc {
+ public:
+  // Make Crc, TableEntry, and Word types visible (used by RollingCrc etc.)
+  typedef _Crc Crc;
+  typedef _TableEntry TableEntry;
+  typedef _Word Word;
+
+  GenericCrc() {}
+
+  // Initializes the tables for the given generating polynomial and its degree.
+  // If "canonical" is true, crc value will be XOR'ed with (-1) before and
+  // after actual CRC computation.
+  GenericCrc(const Crc &generating_polynomial, size_t degree, bool canonical) {
+    Init(generating_polynomial, degree, canonical);
+  }
+  void Init(const Crc &generating_polynomial, size_t degree, bool canonical) {
+    base_.Init(generating_polynomial, degree, canonical);
+
+    // Instead of computing
+    //    table[j][i] = MultiplyUnnormalized(i, 8, k),
+    // for all i = 0...255, we may notice that
+    // if i = 2**n then for all m = 1...(i-1)
+    // MultiplyUnnormalized(i + m, 8, k) =
+    //    MultiplyUnnormalized(i ^ m, 8, k) =
+    //    MultiplyUnnormalized(i, 8, k) ^ MultiplyUnnormalized(m, 8, k) =
+    //    MultiplyUnnormalized(i, 8, k) ^ crc_word_interleaved[j][m] =
+    //    table[i] ^ table[m].
+#if 0
+    for (size_t j = 0; j < sizeof(Word); ++j) {
+      Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree);
+      for (size_t i = 0; i < 256; ++i) {
+        Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k);
+        this->crc_word_interleaved_[j][i] = Downcast<Crc, TableEntry>(temp);
+      }
+    }
+#else
+    for (size_t j = 0; j < sizeof(Word); ++j) {
+      Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree);
+      TableEntry *table = this->crc_word_interleaved_[j];
+      table[0] = 0;  // Init 0s entry -- multiply 0 by anything yields 0.
+      for (size_t i = 1; i < 256; i <<= 1) {
+        TableEntry value = Downcast<Crc, TableEntry>(
+            Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k));
+        table[i] = value;
+        for (size_t m = 1; m < i; ++m) {
+          table[i + m] = value ^ table[m];
+        }
+      }
+    }
+#endif
+
+#if 0
+    for (size_t j = 0; j < sizeof(Word); ++j) {
+      Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree);
+      for (size_t i = 0; i < 256; ++i) {
+        Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k);
+        this->crc_word_[j][i] = Downcast<Crc, TableEntry>(temp);
+      }
+    }
+#else
+    for (size_t j = 0; j < sizeof(Word); ++j) {
+      Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree);
+      TableEntry *table = this->crc_word_[j];
+      table[0] = 0;  // Init 0s entry -- multiply 0 by anything yields 0.
+      for (size_t i = 1; i < 256; i <<= 1) {
+        TableEntry value = Downcast<Crc, TableEntry>(
+            Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k));
+        table[i] = value;
+        for (size_t m = 1; m < i; ++m) {
+          table[i + m] = value ^ table[m];
+        }
+      }
+    }
+#endif
+  }
+
+  // Default CRC implementation
+  Crc CrcDefault(const void *data, size_t bytes, const Crc &start) const {
+#if HAVE_AMD64 || HAVE_I386
+    return CrcMultiword(data, bytes, start);
+#else
+    // Very few CPUs have multiple ALUs and speculative execution
+    // (Itanium is an exception) so sophisticated algorithms will
+    // not perform better than good old Sarwate algorithm.
+    return CrcByteUnrolled(data, bytes, start);
+#endif  // HAVE_AMD64 || HAVE_I386
+  }
+
+  // Returns base class.
+  const GfUtil<Crc> &Base() const { return base_; }
+
+ protected:
+  // Canonical, byte-by-byte CRC computation.
+  Crc CrcByte(const void *data, size_t bytes, const Crc &start) const {
+    const uint8 *src = static_cast<const uint8 *>(data);
+    Crc crc = start ^ Base().Canonize();
+    for (const uint8 *end = src + bytes; src < end; ++src) {
+      CRC_BYTE(this, crc, *src);
+    }
+    return (crc ^ Base().Canonize());
+  }
+
+  // Byte-by-byte CRC with main loop unrolled.
+  //
+  // data/bytes describe the input buffer and start is the CRC value to
+  // continue from. An empty buffer returns start unchanged.
+  Crc CrcByteUnrolled(const void *data, size_t bytes, const Crc &start) const {
+    if (bytes == 0) {
+      return start;
+    }
+
+    const uint8 *src = static_cast<const uint8 *>(data);
+    const uint8 *const end = src + bytes;
+    // End of the last complete 4-byte group. It is derived from the byte
+    // count rather than from "end - 3" so that no pointer before the start
+    // of the buffer is ever formed when fewer than 3 bytes are supplied.
+    const uint8 *const unrolled_end =
+        src + (bytes & ~static_cast<size_t>(3));
+    Crc crc = start ^ Base().Canonize();
+
+    // Unroll loop 4 times.
+    for (; src < unrolled_end; src += 4) {
+      PREFETCH(src);
+      CRC_BYTE(this, crc, src[0]);
+      CRC_BYTE(this, crc, src[1]);
+      CRC_BYTE(this, crc, src[2]);
+      CRC_BYTE(this, crc, src[3]);
+    }
+
+    // Compute CRC of remaining (at most 3) bytes.
+    for (; src < end; ++src) {
+      CRC_BYTE(this, crc, *src);
+    }
+
+    return (crc ^ Base().Canonize());
+  }
+
+  // Byte-table CRC that consumes the input sizeof(Crc) bytes at a time:
+  // each Crc-sized chunk is XORed into the running CRC, which is then
+  // advanced by applying CRC_BYTE with a zero byte sizeof(Crc) times.
+  //
+  // data/bytes describe the input buffer and start is the CRC value to
+  // continue from.
+  Crc CrcByteWord(const void *data, size_t bytes, const Crc &start) const {
+    const uint8 *src = static_cast<const uint8 *>(data);
+    const uint8 *end = src + bytes;
+    Crc crc0 = start ^ Base().Canonize();
+
+    ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Crc);
+    if (src >= end) {
+      return (crc0 ^ Base().Canonize());
+    }
+
+    // Loop limits below are computed from the number of bytes left instead
+    // of temporarily moving "end" backwards; for short inputs the latter
+    // could form a pointer before the start of the buffer.
+
+    // Process 4*sizeof(Crc) bytes at a time.
+    const size_t block_bytes = 4 * sizeof(Crc);
+    const uint8 *const block_end =
+        src + (static_cast<size_t>(end - src) / block_bytes) * block_bytes;
+    for (; src < block_end; src += block_bytes) {
+      for (size_t i = 0; i < 4; ++i) {
+        crc0 ^= reinterpret_cast<const Crc *>(src)[i];
+        if (i == 0) {
+          PREFETCH(src);
+        }
+        for (size_t byte = 0; byte < sizeof(crc0); ++byte) {
+          CRC_BYTE(this, crc0, 0);
+        }
+      }
+    }
+
+    // Process sizeof(Crc) bytes at a time.
+    const uint8 *const word_end =
+        src + (static_cast<size_t>(end - src) / sizeof(Crc)) * sizeof(Crc);
+    for (; src < word_end; src += sizeof(Crc)) {
+      crc0 ^= reinterpret_cast<const Crc *>(src)[0];
+      for (size_t byte = 0; byte < sizeof(crc0); ++byte) {
+        CRC_BYTE(this, crc0, 0);
+      }
+    }
+
+    // Compute CRC of remaining bytes.
+    for (; src < end; ++src) {
+      CRC_BYTE(this, crc0, *src);
+    }
+
+    return (crc0 ^ Base().Canonize());
+  }
+
+  // Faster, word-by-word CRC: feeds the input to CRC_WORD one Word at a
+  // time, with the main loop handling four Words per iteration.
+  //
+  // data/bytes describe the input buffer and start is the CRC value to
+  // continue from.
+  Crc CrcWord(const void *data, size_t bytes, const Crc &start) const {
+    const uint8 *src = static_cast<const uint8 *>(data);
+    const uint8 *end = src + bytes;
+    Crc crc0 = start ^ Base().Canonize();
+
+    ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
+    if (src >= end) {
+      return (crc0 ^ Base().Canonize());
+    }
+
+    // Loop limits below are computed from the number of bytes left instead
+    // of temporarily moving "end" backwards; for short inputs the latter
+    // could form a pointer before the start of the buffer.
+
+    // Process 4 * sizeof(Word) bytes at once.
+    const size_t block_bytes = 4 * sizeof(Word);
+    const uint8 *const block_end =
+        src + (static_cast<size_t>(end - src) / block_bytes) * block_bytes;
+    for (; src < block_end; src += block_bytes) {
+      Word buf0 = reinterpret_cast<const Word *>(src)[0];
+      PREFETCH(src);
+      CRC_WORD(this, crc0, buf0);
+      buf0 = reinterpret_cast<const Word *>(src)[1];
+      CRC_WORD(this, crc0, buf0);
+      buf0 = reinterpret_cast<const Word *>(src)[2];
+      CRC_WORD(this, crc0, buf0);
+      buf0 = reinterpret_cast<const Word *>(src)[3];
+      CRC_WORD(this, crc0, buf0);
+    }
+
+    // Process sizeof(Word) bytes at a time.
+    const uint8 *const word_end =
+        src + (static_cast<size_t>(end - src) / sizeof(Word)) * sizeof(Word);
+    for (; src < word_end; src += sizeof(Word)) {
+      Word buf0 = reinterpret_cast<const Word *>(src)[0];
+      CRC_WORD(this, crc0, buf0);
+    }
+
+    // Compute CRC of remaining bytes.
+    for (; src < end; ++src) {
+      CRC_BYTE(this, crc0, *src);
+    }
+
+    return (crc0 ^ Base().Canonize());
+  }
+
+#define REPEAT_FROM_1(macro) \
+  macro(1); \
+  macro(2); \
+  macro(3); \
+  macro(4); \
+  macro(5); \
+  macro(6); \
+  macro(7);  // Invokes macro(1) through macro(7), in order.
+
+#define REPEAT_FROM_0(macro) \
+  macro(0); \
+  REPEAT_FROM_1(macro)  // Invokes macro(0) through macro(7), in order.
+
+  // Faster, process adjacent blocks (stripes) in parallel and concatenate CRCs.
+  Crc CrcBlockword(const void *data, size_t bytes, const Crc &start) const {
+    if (kStride < 2 || kStride > 8) {
+      // Unsupported configuration (the macros below cover registers 0..7);
+      // fall back to something sensible.
+      return CrcWord(data, bytes, start);
+    }
+
+    const uint8 *src = static_cast<const uint8 *>(data);
+    const uint8 *end = src + bytes;
+    Crc crc0 = start ^ Base().Canonize();
+    enum {
+      // Add 16 to avoid false L1 cache collisions.
+      kStripe = (15*1024 + 16) & ~(sizeof(Word) - 1),  // low bits masked off
+    };
+
+    ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
+    if (src >= end) {
+      return (crc0 ^ Base().Canonize());
+    }
+
+    end -= kStride * kStripe - 1;  // "src < end" now means a full chunk is left
+    if (src < end) {
+      Crc x_pow_8kStripe = Base().Xpow8N(kStripe);  // multiplier for COMBINE
+      do {
+        const uint8 *stripe_end = src + kStripe;  // end of stripe 0 of chunk
+
+#define INIT_CRC(reg) \
+        Crc crc##reg; \
+        if (kStride >= reg) { \
+          crc##reg = 0; \
+        }
+        REPEAT_FROM_1(INIT_CRC);  // declares crc1..crc7 (crc0 is the outer one)
+#undef INIT_CRC
+
+        do {
+#define FIRST(reg) \
+          Word buf##reg; \
+          if (kStride > reg) { \
+            buf##reg = reinterpret_cast<const Word *>(src + reg * kStripe)[0]; \
+            buf##reg ^= Downcast<Crc, Word>(crc##reg); \
+            if (sizeof(crc##reg) > sizeof(buf##reg)) { \
+              crc##reg = SHIFT_RIGHT_SAFE(crc##reg, sizeof(buf##reg) * 8); \
+              crc##reg ^= TABLE_ENTRY(this->crc_word_, 0, buf##reg); \
+            } else { \
+              crc##reg = TABLE_ENTRY(this->crc_word_, 0, buf##reg); \
+            } \
+            buf##reg >>= 8; \
+          }
+          REPEAT_FROM_0(FIRST);  // read one Word per active stripe; table 0
+#undef FIRST
+
+          for (size_t byte = 1; byte < sizeof(buf0) - 1; ++byte) {
+#define NEXT(reg) do { \
+            if (kStride > reg) { \
+              crc##reg ^= TABLE_ENTRY(this->crc_word_, byte, buf##reg); \
+              buf##reg >>= 8; \
+            } \
+} while (0)
+            REPEAT_FROM_0(NEXT);  // table "byte" lookup, then shift buf by 8
+#undef NEXT
+          }
+
+#define LAST(reg) do { \
+          if (kStride > reg) { \
+            crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_, buf##reg); \
+          } \
+} while (0)
+          REPEAT_FROM_0(LAST);  // final lookup through TABLE_ENTRY_LAST
+#undef LAST
+
+          src += sizeof(Word);
+        } while (src < stripe_end);
+
+#if 0
+// The code is left for illustrative purposes only.
+#define COMBINE(reg) do { \
+        if (reg > 0 && kStride > reg) { \
+          crc0 = Base().ChangeStartValue(crc##reg, kStripe, 0, crc0); \
+        } \
+} while (0)
+#else
+#define COMBINE(reg) do { \
+        if (reg > 0 && kStride > reg) { \
+          crc0 = crc##reg ^ Base().Multiply(crc0, x_pow_8kStripe); \
+        } \
+} while (0)
+#endif
+        REPEAT_FROM_0(COMBINE);  // fold crc1..crc(kStride-1) into crc0 in order
+#undef COMBINE
+
+        src += (kStride - 1) * kStripe;  // skip stripes 1.. handled above
+      }
+      while (src < end);
+    }
+    end += kStride * kStripe - 1;  // restore the real end of the buffer
+
+    // Process sizeof(Word) bytes at a time.
+    end -= sizeof(Word) - 1;  // "src < end" now means a whole Word is left
+    for (; src < end; src += sizeof(Word)) {
+      Word buf0 = reinterpret_cast<const Word *>(src)[0];
+      CRC_WORD(this, crc0, buf0);
+    }
+    end += sizeof(Word) - 1;  // restore the real end of the buffer
+
+    // Compute CRC of remaining bytes.
+    for (;src < end; ++src) {
+      CRC_BYTE(this, crc0, *src);
+    }
+
+    return (crc0 ^ Base().Canonize());
+  }
+
+  // Fastest, interleaved multi-byte CRC (kStride Words per loop iteration).
+  Crc CrcMultiword(const void *data, size_t bytes, const Crc &start) const {
+    if (kStride < 2 || kStride > 8) {
+      // Unsupported configuration (the macros below cover registers 0..7);
+      // fall back to something sensible.
+      return CrcWord(data, bytes, start);
+    }
+
+    const uint8 *src = static_cast<const uint8 *>(data);
+    const uint8 *end = src + bytes;
+    Crc crc0 = start ^ Base().Canonize();
+
+    ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
+    if (src >= end) {
+      return (crc0 ^ Base().Canonize());
+    }
+
+    // Process kStride Word registers at once;
+    // should have at least 2*kInterleaveBytes of data to start.
+    end -= 2*kInterleaveBytes - 1;  // "src < end": 2*kInterleaveBytes are left
+    if (src < end) {
+      Crc crc_carryover;
+      if (sizeof(Crc) > sizeof(Word)) {
+        // crc_carryover is used if and only if Crc is wider than Word.
+        crc_carryover = 0;
+      }
+#define INIT_CRC(reg) \
+      Crc crc##reg; \
+      if (reg > 0 && kStride > reg) { \
+        crc##reg = 0; \
+      }
+      REPEAT_FROM_1(INIT_CRC);  // declares crc1..crc7 (crc0 is the outer one)
+#undef INIT_CRC
+
+#define INIT_BUF(reg) \
+      Word buf##reg; \
+      if (kStride > reg) { \
+        buf##reg = reinterpret_cast<const Word *>(src)[reg]; \
+      }
+      REPEAT_FROM_0(INIT_BUF);  // declares buf0..buf7; loads first kStride Words
+#undef INIT_BUF
+
+      do {
+        PREFETCH(src);
+        src += kInterleaveBytes;  // src now addresses the next group of Words
+
+        if (sizeof(Crc) > sizeof(Word)) {
+          crc0 ^= crc_carryover;
+        }
+
+#define FIRST(reg, next_reg) do { \
+        if (kStride > reg) { \
+          buf##reg ^= Downcast<Crc, Word>(crc##reg); \
+          if (sizeof(Crc) > sizeof(Word)) { \
+            if (reg < kStride - 1) { \
+              crc##next_reg ^= SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \
+            } else { \
+              crc_carryover = SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \
+            } \
+          } \
+          crc##reg = TABLE_ENTRY(this->crc_word_interleaved_, 0, buf##reg); \
+          buf##reg >>= 8; \
+        } \
+} while (0)
+        FIRST(0, 1);
+        FIRST(1, 2);
+        FIRST(2, 3);
+        FIRST(3, 4);
+        FIRST(4, 5);
+        FIRST(5, 6);
+        FIRST(6, 7);
+        FIRST(7, 0);  // next_reg is never written here since kStride <= 8
+#undef FIRST
+
+        for (size_t byte = 1; byte < sizeof(Word) - 1; ++byte) {
+#define NEXT(reg) do { \
+          if (kStride > reg) { \
+            crc##reg ^= \
+                TABLE_ENTRY(this->crc_word_interleaved_, byte, buf##reg); \
+            buf##reg >>= 8; \
+          } \
+} while(0)
+          REPEAT_FROM_0(NEXT);  // table "byte" lookup, then shift buf by 8
+#undef NEXT
+        }
+
+#define LAST(reg) do { \
+        if (kStride > reg) { \
+          crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_interleaved_, buf##reg); \
+          buf##reg = reinterpret_cast<const Word *>(src)[reg]; \
+        } \
+} while(0)
+        REPEAT_FROM_0(LAST);  // final lookup, then preload the next Words
+#undef LAST
+      }
+      while (src < end);
+
+      if (sizeof(Crc) > sizeof(Word)) {
+        crc0 ^= crc_carryover;
+      }
+
+#define COMBINE(reg) do { \
+      if (kStride > reg) { \
+        if (reg != 0) { \
+          crc0 ^= crc##reg; \
+        } \
+        CRC_WORD(this, crc0, buf##reg); \
+      } \
+} while (0)
+      REPEAT_FROM_0(COMBINE);  // merge crc1.. into crc0, consuming preloaded bufs
+#undef COMBINE
+
+      src += kInterleaveBytes;  // the preloaded Words were consumed by COMBINE
+    }
+    end += 2*kInterleaveBytes - 1;  // restore the real end of the buffer
+
+    // Process sizeof(Word) bytes at once.
+    end -= sizeof(Word) - 1;  // "src < end" now means a whole Word is left
+    for (; src < end; src += sizeof(Word)) {
+      Word buf0 = reinterpret_cast<const Word *>(src)[0];
+      CRC_WORD(this, crc0, buf0);
+    }
+    end += sizeof(Word) - 1;  // restore the real end of the buffer
+
+    // Compute CRC of remaining bytes.
+    for (;src < end; ++src) {
+      CRC_BYTE(this, crc0, *src);
+    }
+
+    return (crc0 ^ Base().Canonize());
+  }
+
+ protected:
+  enum {
+    kInterleaveBytes = sizeof(Word) * kStride,  // CrcMultiword loop step
+  };
+
+  // Multiplication tables used by CRCs (one 256-entry table per Word byte).
+  TableEntry crc_word_interleaved_[sizeof(Word)][256];  // read by CrcMultiword
+  TableEntry crc_word_[sizeof(Word)][256];  // read by CrcBlockword
+
+  // GfUtil helper stored after CRC tables so that the most frequently
+  // used table is at offset 0 and may be accessed faster.
+  GfUtil<Crc> base_;
+
+  friend class RollingCrc< GenericCrc<Crc, TableEntry, Word, kStride> >;
+
+ private:
+  // CrcMultiword on amd64 may run at 1.2 CPU cycles per byte which is
+  // noticeably faster than CrcWord (2.2-2.6 cycles/byte depending on
+  // hardware and compiler). However, there are problems with compilers.
+  //
+  // Test system: P45 chipset, Intel Q9650 CPU, 800MHz 4-4-4-12 memory.
+  //
+  // 64-bit compiler, <= 64-bit CRC, 64-bit tables, 64-bit reads:
+  // CL 15.00.30729.1   C++   >1.2< CPU cycles/byte
+  // ICL 11.1.051 -O3   C++    1.5  CPU cycles/byte
+  // GCC 4.5 -O3        C++    2.0  CPU cycles/byte
+  // GCC 4.x -O3        ASM   >1.2< CPU cycles/byte
+  //
+  // 32-bit compiler, MMX used, <= 64-bit CRC, 64-bit tables, 64-bit reads
+  // CL 15.00.30729.1   C++   2.0  CPU cycles/byte
+  // GCC 4.5 -O3        C++   1.9  CPU cycles/byte
+  // ICL 11.1.051 -S    C++   1.6  CPU cycles/byte
+  // GCC 4.x -O3        ASM  >1.3< CPU cycles/byte
+  //
+  // So, use inline ASM code for GCC for both i386 and amd64.
+
+  Crc CrcMultiwordI386Mmx(
+          const void *data, size_t bytes, const Crc &start) const;
+  Crc CrcMultiwordGccAmd64(
+          const void *data, size_t bytes, const Crc &start) const;
+  Crc CrcMultiwordGccAmd64Sse2(
+          const uint8 *src, const uint8 *end, const Crc &start) const;  // NOTE(review): all three are declarations only; definitions presumably live in the ASM sources
+} GCC_ALIGN_ATTRIBUTE(16);
+
+#undef REPEAT_FROM_0
+#undef REPEAT_FROM_1
+
+
+// Specialized variants.
+#if CRCUTIL_USE_ASM
+
+#if (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX)))
+
+// Declare specialized functions (explicit specializations of CrcMultiword).
+template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
+    const void *data, size_t bytes, const uint64 &start) const;
+
+#if HAVE_AMD64 && HAVE_SSE2
+template<>
+uint128_sse2
+GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword(
+    const void *data, size_t bytes, const uint128_sse2 &start) const;
+#endif  // HAVE_AMD64 && HAVE_SSE2
+
+#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER <= 150030729 && \
+      (HAVE_I386 && HAVE_MMX)
+
+// Work around bug in MSC (present at least in v. 15.00.30729.1): short inputs
+template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
+    const void *data,
+    size_t bytes,
+    const uint64 &start) const;
+template<> __forceinline
+uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
+    const void *data,
+    size_t bytes,
+    const uint64 &start) const {
+  typedef uint64 Word;  // presumably referenced by macro expansions; confirm
+  typedef uint64 Crc;
+  if (bytes <= 12) {  // short input: plain byte-at-a-time loop, no MMX call
+    const uint8 *src = static_cast<const uint8 *>(data);
+    uint64 crc = start ^ Base().Canonize();
+    for (const uint8 *end = src + bytes; src < end; ++src) {
+      CRC_BYTE(this, crc, *src);
+    }
+    return (crc ^ Base().Canonize());
+  }
+  return CrcMultiwordI386Mmx(data, bytes, start);  // longer input: MMX variant
+}
+
+#endif  // (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX)))
+
+#endif  // CRCUTIL_USE_ASM
+
+
+#pragma pack(pop)
+
+}  // namespace crcutil
+
+#endif  // CRCUTIL_GENERIC_CRC_H_
author	f0b0s <f0b0s@yandex-team.ru>	2022-02-10 16:46:51 +0300
committer	Daniil Cherednik <dcherednik@yandex-team.ru>	2022-02-10 16:46:51 +0300
commit	cdae02d225fb5b3afbb28990e79a7ac6c9125327 (patch)
tree	49e222ea1c5804306084bb3ae065bb702625360f /contrib/libs/crcutil/generic_crc.h
parent	deabc5260ac2e17b8f5152ee060bec1740613540 (diff)
download	ydb-cdae02d225fb5b3afbb28990e79a7ac6c9125327.tar.gz