Restoring authorship annotation for <f0b0s@yandex-team.ru>. Commit 1 of 2.

author: f0b0s <f0b0s@yandex-team.ru> 2022-02-10 16:46:51 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:46:51 +0300
commit: deabc5260ac2e17b8f5152ee060bec1740613540 (patch)
tree: bc498b2fe3c447d13c2abea85b429fee8dd485ef /contrib/libs/crcutil/crc32c_sse4.cc
parent: 2e6009493e74f88988b81f219b301f450331648d (diff)
download: ydb-deabc5260ac2e17b8f5152ee060bec1740613540.tar.gz
1 files changed, 363 insertions, 363 deletions
diff --git a/contrib/libs/crcutil/crc32c_sse4.cc b/contrib/libs/crcutil/crc32c_sse4.cc
index ed0f64410a..9875ab4ff2 100644
--- a/contrib/libs/crcutil/crc32c_sse4.cc
+++ b/contrib/libs/crcutil/crc32c_sse4.cc
@@ -1,369 +1,369 @@
-// Copyright 2010 Google Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Implements CRC32C using Intel's SSE4 crc32 instruction.
-// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero,
-// emilates intrinsics via CRC_WORD/CRC_BYTE otherwise.
-
-#include "crc32c_sse4.h"
-
+// Copyright 2010 Google Inc.  All rights reserved. 
+// 
+// Licensed under the Apache License, Version 2.0 (the "License"); 
+// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at 
+// 
+//      http://www.apache.org/licenses/LICENSE-2.0 
+// 
+// Unless required by applicable law or agreed to in writing, software 
+// distributed under the License is distributed on an "AS IS" BASIS, 
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and 
+// limitations under the License. 
+ 
+// Implements CRC32C using Intel's SSE4 crc32 instruction. 
+// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero, 
+// emulates intrinsics via CRC_WORD/CRC_BYTE otherwise. 
+ 
+#include "crc32c_sse4.h" 
+ 
 #include <util/system/compiler.h>
 
-#if HAVE_I386 || HAVE_AMD64
-
-namespace crcutil {
-
-#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \
-  CRC_UPDATE_WORD(crc0, \
-      reinterpret_cast<const size_t *>(src + \
-          0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
-  CRC_UPDATE_WORD(crc1, \
-      reinterpret_cast<const size_t *>(src + \
-          1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
-  CRC_UPDATE_WORD(crc2, \
-      reinterpret_cast<const size_t *>(src + \
-          2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
-  if (num_stripes > 3) { \
-    CRC_UPDATE_WORD(crc3, \
-        reinterpret_cast<const size_t *>(src + \
-            3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
-  } \
-} while (0)
-
-// Multiplies "crc" by "x**(8 *  STRIPE_SIZE(block_size)"
-// using appropriate multiplication table(s).
-//
-#if 0
-
-// This variant is for illustration purposes only.
-// Actual implementation below:
-// 1. Splits the computation into 2 data-independent paths
-//    by independently multiplying lower and upper halves
-//    of "crc0" in interleaved manner, and combining the
-//    results in the end.
-// 2. Removing redundant "crc0 = 0" etc. in the beginning.
-// 3. Removing redundant shifts of "tmp0" and "tmp1" in the last round.
-#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
-  size_t tmp0 = crc0; \
-  crc0 = 0; \
-  for (size_t i = 0; i < kNumTables; ++i) { \
-    crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
-            [i][tmp0 & (kTableEntries - 1)]; \
-    tmp0 >>= kTableEntryBits; \
-  } \
-} while (0)
-
-#else
-
-#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
-  size_t tmp0 = crc0; \
-  size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \
-  crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
-         [0][tmp0 & (kTableEntries - 1)]; \
-  tmp0 >>= kTableEntryBits; \
-  size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
-                [kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
-  tmp1 >>= kTableEntryBits; \
-  for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \
-    crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
-            [i][tmp0 & (kTableEntries - 1)]; \
-    tmp0 >>= kTableEntryBits; \
-    crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
-            [i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
-    tmp1 >>= kTableEntryBits; \
-  } \
-  crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
-          [kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \
-  if (kNumTables & 1) { \
-    tmp0 >>= kTableEntryBits; \
-  } \
-  crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
-          [kNumTables - 1][tmp1]; \
-  if (kNumTables & 1) { \
-    crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
-            [kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \
-  } \
-  crc0 ^= crc1; \
-} while (0)
-
-#endif
-
-// Given CRCs (crc0, crc1, etc.) of consequitive
-// stripes of STRIPE_SIZE(block_size) bytes each,
-// produces CRC of concatenated stripes.
-#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \
-  MULTIPLY_CRC(crc0, block_size, num_stripes); \
-  crc0 ^= crc1; \
-  MULTIPLY_CRC(crc0, block_size, num_stripes); \
-  crc0 ^= crc2; \
-  if (num_stripes > 3) { \
-    MULTIPLY_CRC(crc0, block_size, num_stripes); \
-    crc0 ^= crc3; \
-  } \
-} while (0)
-
-// Processes input BLOCK_SIZE(block) bytes per iteration
-// by splitting a block of BLOCK_SIZE(block) bytes into N
-// equally-sized stripes of STRIPE_SIZE(block_size) each,
-// computing CRC of each stripe, and concatenating stripe CRCs.
-#define PROCESS_BLOCK(block_size, num_stripes) do { \
-  while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
-    Crc crc1 = 0; \
-    Crc crc2 = 0; \
-    Crc crc3; \
-    if (num_stripes > 3) crc3 = 0; \
-    { \
-      const uint8 *stripe_end = src + \
-          (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \
-              kUnrolledLoopBytes) * kUnrolledLoopBytes; \
-      do { \
-        UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
-        UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \
-        UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \
-        UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \
-        UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \
-        UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \
-        UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \
-        UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \
-        src += kUnrolledLoopBytes; \
-      } while (src < stripe_end); \
-      if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
-          kUnrolledLoopBytes) != 0) { \
-        stripe_end += \
-            CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
-                kUnrolledLoopBytes; \
-        do { \
-          UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
-          src += sizeof(size_t); \
-        } while (src < stripe_end); \
-      } \
-    } \
-    COMBINE_STRIPE_CRCS(block_size, num_stripes); \
-    src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \
-           ((num_stripes) - 1); \
-    bytes = static_cast<size_t>(end - src); \
-  } \
- no_more_##block_size##_##num_stripes:; \
-} while (0)
-
+#if HAVE_I386 || HAVE_AMD64 
+ 
+namespace crcutil { 
+// Updates every stripe CRC (crc0..crc2, crc3 if > 3 stripes) with its word [index]. 
+#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \ 
+  CRC_UPDATE_WORD(crc0, \ 
+      reinterpret_cast<const size_t *>(src + \ 
+          0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ 
+  CRC_UPDATE_WORD(crc1, \ 
+      reinterpret_cast<const size_t *>(src + \ 
+          1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ 
+  CRC_UPDATE_WORD(crc2, \ 
+      reinterpret_cast<const size_t *>(src + \ 
+          2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ 
+  if (num_stripes > 3) { \ 
+    CRC_UPDATE_WORD(crc3, \ 
+        reinterpret_cast<const size_t *>(src + \ 
+            3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \ 
+  } \ 
+} while (0) 
+ 
+// Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size))" 
+// using the multiplication table(s) selected by 
+// CRC32C_SSE4_MUL_TABLE(block_size, num_stripes). 
+#if 0 
+ 
+// This variant is for illustration purposes only. 
+// Actual implementation below: 
+// 1. Splits the computation into 2 data-independent paths 
+//    by independently multiplying lower and upper halves 
+//    of "crc0" in interleaved manner, and combining the 
+//    results in the end. 
+// 2. Removes the redundant "crc0 = 0" etc. in the beginning. 
+// 3. Removes redundant shifts of "tmp0" and "tmp1" in the last round. 
+#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ 
+  size_t tmp0 = crc0; \ 
+  crc0 = 0; \ 
+  for (size_t i = 0; i < kNumTables; ++i) { \ 
+    crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ 
+            [i][tmp0 & (kTableEntries - 1)]; \ 
+    tmp0 >>= kTableEntryBits; \ 
+  } \ 
+} while (0) 
+ 
+#else 
+// NOTE: the local "crc1" below shadows any caller "crc1" inside the macro body. 
+#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \ 
+  size_t tmp0 = crc0; \ 
+  size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \ 
+  crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ 
+         [0][tmp0 & (kTableEntries - 1)]; \ 
+  tmp0 >>= kTableEntryBits; \ 
+  size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ 
+                [kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ 
+  tmp1 >>= kTableEntryBits; \ 
+  for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \ 
+    crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ 
+            [i][tmp0 & (kTableEntries - 1)]; \ 
+    tmp0 >>= kTableEntryBits; \ 
+    crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ 
+            [i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \ 
+    tmp1 >>= kTableEntryBits; \ 
+  } \ 
+  crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ 
+          [kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \ 
+  if (kNumTables & 1) { \ 
+    tmp0 >>= kTableEntryBits; \ 
+  } \ 
+  crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ 
+          [kNumTables - 1][tmp1]; \ 
+  if (kNumTables & 1) { \ 
+    crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \ 
+            [kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \ 
+  } \ 
+  crc0 ^= crc1; \ 
+} while (0) 
+ 
+#endif 
+ 
+// Given CRCs (crc0, crc1, etc.) of consecutive 
+// stripes of STRIPE_SIZE(block_size) bytes each, 
+// produces the CRC of the concatenated stripes in crc0. 
+#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \ 
+  MULTIPLY_CRC(crc0, block_size, num_stripes); \ 
+  crc0 ^= crc1; \ 
+  MULTIPLY_CRC(crc0, block_size, num_stripes); \ 
+  crc0 ^= crc2; \ 
+  if (num_stripes > 3) { \ 
+    MULTIPLY_CRC(crc0, block_size, num_stripes); \ 
+    crc0 ^= crc3; \ 
+  } \ 
+} while (0) 
+ 
+// Processes CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes) bytes per 
+// iteration: splits the block into equally-sized stripes, updates one CRC 
+// per stripe (8 words per unrolled pass), then combines the stripe CRCs. 
+// Also defines the no_more_<block_size>_<num_stripes> label used by gotos. 
+#define PROCESS_BLOCK(block_size, num_stripes) do { \ 
+  while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ 
+    Crc crc1 = 0; \ 
+    Crc crc2 = 0; \ 
+    Crc crc3; \ 
+    if (num_stripes > 3) crc3 = 0; \ 
+    { \ 
+      const uint8 *stripe_end = src + \ 
+          (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \ 
+              kUnrolledLoopBytes) * kUnrolledLoopBytes; \ 
+      do { \ 
+        UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ 
+        UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \ 
+        UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \ 
+        UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \ 
+        UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \ 
+        UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \ 
+        UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \ 
+        UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \ 
+        src += kUnrolledLoopBytes; \ 
+      } while (src < stripe_end); \ 
+      if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ 
+          kUnrolledLoopBytes) != 0) { \ 
+        stripe_end += \ 
+            CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \ 
+                kUnrolledLoopBytes; \ 
+        do { \ 
+          UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \ 
+          src += sizeof(size_t); \ 
+        } while (src < stripe_end); \ 
+      } \ 
+    } \ 
+    COMBINE_STRIPE_CRCS(block_size, num_stripes); \ 
+    src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \ 
+           ((num_stripes) - 1); \ 
+    bytes = static_cast<size_t>(end - src); \ 
+  } \ 
+ no_more_##block_size##_##num_stripes:; \ 
+} while (0) 
+ 
 Y_NO_SANITIZE("undefined")
-size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const {
-  const uint8 *src = static_cast<const uint8 *>(data);
-  const uint8 *end = src + bytes;
-  crc0 ^= Base().Canonize();
-
-  // If we don't have too much data to process,
-  // do not waste time trying to align input etc.
-  // Noticeably improves performance on small inputs.
-  if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t;
-  if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t;
-  if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t;
-
-#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \
-  if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
-    goto no_more_##block_size##_##num_stripes; \
-  } \
-} while (0)
-#define NOOP(block_size, num_stripes)
-
-  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL,
-                                             NOOP,
-                                             NOOP);
-
-#undef PROCESS_TAIL_IF_SMALL
-
-
-  // Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because:
-  // 1. It uses CRC_BYTE() which won't work.
-  // 2. Its threshold may be incorrect becuase Crc32 that uses
-  //    native CPU crc32 instruction is much faster than
-  //    generic table-based CRC computation.
-  //
-  // In case of X5550 CPU, break even point is at 2KB -- exactly.
-  if (bytes >= 2 * 1024) {
-    while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) {
-      if (src >= end) {
-        return (crc0 ^ Base().Canonize());
-      }
-      CRC_UPDATE_BYTE(crc0, src[0]);
-      src += 1;
-    }
-    bytes = static_cast<size_t>(end - src);
-  }
-  if (src >= end) {
-    return (crc0 ^ Base().Canonize());
-  }
-
-  // Quickly skip processing of too large blocks
-  // Noticeably improves performance on small inputs.
-#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \
-  if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
-    goto no_more_##block_size##_##num_stripes; \
-  } \
-} while (0)
-
-  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP,
-                                             SKIP_BLOCK_IF_NEEDED,
-                                             SKIP_BLOCK_IF_NEEDED);
-
-#undef SKIP_BLOCK_IF_NEEDED
-
-  // Process data in all blocks.
-  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK,
-                                              PROCESS_BLOCK,
-                                              PROCESS_BLOCK);
-
-  // Finish the tail word-by-word and then byte-by-byte.
-#define CRC_UPDATE_WORD_4(index) do { \
-  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \
-  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \
-  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \
-  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \
-} while (0)
-
-  if (bytes >= 4 * 4 * sizeof(size_t)) {
-    end -= 4 * 4 * sizeof(size_t);
-    do {
-      CRC_UPDATE_WORD_4(4 * 0);
-      CRC_UPDATE_WORD_4(4 * 1);
-      CRC_UPDATE_WORD_4(4 * 2);
-      CRC_UPDATE_WORD_4(4 * 3);
-      src += 4 * 4 * sizeof(size_t);
-    } while (src <= end);
-    end += 4 * 4 * sizeof(size_t);
-    bytes = static_cast<size_t>(end - src);
-  }
- less_than_16_size_t:
-
-  if (bytes >= 4 * 2 * sizeof(size_t)) {
-    CRC_UPDATE_WORD_4(4 * 0);
-    CRC_UPDATE_WORD_4(4 * 1);
-    src += 4 * 2 * sizeof(size_t);
-    bytes -= 4 * 2 * sizeof(size_t);
-  }
- less_than_8_size_t:
-
-  if (bytes >= 4 * sizeof(size_t)) {
-    CRC_UPDATE_WORD_4(0);
-    src += 4 * sizeof(size_t);
-    bytes -= 4 * sizeof(size_t);
-  }
- less_than_4_size_t:
-
-  if (bytes >= 1 * sizeof(size_t)) {
-    end -= 1 * sizeof(size_t);
-    do {
-      CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]);
-      src += 1 * sizeof(size_t);
-    } while (src <= end);
-    end += 1 * sizeof(size_t);
-  }
-
-  while (src < end) {
-    CRC_UPDATE_BYTE(crc0, src[0]);
-    src += 1;
-  }
-
-  return (crc0 ^ Base().Canonize());
-}
-
-
-void Crc32cSSE4::Init(bool constant) {
-  base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant);
-
-#define INIT_MUL_TABLE(block_size, num_stripes) do { \
-  size_t multiplier = \
-      Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \
-  for (size_t table = 0; table < kNumTables; ++table) { \
-    for (size_t entry = 0; entry < kTableEntries; ++entry) { \
-      size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \
-      CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \
-            static_cast<Entry>(Base().Multiply(value, multiplier)); \
-    } \
-  } \
-} while (0)
-
-  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE);
-
-#undef INIT_MUL_TABLE
-
-#if !CRCUTIL_USE_MM_CRC32
-  for (size_t j = 0; j < sizeof(Word); ++j) {
-    Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32);
-    for (size_t i = 0; i < 256; ++i) {
-      crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k);
-    }
-  }
-#endif  // !CRCUTIL_USE_MM_CRC32
-}
-
-
-bool Crc32cSSE4::IsSSE42Available() {
-#if defined(_MSC_VER)
-  int cpu_info[4];
-  __cpuid(cpu_info, 1);
+size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const { 
+  const uint8 *src = static_cast<const uint8 *>(data); 
+  const uint8 *end = src + bytes; 
+  crc0 ^= Base().Canonize(); 
+ 
+  // Small inputs jump straight to the matching tail stage below, 
+  // skipping the alignment and block-processing steps entirely. 
+  // Noticeably improves performance on small inputs. 
+  if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t; 
+  if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t; 
+  if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t; 
+ 
+#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \ 
+  if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ 
+    goto no_more_##block_size##_##num_stripes; \ 
+  } \ 
+} while (0) 
+#define NOOP(block_size, num_stripes) 
+ 
+  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL, 
+                                             NOOP, 
+                                             NOOP); 
+ 
+#undef PROCESS_TAIL_IF_SMALL 
+ 
+ 
+  // Align "src" on a Word boundary by hand, one byte at a time. 
+  // ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() is not used here because: 
+  // 1. It uses CRC_BYTE() which won't work. 
+  // 2. Its threshold may be incorrect because Crc32 that uses 
+  //    the native CPU crc32 instruction is much faster than 
+  //    generic table-based CRC computation. 
+  // In case of X5550 CPU, the break-even point is at 2KB -- exactly. 
+  if (bytes >= 2 * 1024) { 
+    while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { 
+      if (src >= end) { 
+        return (crc0 ^ Base().Canonize()); 
+      } 
+      CRC_UPDATE_BYTE(crc0, src[0]); 
+      src += 1; 
+    } 
+    bytes = static_cast<size_t>(end - src); 
+  } 
+  if (src >= end) { 
+    return (crc0 ^ Base().Canonize()); 
+  } 
+ 
+  // Quickly skip block sizes that are larger than the remaining input. 
+  // Noticeably improves performance on small inputs. 
+#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \ 
+  if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \ 
+    goto no_more_##block_size##_##num_stripes; \ 
+  } \ 
+} while (0) 
+ 
+  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP, 
+                                             SKIP_BLOCK_IF_NEEDED, 
+                                             SKIP_BLOCK_IF_NEEDED); 
+ 
+#undef SKIP_BLOCK_IF_NEEDED 
+ 
+  // Process data in all blocks (enumerated in descending order). 
+  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK, 
+                                              PROCESS_BLOCK, 
+                                              PROCESS_BLOCK); 
+ 
+  // Finish the tail word-by-word and then byte-by-byte. 
+#define CRC_UPDATE_WORD_4(index) do { \ 
+  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \ 
+  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \ 
+  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \ 
+  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \ 
+} while (0) 
+ 
+  if (bytes >= 4 * 4 * sizeof(size_t)) { 
+    end -= 4 * 4 * sizeof(size_t); 
+    do { 
+      CRC_UPDATE_WORD_4(4 * 0); 
+      CRC_UPDATE_WORD_4(4 * 1); 
+      CRC_UPDATE_WORD_4(4 * 2); 
+      CRC_UPDATE_WORD_4(4 * 3); 
+      src += 4 * 4 * sizeof(size_t); 
+    } while (src <= end); 
+    end += 4 * 4 * sizeof(size_t); 
+    bytes = static_cast<size_t>(end - src); 
+  } 
+ less_than_16_size_t: 
+ 
+  if (bytes >= 4 * 2 * sizeof(size_t)) { 
+    CRC_UPDATE_WORD_4(4 * 0); 
+    CRC_UPDATE_WORD_4(4 * 1); 
+    src += 4 * 2 * sizeof(size_t); 
+    bytes -= 4 * 2 * sizeof(size_t); 
+  } 
+ less_than_8_size_t: 
+ 
+  if (bytes >= 4 * sizeof(size_t)) { 
+    CRC_UPDATE_WORD_4(0); 
+    src += 4 * sizeof(size_t); 
+    bytes -= 4 * sizeof(size_t); 
+  } 
+ less_than_4_size_t: 
+ 
+  if (bytes >= 1 * sizeof(size_t)) { 
+    end -= 1 * sizeof(size_t); 
+    do { 
+      CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]); 
+      src += 1 * sizeof(size_t); 
+    } while (src <= end); 
+    end += 1 * sizeof(size_t); 
+  } 
+ 
+  while (src < end) { 
+    CRC_UPDATE_BYTE(crc0, src[0]); 
+    src += 1; 
+  } 
+ 
+  return (crc0 ^ Base().Canonize()); 
+} 
+ 
+ 
+void Crc32cSSE4::Init(bool constant) { 
+  base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant); 
+  // For every block configuration, fill its stripe multiplication tables. 
+#define INIT_MUL_TABLE(block_size, num_stripes) do { \ 
+  size_t multiplier = \ 
+      Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \ 
+  for (size_t table = 0; table < kNumTables; ++table) { \ 
+    for (size_t entry = 0; entry < kTableEntries; ++entry) { \ 
+      size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \ 
+      CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \ 
+            static_cast<Entry>(Base().Multiply(value, multiplier)); \ 
+    } \ 
+  } \ 
+} while (0) 
+ 
+  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE); 
+ 
+#undef INIT_MUL_TABLE 
+  // Without the _mm_crc32_* intrinsics, also build the crc_word_ lookup tables. 
+#if !CRCUTIL_USE_MM_CRC32 
+  for (size_t j = 0; j < sizeof(Word); ++j) { 
+    Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32); 
+    for (size_t i = 0; i < 256; ++i) { 
+      crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k); 
+    } 
+  } 
+#endif  // !CRCUTIL_USE_MM_CRC32 
+} 
+ 
+ 
+bool Crc32cSSE4::IsSSE42Available() { 
+#if defined(_MSC_VER) 
+  int cpu_info[4]; 
+  __cpuid(cpu_info, 1); 
   return ((cpu_info[2] & (1 << 20)) != 0);
-#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386)
-  // Not using "cpuid.h" intentionally: it is missing from
-  // too many installations.
-  uint32 eax;
-  uint32 ecx;
-  uint32 edx;
-  __asm__ volatile(
-#if HAVE_I386 && defined(__PIC__)
+#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386) 
+  // Query CPUID leaf 1 with inline asm. "cpuid.h" is intentionally not 
+  // used: it is missing from too many installations. 
+  uint32 eax; 
+  uint32 ecx; 
+  uint32 edx; 
+  __asm__ volatile( 
+#if HAVE_I386 && defined(__PIC__) 
     "push %%ebx\n"
-    "cpuid\n"
+    "cpuid\n" 
     "pop %%ebx\n"
-#else
-    "cpuid\n"
-#endif  // HAVE_I386 && defined(__PIC__)
-    : "=a" (eax), "=c" (ecx), "=d" (edx)
-    : "a" (1), "2" (0)
-    : "%ebx"
-  );
-  return ((ecx & (1 << 20)) != 0);
-#else
-  return false;
-#endif
-}
-
-
-void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc,
-                             size_t roll_window_bytes,
-                             const Crc &start_value) {
-  crc_ = &crc;
-  roll_window_bytes_ = roll_window_bytes;
-  start_value_ = start_value;
-
-  Crc add = crc.Base().Canonize() ^ start_value;
-  add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes));
-  add ^= crc.Base().Canonize();
-  Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1);
-  add = crc.Base().Multiply(add, mul);
-
-  mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree());
-  for (size_t i = 0; i < 256; ++i) {
-    out_[i] = static_cast<Entry>(
-                  crc.Base().MultiplyUnnormalized(
-                      static_cast<Crc>(i), 8, mul) ^ add);
-  }
-
-#if !CRCUTIL_USE_MM_CRC32
-  memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_));
-#endif  // !CRCUTIL_USE_MM_CRC32
-}
-
-}  // namespace crcutil
-
-#endif  // HAVE_I386 || HAVE_AMD64
+#else 
+    "cpuid\n" 
+#endif  // HAVE_I386 && defined(__PIC__) 
+    : "=a" (eax), "=c" (ecx), "=d" (edx) 
+    : "a" (1), "2" (0) 
+    : "%ebx" 
+  ); 
+  return ((ecx & (1 << 20)) != 0); 
+#else 
+  return false; 
+#endif 
+} 
+ 
+ 
+void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc, 
+                             size_t roll_window_bytes, 
+                             const Crc &start_value) { 
+  crc_ = &crc; 
+  roll_window_bytes_ = roll_window_bytes; 
+  start_value_ = start_value; 
+  // "add" becomes the constant that is XORed into every out_ entry below. 
+  Crc add = crc.Base().Canonize() ^ start_value; 
+  add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes)); 
+  add ^= crc.Base().Canonize(); 
+  Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1); 
+  add = crc.Base().Multiply(add, mul); 
+  // "mul" is reused here as the multiplier for the 256-entry out_ table. 
+  mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree()); 
+  for (size_t i = 0; i < 256; ++i) { 
+    out_[i] = static_cast<Entry>( 
+                  crc.Base().MultiplyUnnormalized( 
+                      static_cast<Crc>(i), 8, mul) ^ add); 
+  } 
+ 
+#if !CRCUTIL_USE_MM_CRC32 
+  memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_)); 
+#endif  // !CRCUTIL_USE_MM_CRC32 
+} 
+ 
+}  // namespace crcutil 
+ 
+#endif  // HAVE_I386 || HAVE_AMD64
author	f0b0s <f0b0s@yandex-team.ru>	2022-02-10 16:46:51 +0300
committer	Daniil Cherednik <dcherednik@yandex-team.ru>	2022-02-10 16:46:51 +0300
commit	deabc5260ac2e17b8f5152ee060bec1740613540 (patch)
tree	bc498b2fe3c447d13c2abea85b429fee8dd485ef /contrib/libs/crcutil/crc32c_sse4.cc
parent	2e6009493e74f88988b81f219b301f450331648d (diff)
download	ydb-deabc5260ac2e17b8f5152ee060bec1740613540.tar.gz