author | alexv-smirnov <alex@ydb.tech> | 2022-12-20 00:50:48 +0300
committer | alexv-smirnov <alex@ydb.tech> | 2022-12-20 00:50:48 +0300
commit | 84f2cfa253cc618438ed6e9d68b33fa7c0d88cb9 (patch)
tree | f0cf2236e0aafb3e437199f1ac7b559e7fad554a /contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
parent | bde6febc1ad3b826e72746de21d7250803e8e0b5 (diff)
download | ydb-84f2cfa253cc618438ed6e9d68b33fa7c0d88cb9.tar.gz
add windows platform to ydb github export
Diffstat (limited to 'contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc')
-rw-r--r-- | contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc | 304
1 file changed, 304 insertions, 0 deletions
diff --git a/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc b/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
new file mode 100644
index 0000000000..af7352aa46
--- /dev/null
+++ b/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
@@ -0,0 +1,304 @@
+// Copyright 2010 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Implements 64-bit multiword CRC for Microsoft and Intel compilers
+// using MMX instructions (i386).
+
+#include "generic_crc.h"
+
+#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
+
+namespace crcutil {
+
+#define CRC_WORD_MMX() \
+    __asm pxor BUF0, CRC0 \
+    __asm movd TMP0, BUF0 \
+    __asm psrlq BUF0, 32 \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm movq CRC0, [TABLE + TEMP * 8] \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \
+    __asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \
+    __asm movd TMP0, BUF0 \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \
+    __asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
+
+// frame pointer register 'ebp' modified by inline assembly code
+#pragma warning(disable: 4731)
+
+template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
+    const void *data,
+    size_t bytes,
+    const uint64 &start) const {
+  const uint8 *src = static_cast<const uint8 *>(data);
+  const uint8 *end = src + bytes;
+  uint64 crc0 = start ^ this->Base().Canonize();
+
+  ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
+  if (src >= end) {
+    return (crc0 ^ this->Base().Canonize());
+  }
+
+#define CRC0 mm0
+#define CRC1 mm1
+#define CRC2 mm2
+#define CRC3 mm3
+#define BUF0 mm4
+#define BUF1 mm5
+#define BUF2 mm6
+#define BUF3 mm7
+#define TMP0 eax
+#define TMP0L al
+#define TMP0H ah
+#define TMP1 ebx
+#define TMP1L bl
+#define TMP1H bh
+#define TMP2 ecx
+#define TMP2L cl
+#define TMP2H ch
+#define TMP3 edx
+#define TMP3L dl
+#define TMP3H dh
+#define TEMP edi
+#define SRC esi
+#define END [esp]
+#define TABLE ebp
+
+
+  const uint64 *interleaved_table_address =
+      &this->crc_word_interleaved_[0][0];
+  const uint64 *word_table_address = &this->crc_word_[0][0];
+
+  __asm {
+    push ebp
+
+    mov TMP0, interleaved_table_address
+
+    movq CRC0, crc0
+    mov SRC, src
+    mov TMP1, end
+    sub TMP1, 2*4*8 - 1
+    cmp SRC, TMP1
+    mov TABLE, word_table_address
+    jae end_main_loop
+
+    push TABLE
+    mov TABLE, TMP0
+    push TMP1
+
+    pxor CRC1, CRC1
+    pxor CRC2, CRC2
+    pxor CRC3, CRC3
+
+    movq BUF0, [SRC]
+    movq BUF1, [SRC + 1 * 8]
+    movq BUF2, [SRC + 2 * 8]
+    movq BUF3, [SRC + 3 * 8]
+
+ main_loop:
+#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
+    prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH]
+#endif
+    add SRC, 32
+    pxor BUF0, CRC0
+    pxor BUF1, CRC1
+    pxor BUF2, CRC2
+    pxor BUF3, CRC3
+
+    movd TMP0, BUF0
+    psrlq BUF0, 32
+    movd TMP1, BUF1
+    psrlq BUF1, 32
+    movd TMP2, BUF2
+    psrlq BUF2, 32
+    movd TMP3, BUF3
+    psrlq BUF3, 32
+
+    movzx TEMP, TMP0L
+    movq CRC0, [TABLE + TEMP * 8]
+    movzx TEMP, TMP1L
+    movq CRC1, [TABLE + TEMP * 8]
+    movzx TEMP, TMP2L
+    movq CRC2, [TABLE + TEMP * 8]
+    movzx TEMP, TMP3L
+    movq CRC3, [TABLE + TEMP * 8]
+
+    movzx TEMP, TMP0H
+    shr TMP0, 16
+    pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8]
+    movzx TEMP, TMP1H
+    shr TMP1, 16
+    pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8]
+    movzx TEMP, TMP2H
+    shr TMP2, 16
+    pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8]
+    movzx TEMP, TMP3H
+    shr TMP3, 16
+    pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8]
+
+    movzx TEMP, TMP0L
+    shr TMP0, 8
+    pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8]
+    movzx TEMP, TMP1L
+    shr TMP1, 8
+    pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8]
+    movzx TEMP, TMP2L
+    shr TMP2, 8
+    pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8]
+    movzx TEMP, TMP3L
+    shr TMP3, 8
+    pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8]
+
+    pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8]
+    movd TMP0, BUF0
+    pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8]
+    movd TMP1, BUF1
+    pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8]
+    movd TMP2, BUF2
+    pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8]
+    movd TMP3, BUF3
+
+    movzx TEMP, TMP0L
+    pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8]
+    movzx TEMP, TMP1L
+    pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8]
+    movzx TEMP, TMP2L
+    pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8]
+    movzx TEMP, TMP3L
+    pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8]
+
+    movzx TEMP, TMP0H
+    shr TMP0, 16
+    pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8]
+    movzx TEMP, TMP1H
+    shr TMP1, 16
+    pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8]
+    movzx TEMP, TMP2H
+    shr TMP2, 16
+    pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8]
+    movzx TEMP, TMP3H
+    shr TMP3, 16
+    pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8]
+
+    movzx TEMP, TMP0L
+    shr TMP0, 8
+    pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8]
+    movzx TEMP, TMP1L
+    shr TMP1, 8
+    pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8]
+    movzx TEMP, TMP2L
+    shr TMP2, 8
+    pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8]
+    movzx TEMP, TMP3L
+    shr TMP3, 8
+    pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8]
+
+    pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
+    movq BUF0, [SRC]
+    pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8]
+    movq BUF1, [SRC + 1 * 8]
+    pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8]
+    movq BUF2, [SRC + 2 * 8]
+    pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8]
+    movq BUF3, [SRC + 3 * 8]
+
+    cmp END, SRC
+    ja main_loop
+
+#undef END
+#define END TMP1
+    pop END
+    pop TABLE
+    add SRC, 32
+
+    CRC_WORD_MMX()
+
+    pxor BUF1, CRC1
+    movq BUF0, BUF1
+    CRC_WORD_MMX()
+
+    pxor BUF2, CRC2
+    movq BUF0, BUF2
+    CRC_WORD_MMX()
+
+    pxor BUF3, CRC3
+    movq BUF0, BUF3
+    CRC_WORD_MMX()
+
+ end_main_loop:
+    add END, 2*4*8 - 8
+    cmp SRC, END
+    jae end_word_loop
+
+ word_loop:
+    movq BUF0, [SRC]
+    add SRC, 8
+    CRC_WORD_MMX()
+    cmp END, SRC
+    ja word_loop
+ end_word_loop:
+
+#if 0  // Plain C version is faster?
+    add END, 7
+    cmp SRC, END
+    jae end_byte_loop
+
+ byte_loop:
+    movd TMP0, CRC0
+    movzx TEMP, byte ptr [SRC]
+    movzx TMP0, TMP0L
+    psrlq CRC0, 8
+    xor TEMP, TMP0
+    add SRC, 1
+    pxor CRC0, [TABLE + TEMP*8 + 7*256*8]
+    cmp END, SRC
+    ja byte_loop
+ end_byte_loop:
+#endif
+
+    pop ebp
+
+    mov src, SRC
+    movq crc0, CRC0
+
+    emms
+  }
+
+#if 1
+  // Compute CRC of remaining bytes.
+  for (;src < end; ++src) {
+    CRC_BYTE(this, crc0, *src);
+  }
+#endif
+
+  return (crc0 ^ this->Base().Canonize());
+}
+
+
+}  // namespace crcutil
+
+#endif  // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
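For readers who want to follow what the assembly above computes: each `CRC_WORD_MMX()` invocation folds one 64-bit word of input into the running CRC by XORing the CRC into the word and then looking each of the eight resulting bytes up in its own 256-entry table of 64-bit constants (the `[TABLE + byte * 8 + k * 256 * 8]` addressing). Below is a minimal C++ sketch of that per-word step, assuming a precomputed `table[8][256]` in the spirit of crcutil's `crc_word_`; the function name `crc_word_update` is illustrative and is not part of the crcutil API.

```cpp
#include <cstdint>

// Illustrative sketch, not crcutil API: "table" stands in for one of the
// precomputed 8 x 256 arrays of 64-bit constants (this->crc_word_ or
// this->crc_word_interleaved_ in the diff); its values depend on the CRC
// polynomial and are assumed to be generated elsewhere.
inline uint64_t crc_word_update(uint64_t crc, uint64_t word,
                                const uint64_t table[8][256]) {
  // Equivalent of one CRC_WORD_MMX() invocation: fold one little-endian
  // 64-bit word into the CRC using eight byte-indexed table lookups.
  uint64_t buf = crc ^ word;  // __asm pxor BUF0, CRC0
  uint64_t result = 0;
  for (int k = 0; k < 8; ++k) {
    // Byte k of "buf" indexes sub-table k, i.e. the assembly's
    // [TABLE + byte * 8 + k * 256 * 8] addressing.
    result ^= table[k][(buf >> (8 * k)) & 0xFF];
  }
  return result;
}
```

Roughly speaking, the main loop in the diff runs four such updates in parallel on MMX registers CRC0..CRC3 over each 32-byte block (using the interleaved table to hide load latency), and the trailing `CRC_WORD_MMX()` calls fold the four partial CRCs back into a single value before the word and byte tail loops finish the buffer.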