aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
diff options
context:
space:
mode:
authoralexv-smirnov <alex@ydb.tech>2022-12-20 00:50:48 +0300
committeralexv-smirnov <alex@ydb.tech>2022-12-20 00:50:48 +0300
commit84f2cfa253cc618438ed6e9d68b33fa7c0d88cb9 (patch)
treef0cf2236e0aafb3e437199f1ac7b559e7fad554a /contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
parentbde6febc1ad3b826e72746de21d7250803e8e0b5 (diff)
downloadydb-84f2cfa253cc618438ed6e9d68b33fa7c0d88cb9.tar.gz
add windows platform to ydb github export
Diffstat (limited to 'contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc')
-rw-r--r--contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc304
1 files changed, 304 insertions, 0 deletions
diff --git a/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc b/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
new file mode 100644
index 0000000000..af7352aa46
--- /dev/null
+++ b/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
@@ -0,0 +1,304 @@
+// Copyright 2010 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Implements 64-bit multiword CRC for Microsoft and Intel compilers
+// using MMX instructions (i386).
+
+#include "generic_crc.h"
+
+#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
+
+namespace crcutil {
+
+#define CRC_WORD_MMX() \
+ __asm pxor BUF0, CRC0 \
+ __asm movd TMP0, BUF0 \
+ __asm psrlq BUF0, 32 \
+ __asm movzx TEMP, TMP0L \
+ __asm shr TMP0, 8 \
+ __asm movq CRC0, [TABLE + TEMP * 8] \
+ __asm movzx TEMP, TMP0L \
+ __asm shr TMP0, 8 \
+ __asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \
+ __asm movzx TEMP, TMP0L \
+ __asm shr TMP0, 8 \
+ __asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \
+ __asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \
+ __asm movd TMP0, BUF0 \
+ __asm movzx TEMP, TMP0L \
+ __asm shr TMP0, 8 \
+ __asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \
+ __asm movzx TEMP, TMP0L \
+ __asm shr TMP0, 8 \
+ __asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \
+ __asm movzx TEMP, TMP0L \
+ __asm shr TMP0, 8 \
+ __asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \
+ __asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
+
+// frame pointer register 'ebp' modified by inline assembly code
+#pragma warning(disable: 4731)
+
+template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
+ const void *data,
+ size_t bytes,
+ const uint64 &start) const {
+ const uint8 *src = static_cast<const uint8 *>(data);
+ const uint8 *end = src + bytes;
+ uint64 crc0 = start ^ this->Base().Canonize();
+
+ ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
+ if (src >= end) {
+ return (crc0 ^ this->Base().Canonize());
+ }
+
+#define CRC0 mm0
+#define CRC1 mm1
+#define CRC2 mm2
+#define CRC3 mm3
+#define BUF0 mm4
+#define BUF1 mm5
+#define BUF2 mm6
+#define BUF3 mm7
+#define TMP0 eax
+#define TMP0L al
+#define TMP0H ah
+#define TMP1 ebx
+#define TMP1L bl
+#define TMP1H bh
+#define TMP2 ecx
+#define TMP2L cl
+#define TMP2H ch
+#define TMP3 edx
+#define TMP3L dl
+#define TMP3H dh
+#define TEMP edi
+#define SRC esi
+#define END [esp]
+#define TABLE ebp
+
+
+ const uint64 *interleaved_table_address =
+ &this->crc_word_interleaved_[0][0];
+ const uint64 *word_table_address = &this->crc_word_[0][0];
+
+ __asm {
+ push ebp
+
+ mov TMP0, interleaved_table_address
+
+ movq CRC0, crc0
+ mov SRC, src
+ mov TMP1, end
+ sub TMP1, 2*4*8 - 1
+ cmp SRC, TMP1
+ mov TABLE, word_table_address
+ jae end_main_loop
+
+ push TABLE
+ mov TABLE, TMP0
+ push TMP1
+
+ pxor CRC1, CRC1
+ pxor CRC2, CRC2
+ pxor CRC3, CRC3
+
+ movq BUF0, [SRC]
+ movq BUF1, [SRC + 1 * 8]
+ movq BUF2, [SRC + 2 * 8]
+ movq BUF3, [SRC + 3 * 8]
+
+ main_loop:
+#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
+ prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH]
+#endif
+ add SRC, 32
+ pxor BUF0, CRC0
+ pxor BUF1, CRC1
+ pxor BUF2, CRC2
+ pxor BUF3, CRC3
+
+ movd TMP0, BUF0
+ psrlq BUF0, 32
+ movd TMP1, BUF1
+ psrlq BUF1, 32
+ movd TMP2, BUF2
+ psrlq BUF2, 32
+ movd TMP3, BUF3
+ psrlq BUF3, 32
+
+ movzx TEMP, TMP0L
+ movq CRC0, [TABLE + TEMP * 8]
+ movzx TEMP, TMP1L
+ movq CRC1, [TABLE + TEMP * 8]
+ movzx TEMP, TMP2L
+ movq CRC2, [TABLE + TEMP * 8]
+ movzx TEMP, TMP3L
+ movq CRC3, [TABLE + TEMP * 8]
+
+ movzx TEMP, TMP0H
+ shr TMP0, 16
+ pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8]
+ movzx TEMP, TMP1H
+ shr TMP1, 16
+ pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8]
+ movzx TEMP, TMP2H
+ shr TMP2, 16
+ pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8]
+ movzx TEMP, TMP3H
+ shr TMP3, 16
+ pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8]
+
+ movzx TEMP, TMP0L
+ shr TMP0, 8
+ pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8]
+ movzx TEMP, TMP1L
+ shr TMP1, 8
+ pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8]
+ movzx TEMP, TMP2L
+ shr TMP2, 8
+ pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8]
+ movzx TEMP, TMP3L
+ shr TMP3, 8
+ pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8]
+
+ pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8]
+ movd TMP0, BUF0
+ pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8]
+ movd TMP1, BUF1
+ pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8]
+ movd TMP2, BUF2
+ pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8]
+ movd TMP3, BUF3
+
+ movzx TEMP, TMP0L
+ pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8]
+ movzx TEMP, TMP1L
+ pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8]
+ movzx TEMP, TMP2L
+ pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8]
+ movzx TEMP, TMP3L
+ pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8]
+
+ movzx TEMP, TMP0H
+ shr TMP0, 16
+ pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8]
+ movzx TEMP, TMP1H
+ shr TMP1, 16
+ pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8]
+ movzx TEMP, TMP2H
+ shr TMP2, 16
+ pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8]
+ movzx TEMP, TMP3H
+ shr TMP3, 16
+ pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8]
+
+ movzx TEMP, TMP0L
+ shr TMP0, 8
+ pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8]
+ movzx TEMP, TMP1L
+ shr TMP1, 8
+ pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8]
+ movzx TEMP, TMP2L
+ shr TMP2, 8
+ pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8]
+ movzx TEMP, TMP3L
+ shr TMP3, 8
+ pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8]
+
+ pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
+ movq BUF0, [SRC]
+ pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8]
+ movq BUF1, [SRC + 1 * 8]
+ pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8]
+ movq BUF2, [SRC + 2 * 8]
+ pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8]
+ movq BUF3, [SRC + 3 * 8]
+
+ cmp END, SRC
+ ja main_loop
+
+#undef END
+#define END TMP1
+ pop END
+ pop TABLE
+ add SRC, 32
+
+ CRC_WORD_MMX()
+
+ pxor BUF1, CRC1
+ movq BUF0, BUF1
+ CRC_WORD_MMX()
+
+ pxor BUF2, CRC2
+ movq BUF0, BUF2
+ CRC_WORD_MMX()
+
+ pxor BUF3, CRC3
+ movq BUF0, BUF3
+ CRC_WORD_MMX()
+
+ end_main_loop:
+ add END, 2*4*8 - 8
+ cmp SRC, END
+ jae end_word_loop
+
+ word_loop:
+ movq BUF0, [SRC]
+ add SRC, 8
+ CRC_WORD_MMX()
+ cmp END, SRC
+ ja word_loop
+ end_word_loop:
+
+#if 0 // Plain C version is faster?
+ add END, 7
+ cmp SRC, END
+ jae end_byte_loop
+
+ byte_loop:
+ movd TMP0, CRC0
+ movzx TEMP, byte ptr [SRC]
+ movzx TMP0, TMP0L
+ psrlq CRC0, 8
+ xor TEMP, TMP0
+ add SRC, 1
+ pxor CRC0, [TABLE + TEMP*8 + 7*256*8]
+ cmp END, SRC
+ ja byte_loop
+ end_byte_loop:
+#endif
+
+ pop ebp
+
+ mov src, SRC
+ movq crc0, CRC0
+
+ emms
+ }
+
+#if 1
+ // Compute CRC of remaining bytes.
+ for (;src < end; ++src) {
+ CRC_BYTE(this, crc0, *src);
+ }
+#endif
+
+ return (crc0 ^ this->Base().Canonize());
+}
+
+
+} // namespace crcutil
+
+#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)