author | alexv-smirnov <alex@ydb.tech> | 2022-12-20 00:50:48 +0300
committer | alexv-smirnov <alex@ydb.tech> | 2022-12-20 00:50:48 +0300
commit | 84f2cfa253cc618438ed6e9d68b33fa7c0d88cb9 (patch)
tree | f0cf2236e0aafb3e437199f1ac7b559e7fad554a /contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
parent | bde6febc1ad3b826e72746de21d7250803e8e0b5 (diff)
download | ydb-84f2cfa253cc618438ed6e9d68b33fa7c0d88cb9.tar.gz
add windows platform to ydb github export
Diffstat (limited to 'contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc')
-rw-r--r-- | contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc | 304
1 file changed, 304 insertions, 0 deletions
diff --git a/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc b/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
new file mode 100644
index 0000000000..af7352aa46
--- /dev/null
+++ b/contrib/libs/crcutil/multiword_64_64_cl_i386_mmx.cc
@@ -0,0 +1,304 @@
+// Copyright 2010 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Implements 64-bit multiword CRC for Microsoft and Intel compilers
+// using MMX instructions (i386).
+
+#include "generic_crc.h"
+
+#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
+
+namespace crcutil {
+
+#define CRC_WORD_MMX() \
+    __asm pxor BUF0, CRC0 \
+    __asm movd TMP0, BUF0 \
+    __asm psrlq BUF0, 32 \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm movq CRC0, [TABLE + TEMP * 8] \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \
+    __asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \
+    __asm movd TMP0, BUF0 \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \
+    __asm movzx TEMP, TMP0L \
+    __asm shr TMP0, 8 \
+    __asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \
+    __asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
+
+// frame pointer register 'ebp' modified by inline assembly code
+#pragma warning(disable: 4731)
+
+template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
+    const void *data,
+    size_t bytes,
+    const uint64 &start) const {
+  const uint8 *src = static_cast<const uint8 *>(data);
+  const uint8 *end = src + bytes;
+  uint64 crc0 = start ^ this->Base().Canonize();
+
+  ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
+  if (src >= end) {
+    return (crc0 ^ this->Base().Canonize());
+  }
+
+#define CRC0 mm0
+#define CRC1 mm1
+#define CRC2 mm2
+#define CRC3 mm3
+#define BUF0 mm4
+#define BUF1 mm5
+#define BUF2 mm6
+#define BUF3 mm7
+#define TMP0 eax
+#define TMP0L al
+#define TMP0H ah
+#define TMP1 ebx
+#define TMP1L bl
+#define TMP1H bh
+#define TMP2 ecx
+#define TMP2L cl
+#define TMP2H ch
+#define TMP3 edx
+#define TMP3L dl
+#define TMP3H dh
+#define TEMP edi
+#define SRC esi
+#define END [esp]
+#define TABLE ebp
+
+
+  const uint64 *interleaved_table_address =
+      &this->crc_word_interleaved_[0][0];
+  const uint64 *word_table_address = &this->crc_word_[0][0];
+
+  __asm {
+    push ebp
+
+    mov TMP0, interleaved_table_address
+
+    movq CRC0, crc0
+    mov SRC, src
+    mov TMP1, end
+    sub TMP1, 2*4*8 - 1
+    cmp SRC, TMP1
+    mov TABLE, word_table_address
+    jae end_main_loop
+
+    push TABLE
+    mov TABLE, TMP0
+    push TMP1
+
+    pxor CRC1, CRC1
+    pxor CRC2, CRC2
+    pxor CRC3, CRC3
+
+    movq BUF0, [SRC]
+    movq BUF1, [SRC + 1 * 8]
+    movq BUF2, [SRC + 2 * 8]
+    movq BUF3, [SRC + 3 * 8]
+
+ main_loop:
+#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
+    prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH]
+#endif
+    add SRC, 32
+    pxor BUF0, CRC0
+    pxor BUF1, CRC1
+    pxor BUF2, CRC2
+    pxor BUF3, CRC3
+
+    movd TMP0, BUF0
+    psrlq BUF0, 32
+    movd TMP1, BUF1
+    psrlq BUF1, 32
+    movd TMP2, BUF2
+    psrlq BUF2, 32
+    movd TMP3, BUF3
+    psrlq BUF3, 32
+
+    movzx TEMP, TMP0L
+    movq CRC0, [TABLE + TEMP * 8]
+    movzx TEMP, TMP1L
+    movq CRC1, [TABLE + TEMP * 8]
+    movzx TEMP, TMP2L
+    movq CRC2, [TABLE + TEMP * 8]
+    movzx TEMP, TMP3L
+    movq CRC3, [TABLE + TEMP * 8]
+
+    movzx TEMP, TMP0H
+    shr TMP0, 16
+    pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8]
+    movzx TEMP, TMP1H
+    shr TMP1, 16
+    pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8]
+    movzx TEMP, TMP2H
+    shr TMP2, 16
+    pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8]
+    movzx TEMP, TMP3H
+    shr TMP3, 16
+    pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8]
+
+    movzx TEMP, TMP0L
+    shr TMP0, 8
+    pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8]
+    movzx TEMP, TMP1L
+    shr TMP1, 8
+    pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8]
+    movzx TEMP, TMP2L
+    shr TMP2, 8
+    pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8]
+    movzx TEMP, TMP3L
+    shr TMP3, 8
+    pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8]
+
+    pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8]
+    movd TMP0, BUF0
+    pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8]
+    movd TMP1, BUF1
+    pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8]
+    movd TMP2, BUF2
+    pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8]
+    movd TMP3, BUF3
+
+    movzx TEMP, TMP0L
+    pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8]
+    movzx TEMP, TMP1L
+    pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8]
+    movzx TEMP, TMP2L
+    pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8]
+    movzx TEMP, TMP3L
+    pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8]
+
+    movzx TEMP, TMP0H
+    shr TMP0, 16
+    pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8]
+    movzx TEMP, TMP1H
+    shr TMP1, 16
+    pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8]
+    movzx TEMP, TMP2H
+    shr TMP2, 16
+    pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8]
+    movzx TEMP, TMP3H
+    shr TMP3, 16
+    pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8]
+
+    movzx TEMP, TMP0L
+    shr TMP0, 8
+    pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8]
+    movzx TEMP, TMP1L
+    shr TMP1, 8
+    pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8]
+    movzx TEMP, TMP2L
+    shr TMP2, 8
+    pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8]
+    movzx TEMP, TMP3L
+    shr TMP3, 8
+    pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8]
+
+    pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
+    movq BUF0, [SRC]
+    pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8]
+    movq BUF1, [SRC + 1 * 8]
+    pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8]
+    movq BUF2, [SRC + 2 * 8]
+    pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8]
+    movq BUF3, [SRC + 3 * 8]
+
+    cmp END, SRC
+    ja main_loop
+
+#undef END
+#define END TMP1
+    pop END
+    pop TABLE
+    add SRC, 32
+
+    CRC_WORD_MMX()
+
+    pxor BUF1, CRC1
+    movq BUF0, BUF1
+    CRC_WORD_MMX()
+
+    pxor BUF2, CRC2
+    movq BUF0, BUF2
+    CRC_WORD_MMX()
+
+    pxor BUF3, CRC3
+    movq BUF0, BUF3
+    CRC_WORD_MMX()
+
+ end_main_loop:
+    add END, 2*4*8 - 8
+    cmp SRC, END
+    jae end_word_loop
+
+ word_loop:
+    movq BUF0, [SRC]
+    add SRC, 8
+    CRC_WORD_MMX()
+    cmp END, SRC
+    ja word_loop
+ end_word_loop:
+
+#if 0  // Plain C version is faster?
+    add END, 7
+    cmp SRC, END
+    jae end_byte_loop
+
+ byte_loop:
+    movd TMP0, CRC0
+    movzx TEMP, byte ptr [SRC]
+    movzx TMP0, TMP0L
+    psrlq CRC0, 8
+    xor TEMP, TMP0
+    add SRC, 1
+    pxor CRC0, [TABLE + TEMP*8 + 7*256*8]
+    cmp END, SRC
+    ja byte_loop
+ end_byte_loop:
+#endif
+
+    pop ebp
+
+    mov src, SRC
+    movq crc0, CRC0
+
+    emms
+  }
+
+#if 1
+  // Compute CRC of remaining bytes.
+  for (;src < end; ++src) {
+    CRC_BYTE(this, crc0, *src);
+  }
+#endif
+
+  return (crc0 ^ this->Base().Canonize());
+}
+
+
+}  // namespace crcutil
+
+#endif  // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
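For readers who want to follow what the assembly above computes: each `CRC_WORD_MMX()` invocation folds one 64-bit word of input into the running CRC by XORing the CRC into the word and then looking each of the eight resulting bytes up in its own 256-entry table of 64-bit constants (the `[TABLE + byte * 8 + k * 256 * 8]` addressing). Below is a minimal C++ sketch of that per-word step, assuming a precomputed `table[8][256]` in the spirit of crcutil's `crc_word_`; the function name `crc_word_update` is illustrative and is not part of the crcutil API.

```cpp
#include <cstdint>

// Illustrative sketch, not crcutil API: "table" stands in for one of the
// precomputed 8 x 256 arrays of 64-bit constants (this->crc_word_ or
// this->crc_word_interleaved_ in the diff); its values depend on the CRC
// polynomial and are assumed to be generated elsewhere.
inline uint64_t crc_word_update(uint64_t crc, uint64_t word,
                                const uint64_t table[8][256]) {
  // Equivalent of one CRC_WORD_MMX() invocation: fold one little-endian
  // 64-bit word into the CRC using eight byte-indexed table lookups.
  uint64_t buf = crc ^ word;  // __asm pxor BUF0, CRC0
  uint64_t result = 0;
  for (int k = 0; k < 8; ++k) {
    // Byte k of "buf" indexes sub-table k, i.e. the assembly's
    // [TABLE + byte * 8 + k * 256 * 8] addressing.
    result ^= table[k][(buf >> (8 * k)) & 0xFF];
  }
  return result;
}
```

Roughly speaking, the main loop in the diff runs four such updates in parallel on MMX registers CRC0..CRC3 over each 32-byte block (using the interleaved table to hide load latency), and the trailing `CRC_WORD_MMX()` calls fold the four partial CRCs back into a single value before the word and byte tail loops finish the buffer.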