author    | orivej <orivej@yandex-team.ru>               | 2022-02-10 16:44:49 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:49 +0300
commit    | 718c552901d703c502ccbefdfc3c9028d608b947 (patch)
tree      | 46534a98bbefcd7b1f3faa5b52c138ab27db75b7 /contrib/restricted/aws/aws-checksums
parent    | e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (diff)
download  | ydb-718c552901d703c502ccbefdfc3c9028d608b947.tar.gz
Restoring authorship annotation for <orivej@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/restricted/aws/aws-checksums')
7 files changed, 455 insertions(+), 455 deletions(-)
diff --git a/contrib/restricted/aws/aws-checksums/include/aws/checksums/crc.h b/contrib/restricted/aws/aws-checksums/include/aws/checksums/crc.h
index 0bb3022e78..24b0a6dc7b 100644
--- a/contrib/restricted/aws/aws-checksums/include/aws/checksums/crc.h
+++ b/contrib/restricted/aws/aws-checksums/include/aws/checksums/crc.h
@@ -1,8 +1,8 @@
 #ifndef AWS_CHECKSUMS_CRC_H
 #define AWS_CHECKSUMS_CRC_H
-/**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
+/**
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
  */
 
 #include <aws/checksums/exports.h>
diff --git a/contrib/restricted/aws/aws-checksums/include/aws/checksums/exports.h b/contrib/restricted/aws/aws-checksums/include/aws/checksums/exports.h
index a14e84e440..060b6a85ea 100644
--- a/contrib/restricted/aws/aws-checksums/include/aws/checksums/exports.h
+++ b/contrib/restricted/aws/aws-checksums/include/aws/checksums/exports.h
@@ -1,11 +1,11 @@
 #ifndef AWS_CHECKSUMS_EXPORTS_H
 #define AWS_CHECKSUMS_EXPORTS_H
-/**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
+/**
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
  */
-#if defined(AWS_C_RT_USE_WINDOWS_DLL_SEMANTICS) || defined(_WIN32)
-# ifdef AWS_CHECKSUMS_USE_IMPORT_EXPORT
+#if defined(AWS_C_RT_USE_WINDOWS_DLL_SEMANTICS) || defined(_WIN32)
+# ifdef AWS_CHECKSUMS_USE_IMPORT_EXPORT
 # ifdef AWS_CHECKSUMS_EXPORTS
 # define AWS_CHECKSUMS_API __declspec(dllexport)
 # else
@@ -13,14 +13,14 @@
 # endif /* AWS_CHECKSUMS_EXPORTS */
 # else
 # define AWS_CHECKSUMS_API
-# endif /* AWS_CHECKSUMS_USE_IMPORT_EXPORT */
-#else /* defined (AWS_C_RT_USE_WINDOWS_DLL_SEMANTICS) || defined (_WIN32) */
-# if ((__GNUC__ >= 4) || defined(__clang__)) && defined(AWS_CHECKSUMS_USE_IMPORT_EXPORT) && \
-    defined(AWS_CHECKSUMS_EXPORTS)
-# define AWS_CHECKSUMS_API __attribute__((visibility("default")))
-# else
-# define AWS_CHECKSUMS_API
-# endif /* __GNUC__ >= 4 || defined(__clang__) */
-#endif /* defined (AWS_C_RT_USE_WINDOWS_DLL_SEMANTICS) || defined (_WIN32) */
+# endif /* AWS_CHECKSUMS_USE_IMPORT_EXPORT */
+#else /* defined (AWS_C_RT_USE_WINDOWS_DLL_SEMANTICS) || defined (_WIN32) */
+# if ((__GNUC__ >= 4) || defined(__clang__)) && defined(AWS_CHECKSUMS_USE_IMPORT_EXPORT) && \
+    defined(AWS_CHECKSUMS_EXPORTS)
+# define AWS_CHECKSUMS_API __attribute__((visibility("default")))
+# else
+# define AWS_CHECKSUMS_API
+# endif /* __GNUC__ >= 4 || defined(__clang__) */
+#endif /* defined (AWS_C_RT_USE_WINDOWS_DLL_SEMANTICS) || defined (_WIN32) */
 
 #endif /* AWS_CHECKSUMS_EXPORTS_H */
diff --git a/contrib/restricted/aws/aws-checksums/include/aws/checksums/private/crc_priv.h b/contrib/restricted/aws/aws-checksums/include/aws/checksums/private/crc_priv.h
index 221c86f9a2..120de51c6c 100644
--- a/contrib/restricted/aws/aws-checksums/include/aws/checksums/private/crc_priv.h
+++ b/contrib/restricted/aws/aws-checksums/include/aws/checksums/private/crc_priv.h
@@ -1,8 +1,8 @@
 #ifndef AWS_CHECKSUMS_PRIVATE_CRC_PRIV_H
 #define AWS_CHECKSUMS_PRIVATE_CRC_PRIV_H
-/**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
+/**
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
  */
 
 #define AWS_CRC32_SIZE_BYTES 4
@@ -23,9 +23,9 @@ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_sw(const uint8_t *input, int len
 /* Computes the Castagnoli CRC32c (iSCSI). */
 AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32);
 
-/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */
-AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32);
-
+/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */
+AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/contrib/restricted/aws/aws-checksums/source/crc.c b/contrib/restricted/aws/aws-checksums/source/crc.c
index f5d3e802fd..f4445f7316 100644
--- a/contrib/restricted/aws/aws-checksums/source/crc.c
+++ b/contrib/restricted/aws/aws-checksums/source/crc.c
@@ -1,29 +1,29 @@
-/**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
+/**
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
  */
 
 #include <aws/checksums/crc.h>
 #include <aws/checksums/private/crc_priv.h>
-#include <aws/common/cpuid.h>
-
+#include <aws/common/cpuid.h>
+
 static uint32_t (*s_crc32c_fn_ptr)(const uint8_t *input, int length, uint32_t previousCrc32) = 0;
-static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t previousCrc32) = 0;
+static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t previousCrc32) = 0;
 
 uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32) {
-    if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
-        if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
-            s_crc32_fn_ptr = aws_checksums_crc32_hw;
-        } else {
-            s_crc32_fn_ptr = aws_checksums_crc32_sw;
-        }
-    }
-    return s_crc32_fn_ptr(input, length, previousCrc32);
+    if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
+        if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
+            s_crc32_fn_ptr = aws_checksums_crc32_hw;
+        } else {
+            s_crc32_fn_ptr = aws_checksums_crc32_sw;
+        }
+    }
+    return s_crc32_fn_ptr(input, length, previousCrc32);
 }
 
 uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previousCrc32) {
-    if (AWS_UNLIKELY(!s_crc32c_fn_ptr)) {
-        if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) || aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
+    if (AWS_UNLIKELY(!s_crc32c_fn_ptr)) {
+        if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) || aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
             s_crc32c_fn_ptr = aws_checksums_crc32c_hw;
         } else {
             s_crc32c_fn_ptr = aws_checksums_crc32c_sw;
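The crc.c hunk above shows the library's dispatch strategy: the first call probes CPU features once and caches a function pointer, so every later call costs only an indirect jump. A minimal standalone sketch of that pattern in C — my_cpu_has_crc_instructions() and the two back ends are hypothetical stand-ins for aws_cpu_has_feature() and the real _hw/_sw implementations:

    /* Sketch of crc.c's lazy function-pointer dispatch (names hypothetical). */
    #include <stdbool.h>
    #include <stdint.h>

    extern bool my_cpu_has_crc_instructions(void);
    extern uint32_t crc32c_hw_backend(const uint8_t *buf, int len, uint32_t prev);
    extern uint32_t crc32c_sw_backend(const uint8_t *buf, int len, uint32_t prev);

    static uint32_t (*s_fn)(const uint8_t *, int, uint32_t) = 0;

    uint32_t my_crc32c(const uint8_t *buf, int len, uint32_t prev) {
        if (!s_fn) { /* first call: probe the CPU once and cache the choice */
            s_fn = my_cpu_has_crc_instructions() ? crc32c_hw_backend : crc32c_sw_backend;
        }
        return s_fn(buf, len, prev); /* later calls: a single indirect call */
    }

As in crc.c, leaving the pointer unsynchronized is benign here: racing first calls can only store the same deterministic value.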
diff --git a/contrib/restricted/aws/aws-checksums/source/crc_sw.c b/contrib/restricted/aws/aws-checksums/source/crc_sw.c
index 9bc326b9e8..2d6c54e6de 100644
--- a/contrib/restricted/aws/aws-checksums/source/crc_sw.c
+++ b/contrib/restricted/aws/aws-checksums/source/crc_sw.c
@@ -1,6 +1,6 @@
-/**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
+/**
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
  */
 #include <aws/checksums/private/crc_priv.h>
 #include <stddef.h>
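The body of crc_sw.c is elided by this diff (only its license header changed). For orientation, portable CRC32c fallbacks of the kind it provides are commonly table-driven; the following slice-by-one sketch illustrates the idea and is not the actual aws_checksums_crc32c_sw code. CRC32c uses the bit-reflected Castagnoli polynomial 0x82F63B78, and the ~ pre-/post-inversion matches the hardware path further down:

    /* Illustrative slice-by-one software CRC32c; NOT the actual crc_sw.c code. */
    #include <stddef.h>
    #include <stdint.h>

    static uint32_t s_crc32c_table[256];

    static void s_init_table(void) {
        for (uint32_t i = 0; i < 256; ++i) {
            uint32_t c = i;
            for (int bit = 0; bit < 8; ++bit) {
                c = (c & 1u) ? (c >> 1) ^ 0x82F63B78u : (c >> 1);
            }
            s_crc32c_table[i] = c;
        }
    }

    static uint32_t crc32c_sw_sketch(const uint8_t *buf, size_t len, uint32_t prev) {
        uint32_t crc = ~prev; /* same pre-inversion as the hardware path */
        for (size_t i = 0; i < len; ++i) {
            crc = (crc >> 8) ^ s_crc32c_table[(crc ^ buf[i]) & 0xFFu];
        }
        return ~crc; /* post-invert on the way out */
    }

s_init_table() must run once before the first call; production implementations usually ship precomputed tables and wider slice-by-4/8/16 variants for speed.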
diff --git a/contrib/restricted/aws/aws-checksums/source/intel/asm/crc32c_sse42_asm.c b/contrib/restricted/aws/aws-checksums/source/intel/asm/crc32c_sse42_asm.c
index 79278ed84d..232d28c1cc 100644
--- a/contrib/restricted/aws/aws-checksums/source/intel/asm/crc32c_sse42_asm.c
+++ b/contrib/restricted/aws/aws-checksums/source/intel/asm/crc32c_sse42_asm.c
@@ -1,391 +1,391 @@
-/**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
- */
-
-#include <aws/checksums/private/crc_priv.h>
-
-#include <aws/common/cpuid.h>
-
-/*this implementation is only for 64 bit arch and (if on GCC, release mode).
- * If using clang, this will run for both debug and release.*/
-#if defined(__x86_64__) && \
-    (defined(__clang__) || !((defined(__GNUC__)) && ((__GNUC__ == 4 && __GNUC_MINOR__ < 4) || defined(DEBUG_BUILD))))
-
-# if defined(__clang__)
-# pragma clang diagnostic push
-# pragma clang diagnostic ignored "-Wdollar-in-identifier-extension"
-# endif
-
-/*
- * Factored out common inline asm for folding crc0,crc1,crc2 stripes in rcx, r11, r10 using
- * the specified Magic Constants K1 and K2.
- * Assumes rcx, r11, r10 contain crc0, crc1, crc2 that need folding
- * Utilizes xmm1, xmm2, xmm3, xmm4 as well as clobbering r8, r9, r11
- * Result is placed in ecx
- */
-# define FOLD_K1K2(NAME, K1, K2) \
-    "fold_k1k2_" #NAME "_%=: \n" \
-    "movl " #K1 ", %%r8d # Magic K1 constant \n" \
-    "movl " #K2 ", %%r9d # Magic K2 constant \n" \
-    "movq %%rcx, %%xmm1 # crc0 into lower dword of xmm1 \n" \
-    "movq %%r8, %%xmm3 # K1 into lower dword of xmm3 \n" \
-    "movq %%r11, %%xmm2 # crc1 into lower dword of xmm2 \n" \
-    "movq %%r9, %%xmm4 # K2 into lower dword of xmm4 \n" \
-    "pclmulqdq $0x00, %%xmm3, %%xmm1 # Multiply crc0 by K1 \n" \
-    "pclmulqdq $0x00, %%xmm4, %%xmm2 # Multiply crc1 by K2 \n" \
-    "xor %%rcx, %%rcx # \n" \
-    "xor %%r11, %%r11 # \n" \
-    "movq %%xmm1, %%r8 # \n" \
-    "movq %%xmm2, %%r9 # \n" \
-    "crc32q %%r8, %%rcx # folding crc0 \n" \
-    "crc32q %%r9, %%r11 # folding crc1 \n" \
-    "xor %%r10d, %%ecx # combine crc2 and crc0 \n" \
-    "xor %%r11d, %%ecx # combine crc1 and crc0 \n"
-
-/**
- * Private (static) function.
- * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
- * instruction by operating on 24-byte stripes in parallel. The results are folded together using CLMUL. This function
- * is optimized for exactly 256 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a
- * pointer to input data that is exactly 256 bytes in length. Note: this function does NOT invert bits of the input crc
- * or return value.
- */
-static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) {
-    __asm__ __volatile__(
-        "enter_256_%=:"
-
-        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
-        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"
-
-        "crc32q 0(%[in]), %%rcx # crc0 \n"
-        "crc32q 88(%[in]), %%r11 # crc1 \n"
-        "crc32q 176(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 8(%[in]), %%rcx # crc0 \n"
-        "crc32q 96(%[in]), %%r11 # crc1 \n"
-        "crc32q 184(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 16(%[in]), %%rcx # crc0 \n"
-        "crc32q 104(%[in]), %%r11 # crc1 \n"
-        "crc32q 192(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 24(%[in]), %%rcx # crc0 \n"
-        "crc32q 112(%[in]), %%r11 # crc1 \n"
-        "crc32q 200(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 32(%[in]), %%rcx # crc0 \n"
-        "crc32q 120(%[in]), %%r11 # crc1 \n"
-        "crc32q 208(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 40(%[in]), %%rcx # crc0 \n"
-        "crc32q 128(%[in]), %%r11 # crc1 \n"
-        "crc32q 216(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 48(%[in]), %%rcx # crc0 \n"
-        "crc32q 136(%[in]), %%r11 # crc1 \n"
-        "crc32q 224(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 56(%[in]), %%rcx # crc0 \n"
-        "crc32q 144(%[in]), %%r11 # crc1 \n"
-        "crc32q 232(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 64(%[in]), %%rcx # crc0 \n"
-        "crc32q 152(%[in]), %%r11 # crc1 \n"
-        "crc32q 240(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 72(%[in]), %%rcx # crc0 \n"
-        "crc32q 160(%[in]), %%r11 # crc1 \n"
-        "crc32q 248(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 80(%[in]), %%rcx # crc0 \n"
-        "crc32q 168(%[in]), %%r11 # crc2 \n"
-
-        FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */
-
-        /* output registers
-           [crc] is an input and and output so it is marked read/write (i.e. "+c")*/
-        : "+c"(crc)
-
-        /* input registers */
-        : [ crc ] "c"(crc), [ in ] "d"(input)
-
-        /* additional clobbered registers */
-        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
-    return crc;
-}
-
-/**
- * Private (static) function.
- * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
- * instruction by operating on 3 24-byte stripes in parallel. The results are folded together using CLMUL. This function
- * is optimized for exactly 1024 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a
- * pointer to input data that is exactly 1024 bytes in length. Note: this function does NOT invert bits of the input crc
- * or return value.
- */
-static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) {
-    __asm__ __volatile__(
-        "enter_1024_%=:"
-
-        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
-        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"
-
-        "movl $5, %%r8d # Loop 5 times through 64 byte chunks in 3 parallel stripes \n"
-
-        "loop_1024_%=:"
-
-        "prefetcht0 128(%[in]) # \n"
-        "prefetcht0 472(%[in]) # \n"
-        "prefetcht0 808(%[in]) # \n"
-
-        "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n"
-        "crc32q 344(%[in]), %%r11 # crc1: stripe1 \n"
-        "crc32q 680(%[in]), %%r10 # crc2: stripe2 \n"
-
-        "crc32q 8(%[in]), %%rcx # crc0 \n"
-        "crc32q 352(%[in]), %%r11 # crc1 \n"
-        "crc32q 688(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 16(%[in]), %%rcx # crc0 \n"
-        "crc32q 360(%[in]), %%r11 # crc1 \n"
-        "crc32q 696(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 24(%[in]), %%rcx # crc0 \n"
-        "crc32q 368(%[in]), %%r11 # crc1 \n"
-        "crc32q 704(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 32(%[in]), %%rcx # crc0 \n"
-        "crc32q 376(%[in]), %%r11 # crc1 \n"
-        "crc32q 712(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 40(%[in]), %%rcx # crc0 \n"
-        "crc32q 384(%[in]), %%r11 # crc1 \n"
-        "crc32q 720(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 48(%[in]), %%rcx # crc0 \n"
-        "crc32q 392(%[in]), %%r11 # crc1 \n"
-        "crc32q 728(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 56(%[in]), %%rcx # crc0 \n"
-        "crc32q 400(%[in]), %%r11 # crc1 \n"
-        "crc32q 736(%[in]), %%r10 # crc2 \n"
-
-        "add $64, %[in] # \n"
-        "sub $1, %%r8d # \n"
-        "jnz loop_1024_%= # \n"
-
-        "crc32q 0(%[in]), %%rcx # crc0 \n"
-        "crc32q 344(%[in]), %%r11 # crc1 \n"
-        "crc32q 680(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 8(%[in]), %%rcx # crc0 \n"
-        "crc32q 352(%[in]), %%r11 # crc1 \n"
-        "crc32q 688(%[in]), %%r10 # crc2 \n"
-
-        "crc32q 16(%[in]), %%rcx # crc0 \n"
-        "crc32q 696(%[in]), %%r10 # crc2 \n"
-
-        FOLD_K1K2(1024, $0xe417f38a, $0x8f158014) /* Magic Constants used to fold crc stripes into ecx
-
-        output registers
-        [crc] is an input and and output so it is marked read/write (i.e. "+c")
-        we clobber the register for [input] (via add instruction) so we must also
-        tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber */
-        : "+c"(crc), "+d"(input)
-
-        /* input registers */
-        /* the numeric values match the position of the output registers */
-        : [ crc ] "c"(crc), [ in ] "d"(input)
-
-        /* additional clobbered registers */
-        /* "cc" is the flags - we add and sub, so the flags are also clobbered */
-        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
-    return crc;
-}
-
-/**
- * Private (static) function.
- * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
- * instruction by operating on 24-byte stripes in parallel. The results are folded together using CLMUL. This function
- * is optimized for exactly 3072 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a
- * pointer to input data that is exactly 3072 bytes in length. Note: this function does NOT invert bits of the input crc
- * or return value.
- */
-static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) {
-    __asm__ __volatile__(
-        "enter_3072_%=:"
-
-        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
-        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"
-
-        "movl $16, %%r8d # Loop 16 times through 64 byte chunks in 3 parallel stripes \n"
-
-        "loop_3072_%=:"
-
-        "prefetcht0 128(%[in]) # \n"
-        "prefetcht0 1152(%[in]) # \n"
-        "prefetcht0 2176(%[in]) # \n"
-
-        "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n"
-        "crc32q 1024(%[in]), %%r11 # crc1: stripe1 \n"
-        "crc32q 2048(%[in]), %%r10 # crc2: stripe2 \n"
-
-        "crc32q 8(%[in]), %%rcx # crc0: stripe0 \n"
-        "crc32q 1032(%[in]), %%r11 # crc1: stripe1 \n"
-        "crc32q 2056(%[in]), %%r10 # crc2: stripe2 \n"
-
-        "crc32q 16(%[in]), %%rcx # crc0: stripe0 \n"
-        "crc32q 1040(%[in]), %%r11 # crc1: stripe1 \n"
-        "crc32q 2064(%[in]), %%r10 # crc2: stripe2 \n"
-
-        "crc32q 24(%[in]), %%rcx # crc0: stripe0 \n"
-        "crc32q 1048(%[in]), %%r11 # crc1: stripe1 \n"
-        "crc32q 2072(%[in]), %%r10 # crc2: stripe2 \n"
-
-        "crc32q 32(%[in]), %%rcx # crc0: stripe0 \n"
-        "crc32q 1056(%[in]), %%r11 # crc1: stripe1 \n"
-        "crc32q 2080(%[in]), %%r10 # crc2: stripe2 \n"
-
-        "crc32q 40(%[in]), %%rcx # crc0: stripe0 \n"
-        "crc32q 1064(%[in]), %%r11 # crc1: stripe1 \n"
-        "crc32q 2088(%[in]), %%r10 # crc2: stripe2 \n"
-
-        "crc32q 48(%[in]), %%rcx # crc0: stripe0 \n"
-        "crc32q 1072(%[in]), %%r11 # crc1: stripe1 \n"
-        "crc32q 2096(%[in]), %%r10 # crc2: stripe2 \n"
-
-        "crc32q 56(%[in]), %%rcx # crc0: stripe0 \n"
-        "crc32q 1080(%[in]), %%r11 # crc1: stripe1 \n"
-        "crc32q 2104(%[in]), %%r10 # crc2: stripe2 \n"
-
-        "add $64, %[in] # \n"
-        "sub $1, %%r8d # \n"
-        "jnz loop_3072_%= # \n"
-
-        FOLD_K1K2(
-            3072,
-            $0xa51b6135,
-            $0x170076fa) /* Magic Constants used to fold crc stripes into ecx
-
-        output registers
-        [crc] is an input and and output so it is marked read/write (i.e. "+c")
-        we clobber the register for [input] (via add instruction) so we must also
-        tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber*/
-        : "+c"(crc), "+d"(input)
-
-        /* input registers
-           the numeric values match the position of the output registers */
-        : [ crc ] "c"(crc), [ in ] "d"(input)
-
-        /* additional clobbered registers
-           "cc" is the flags - we add and sub, so the flags are also clobbered */
-        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
-
-    return crc;
-}
-
-static bool detection_performed = false;
-static bool detected_clmul = false;
-
-/*
- * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and
- * PCLMULQDQ machine instructions (if present).
- * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction.
- * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
- * call.
- */
-uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
-
-    if (AWS_UNLIKELY(!detection_performed)) {
-        detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
-        /* Simply setting the flag true to skip HW detection next time
-           Not using memory barriers since the worst that can
-           happen is a fallback to the non HW accelerated code. */
-        detection_performed = true;
-    }
-
-    uint32_t crc = ~previousCrc32;
-
-    /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
-    if (AWS_UNLIKELY(length < 8)) {
-        while (length-- > 0) {
-            __asm__("loop_small_%=: CRC32B (%[in]), %[crc]" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input));
-            input++;
-        }
-        return ~crc;
-    }
-
-    /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
-    int input_alignment = (unsigned long int)input & 0x7;
-
-    /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
-    int leading = (8 - input_alignment) & 0x7;
-
-    /* reduce the length by the leading unaligned bytes we are about to process */
-    length -= leading;
-
-    /* spin through the leading unaligned input bytes (if any) one-by-one */
-    while (leading-- > 0) {
-        __asm__("loop_leading_%=: CRC32B (%[in]), %[crc]" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input));
-        input++;
-    }
-
-    /* Using likely to keep this code inlined */
-    if (AWS_LIKELY(detected_clmul)) {
-
-        while (AWS_LIKELY(length >= 3072)) {
-            /* Compute crc32c on each block, chaining each crc result */
-            crc = s_crc32c_sse42_clmul_3072(input, crc);
-            input += 3072;
-            length -= 3072;
-        }
-        while (AWS_LIKELY(length >= 1024)) {
-            /* Compute crc32c on each block, chaining each crc result */
-            crc = s_crc32c_sse42_clmul_1024(input, crc);
-            input += 1024;
-            length -= 1024;
-        }
-        while (AWS_LIKELY(length >= 256)) {
-            /* Compute crc32c on each block, chaining each crc result */
-            crc = s_crc32c_sse42_clmul_256(input, crc);
-            input += 256;
-            length -= 256;
-        }
-    }
-
-    /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
-    while (AWS_LIKELY(length >= 8)) {
-        /* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction */
-        __asm__ __volatile__("loop_8_%=: CRC32Q (%[in]), %%rcx" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input));
-        input += 8;
-        length -= 8;
-    }
-
-    /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
-    while (length-- > 0) {
-        __asm__ __volatile__("loop_trailing_%=: CRC32B (%[in]), %[crc]"
-                             : "+c"(crc)
-                             : [ crc ] "c"(crc), [ in ] "r"(input));
-        input++;
-    }
-
-    return ~crc;
-}
-uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
-    return aws_checksums_crc32_sw(input, length, previousCrc32);
-}
-
-# if defined(__clang__)
-# pragma clang diagnostic pop
-# endif
-
-#else
-uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
-    return aws_checksums_crc32_sw(input, length, previousCrc32);
-}
-
-uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
-    return aws_checksums_crc32c_sw(input, length, previousCrc32);
-}
-
-#endif
+/**
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
+ */
+
+#include <aws/checksums/private/crc_priv.h>
+
+#include <aws/common/cpuid.h>
+
+/*this implementation is only for 64 bit arch and (if on GCC, release mode).
+ * If using clang, this will run for both debug and release.*/
+#if defined(__x86_64__) && \
+    (defined(__clang__) || !((defined(__GNUC__)) && ((__GNUC__ == 4 && __GNUC_MINOR__ < 4) || defined(DEBUG_BUILD))))
+
+# if defined(__clang__)
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wdollar-in-identifier-extension"
+# endif
+
+/*
+ * Factored out common inline asm for folding crc0,crc1,crc2 stripes in rcx, r11, r10 using
+ * the specified Magic Constants K1 and K2.
+ * Assumes rcx, r11, r10 contain crc0, crc1, crc2 that need folding
+ * Utilizes xmm1, xmm2, xmm3, xmm4 as well as clobbering r8, r9, r11
+ * Result is placed in ecx
+ */
+# define FOLD_K1K2(NAME, K1, K2) \
+    "fold_k1k2_" #NAME "_%=: \n" \
+    "movl " #K1 ", %%r8d # Magic K1 constant \n" \
+    "movl " #K2 ", %%r9d # Magic K2 constant \n" \
+    "movq %%rcx, %%xmm1 # crc0 into lower dword of xmm1 \n" \
+    "movq %%r8, %%xmm3 # K1 into lower dword of xmm3 \n" \
+    "movq %%r11, %%xmm2 # crc1 into lower dword of xmm2 \n" \
+    "movq %%r9, %%xmm4 # K2 into lower dword of xmm4 \n" \
+    "pclmulqdq $0x00, %%xmm3, %%xmm1 # Multiply crc0 by K1 \n" \
+    "pclmulqdq $0x00, %%xmm4, %%xmm2 # Multiply crc1 by K2 \n" \
+    "xor %%rcx, %%rcx # \n" \
+    "xor %%r11, %%r11 # \n" \
+    "movq %%xmm1, %%r8 # \n" \
+    "movq %%xmm2, %%r9 # \n" \
+    "crc32q %%r8, %%rcx # folding crc0 \n" \
+    "crc32q %%r9, %%r11 # folding crc1 \n" \
+    "xor %%r10d, %%ecx # combine crc2 and crc0 \n" \
+    "xor %%r11d, %%ecx # combine crc1 and crc0 \n"
+
+/**
+ * Private (static) function.
+ * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
+ * instruction by operating on 24-byte stripes in parallel. The results are folded together using CLMUL. This function
+ * is optimized for exactly 256 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a
+ * pointer to input data that is exactly 256 bytes in length. Note: this function does NOT invert bits of the input crc
+ * or return value.
+ */
+static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) {
+    __asm__ __volatile__(
+        "enter_256_%=:"
+
+        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
+        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"
+
+        "crc32q 0(%[in]), %%rcx # crc0 \n"
+        "crc32q 88(%[in]), %%r11 # crc1 \n"
+        "crc32q 176(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 8(%[in]), %%rcx # crc0 \n"
+        "crc32q 96(%[in]), %%r11 # crc1 \n"
+        "crc32q 184(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 16(%[in]), %%rcx # crc0 \n"
+        "crc32q 104(%[in]), %%r11 # crc1 \n"
+        "crc32q 192(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 24(%[in]), %%rcx # crc0 \n"
+        "crc32q 112(%[in]), %%r11 # crc1 \n"
+        "crc32q 200(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 32(%[in]), %%rcx # crc0 \n"
+        "crc32q 120(%[in]), %%r11 # crc1 \n"
+        "crc32q 208(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 40(%[in]), %%rcx # crc0 \n"
+        "crc32q 128(%[in]), %%r11 # crc1 \n"
+        "crc32q 216(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 48(%[in]), %%rcx # crc0 \n"
+        "crc32q 136(%[in]), %%r11 # crc1 \n"
+        "crc32q 224(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 56(%[in]), %%rcx # crc0 \n"
+        "crc32q 144(%[in]), %%r11 # crc1 \n"
+        "crc32q 232(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 64(%[in]), %%rcx # crc0 \n"
+        "crc32q 152(%[in]), %%r11 # crc1 \n"
+        "crc32q 240(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 72(%[in]), %%rcx # crc0 \n"
+        "crc32q 160(%[in]), %%r11 # crc1 \n"
+        "crc32q 248(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 80(%[in]), %%rcx # crc0 \n"
+        "crc32q 168(%[in]), %%r11 # crc2 \n"
+
+        FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */
+
+        /* output registers
+           [crc] is an input and and output so it is marked read/write (i.e. "+c")*/
+        : "+c"(crc)
+
+        /* input registers */
+        : [ crc ] "c"(crc), [ in ] "d"(input)
+
+        /* additional clobbered registers */
+        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
+    return crc;
+}
+
+/**
+ * Private (static) function.
+ * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
+ * instruction by operating on 3 24-byte stripes in parallel. The results are folded together using CLMUL. This function
+ * is optimized for exactly 1024 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a
+ * pointer to input data that is exactly 1024 bytes in length. Note: this function does NOT invert bits of the input crc
+ * or return value.
+ */
+static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) {
+    __asm__ __volatile__(
+        "enter_1024_%=:"
+
+        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
+        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"
+
+        "movl $5, %%r8d # Loop 5 times through 64 byte chunks in 3 parallel stripes \n"
+
+        "loop_1024_%=:"
+
+        "prefetcht0 128(%[in]) # \n"
+        "prefetcht0 472(%[in]) # \n"
+        "prefetcht0 808(%[in]) # \n"
+
+        "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n"
+        "crc32q 344(%[in]), %%r11 # crc1: stripe1 \n"
+        "crc32q 680(%[in]), %%r10 # crc2: stripe2 \n"
+
+        "crc32q 8(%[in]), %%rcx # crc0 \n"
+        "crc32q 352(%[in]), %%r11 # crc1 \n"
+        "crc32q 688(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 16(%[in]), %%rcx # crc0 \n"
+        "crc32q 360(%[in]), %%r11 # crc1 \n"
+        "crc32q 696(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 24(%[in]), %%rcx # crc0 \n"
+        "crc32q 368(%[in]), %%r11 # crc1 \n"
+        "crc32q 704(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 32(%[in]), %%rcx # crc0 \n"
+        "crc32q 376(%[in]), %%r11 # crc1 \n"
+        "crc32q 712(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 40(%[in]), %%rcx # crc0 \n"
+        "crc32q 384(%[in]), %%r11 # crc1 \n"
+        "crc32q 720(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 48(%[in]), %%rcx # crc0 \n"
+        "crc32q 392(%[in]), %%r11 # crc1 \n"
+        "crc32q 728(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 56(%[in]), %%rcx # crc0 \n"
+        "crc32q 400(%[in]), %%r11 # crc1 \n"
+        "crc32q 736(%[in]), %%r10 # crc2 \n"
+
+        "add $64, %[in] # \n"
+        "sub $1, %%r8d # \n"
+        "jnz loop_1024_%= # \n"
+
+        "crc32q 0(%[in]), %%rcx # crc0 \n"
+        "crc32q 344(%[in]), %%r11 # crc1 \n"
+        "crc32q 680(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 8(%[in]), %%rcx # crc0 \n"
+        "crc32q 352(%[in]), %%r11 # crc1 \n"
+        "crc32q 688(%[in]), %%r10 # crc2 \n"
+
+        "crc32q 16(%[in]), %%rcx # crc0 \n"
+        "crc32q 696(%[in]), %%r10 # crc2 \n"
+
+        FOLD_K1K2(1024, $0xe417f38a, $0x8f158014) /* Magic Constants used to fold crc stripes into ecx
+
+        output registers
+        [crc] is an input and and output so it is marked read/write (i.e. "+c")
+        we clobber the register for [input] (via add instruction) so we must also
+        tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber */
+        : "+c"(crc), "+d"(input)
+
+        /* input registers */
+        /* the numeric values match the position of the output registers */
+        : [ crc ] "c"(crc), [ in ] "d"(input)
+
+        /* additional clobbered registers */
+        /* "cc" is the flags - we add and sub, so the flags are also clobbered */
+        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
+    return crc;
+}
+
+/**
+ * Private (static) function.
+ * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
+ * instruction by operating on 24-byte stripes in parallel. The results are folded together using CLMUL. This function
+ * is optimized for exactly 3072 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a
+ * pointer to input data that is exactly 3072 bytes in length. Note: this function does NOT invert bits of the input crc
+ * or return value.
+ */
+static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) {
+    __asm__ __volatile__(
+        "enter_3072_%=:"
+
+        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
+        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"
+
+        "movl $16, %%r8d # Loop 16 times through 64 byte chunks in 3 parallel stripes \n"
+
+        "loop_3072_%=:"
+
+        "prefetcht0 128(%[in]) # \n"
+        "prefetcht0 1152(%[in]) # \n"
+        "prefetcht0 2176(%[in]) # \n"
+
+        "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n"
+        "crc32q 1024(%[in]), %%r11 # crc1: stripe1 \n"
+        "crc32q 2048(%[in]), %%r10 # crc2: stripe2 \n"
+
+        "crc32q 8(%[in]), %%rcx # crc0: stripe0 \n"
+        "crc32q 1032(%[in]), %%r11 # crc1: stripe1 \n"
+        "crc32q 2056(%[in]), %%r10 # crc2: stripe2 \n"
+
+        "crc32q 16(%[in]), %%rcx # crc0: stripe0 \n"
+        "crc32q 1040(%[in]), %%r11 # crc1: stripe1 \n"
+        "crc32q 2064(%[in]), %%r10 # crc2: stripe2 \n"
+
+        "crc32q 24(%[in]), %%rcx # crc0: stripe0 \n"
+        "crc32q 1048(%[in]), %%r11 # crc1: stripe1 \n"
+        "crc32q 2072(%[in]), %%r10 # crc2: stripe2 \n"
+
+        "crc32q 32(%[in]), %%rcx # crc0: stripe0 \n"
+        "crc32q 1056(%[in]), %%r11 # crc1: stripe1 \n"
+        "crc32q 2080(%[in]), %%r10 # crc2: stripe2 \n"
+
+        "crc32q 40(%[in]), %%rcx # crc0: stripe0 \n"
+        "crc32q 1064(%[in]), %%r11 # crc1: stripe1 \n"
+        "crc32q 2088(%[in]), %%r10 # crc2: stripe2 \n"
+
+        "crc32q 48(%[in]), %%rcx # crc0: stripe0 \n"
+        "crc32q 1072(%[in]), %%r11 # crc1: stripe1 \n"
+        "crc32q 2096(%[in]), %%r10 # crc2: stripe2 \n"
+
+        "crc32q 56(%[in]), %%rcx # crc0: stripe0 \n"
+        "crc32q 1080(%[in]), %%r11 # crc1: stripe1 \n"
+        "crc32q 2104(%[in]), %%r10 # crc2: stripe2 \n"
+
+        "add $64, %[in] # \n"
+        "sub $1, %%r8d # \n"
+        "jnz loop_3072_%= # \n"
+
+        FOLD_K1K2(
+            3072,
+            $0xa51b6135,
+            $0x170076fa) /* Magic Constants used to fold crc stripes into ecx
+
+        output registers
+        [crc] is an input and and output so it is marked read/write (i.e. "+c")
+        we clobber the register for [input] (via add instruction) so we must also
+        tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber*/
+        : "+c"(crc), "+d"(input)
+
+        /* input registers
+           the numeric values match the position of the output registers */
+        : [ crc ] "c"(crc), [ in ] "d"(input)
+
+        /* additional clobbered registers
+           "cc" is the flags - we add and sub, so the flags are also clobbered */
+        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
+
+    return crc;
+}
+
+static bool detection_performed = false;
+static bool detected_clmul = false;
+
+/*
+ * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and
+ * PCLMULQDQ machine instructions (if present).
+ * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction.
+ * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
+ * call.
+ */
+uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
+
+    if (AWS_UNLIKELY(!detection_performed)) {
+        detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
+        /* Simply setting the flag true to skip HW detection next time
+           Not using memory barriers since the worst that can
+           happen is a fallback to the non HW accelerated code. */
+        detection_performed = true;
+    }
+
+    uint32_t crc = ~previousCrc32;
+
+    /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
+    if (AWS_UNLIKELY(length < 8)) {
+        while (length-- > 0) {
+            __asm__("loop_small_%=: CRC32B (%[in]), %[crc]" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input));
+            input++;
+        }
+        return ~crc;
+    }
+
+    /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
+    int input_alignment = (unsigned long int)input & 0x7;
+
+    /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
+    int leading = (8 - input_alignment) & 0x7;
+
+    /* reduce the length by the leading unaligned bytes we are about to process */
+    length -= leading;
+
+    /* spin through the leading unaligned input bytes (if any) one-by-one */
+    while (leading-- > 0) {
+        __asm__("loop_leading_%=: CRC32B (%[in]), %[crc]" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input));
+        input++;
+    }
+
+    /* Using likely to keep this code inlined */
+    if (AWS_LIKELY(detected_clmul)) {
+
+        while (AWS_LIKELY(length >= 3072)) {
+            /* Compute crc32c on each block, chaining each crc result */
+            crc = s_crc32c_sse42_clmul_3072(input, crc);
+            input += 3072;
+            length -= 3072;
+        }
+        while (AWS_LIKELY(length >= 1024)) {
+            /* Compute crc32c on each block, chaining each crc result */
+            crc = s_crc32c_sse42_clmul_1024(input, crc);
+            input += 1024;
+            length -= 1024;
+        }
+        while (AWS_LIKELY(length >= 256)) {
+            /* Compute crc32c on each block, chaining each crc result */
+            crc = s_crc32c_sse42_clmul_256(input, crc);
+            input += 256;
+            length -= 256;
+        }
+    }
+
+    /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
+    while (AWS_LIKELY(length >= 8)) {
+        /* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction */
+        __asm__ __volatile__("loop_8_%=: CRC32Q (%[in]), %%rcx" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input));
+        input += 8;
+        length -= 8;
+    }
+
+    /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
+    while (length-- > 0) {
+        __asm__ __volatile__("loop_trailing_%=: CRC32B (%[in]), %[crc]"
+                             : "+c"(crc)
+                             : [ crc ] "c"(crc), [ in ] "r"(input));
+        input++;
+    }
+
+    return ~crc;
+}
+uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
+    return aws_checksums_crc32_sw(input, length, previousCrc32);
+}
+
+# if defined(__clang__)
+# pragma clang diagnostic pop
+# endif
+
+#else
+uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
+    return aws_checksums_crc32_sw(input, length, previousCrc32);
+}
+
+uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
+    return aws_checksums_crc32c_sw(input, length, previousCrc32);
+}
+
+#endif
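The overall shape of aws_checksums_crc32c_hw above — peel unaligned leading bytes with CRC32B, chew through large blocks with the three-stripe CLMUL kernels, finish aligned 8-byte words with CRC32Q, then handle trailing bytes — can also be written with SSE4.2 intrinsics instead of inline asm. A simplified sketch (it omits the 3072/1024/256-byte folding kernels, so it gives up their instruction-level parallelism; compile with -msse4.2):

    /* Same peel/bulk/tail structure as aws_checksums_crc32c_hw, illustrative only. */
    #include <nmmintrin.h> /* _mm_crc32_u8 / _mm_crc32_u64 (SSE4.2) */
    #include <stdint.h>
    #include <string.h>

    uint32_t crc32c_hw_sketch(const uint8_t *in, int len, uint32_t prev) {
        uint64_t crc = (uint32_t)~prev; /* same pre-inversion; keep upper bits clear */

        /* Peel leading bytes until `in` is 8-byte aligned (at most 7, CRC32B). */
        while (len > 0 && ((uintptr_t)in & 0x7) != 0) {
            crc = _mm_crc32_u8((uint32_t)crc, *in++);
            len--;
        }

        /* Bulk: one CRC32Q per aligned 8-byte word. */
        while (len >= 8) {
            uint64_t word;
            memcpy(&word, in, sizeof(word)); /* aligned by the peel loop */
            crc = _mm_crc32_u64(crc, word);
            in += 8;
            len -= 8;
        }

        /* Tail: remaining bytes one at a time (CRC32B). */
        while (len-- > 0) {
            crc = _mm_crc32_u8((uint32_t)crc, *in++);
        }

        return ~(uint32_t)crc;
    }

The peel loop is equivalent to the (8 - ((uintptr_t)in & 0x7)) & 0x7 leading-byte arithmetic in the asm version, just expressed as a per-byte alignment test.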
"+c") to allow use of qword instruction */ + __asm__ __volatile__("loop_8_%=: CRC32Q (%[in]), %%rcx" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input)); + input += 8; + length -= 8; + } + + /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ + while (length-- > 0) { + __asm__ __volatile__("loop_trailing_%=: CRC32B (%[in]), %[crc]" + : "+c"(crc) + : [ crc ] "c"(crc), [ in ] "r"(input)); + input++; + } + + return ~crc; +} +uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { + return aws_checksums_crc32_sw(input, length, previousCrc32); +} + +# if defined(__clang__) +# pragma clang diagnostic pop +# endif + +#else +uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { + return aws_checksums_crc32_sw(input, length, previousCrc32); +} + +uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { + return aws_checksums_crc32c_sw(input, length, previousCrc32); +} + +#endif diff --git a/contrib/restricted/aws/aws-checksums/ya.make b/contrib/restricted/aws/aws-checksums/ya.make index a08a095ee1..5833068700 100644 --- a/contrib/restricted/aws/aws-checksums/ya.make +++ b/contrib/restricted/aws/aws-checksums/ya.make @@ -2,44 +2,44 @@ LIBRARY() -OWNER(g:cpp-contrib) +OWNER(g:cpp-contrib) -VERSION(0.1.10) +VERSION(0.1.10) LICENSE(Apache-2.0) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -PEERDIR( - contrib/restricted/aws/aws-c-common -) - +PEERDIR( + contrib/restricted/aws/aws-c-common +) + ADDINCL( - GLOBAL contrib/restricted/aws/aws-checksums/include + GLOBAL contrib/restricted/aws/aws-checksums/include ) NO_COMPILER_WARNINGS() -NO_RUNTIME() - -CFLAGS( - -DAWS_CHECKSUMS_EXPORTS - -DAWS_CHECKSUMS_USE_IMPORT_EXPORT - -DAWS_COMMON_USE_IMPORT_EXPORT - -DHAVE_SYSCONF - -Daws_checksums_EXPORTS -) +NO_RUNTIME() +CFLAGS( + -DAWS_CHECKSUMS_EXPORTS + -DAWS_CHECKSUMS_USE_IMPORT_EXPORT + -DAWS_COMMON_USE_IMPORT_EXPORT + -DHAVE_SYSCONF + -Daws_checksums_EXPORTS +) + IF (BUILD_TYPE == "DEBUG") - CFLAGS( - -DDEBUG_BUILD - ) -ENDIF() - + CFLAGS( + -DDEBUG_BUILD + ) +ENDIF() + SRCS( source/crc.c source/crc_sw.c - source/intel/asm/crc32c_sse42_asm.c + source/intel/asm/crc32c_sse42_asm.c ) END() |