/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <aws/checksums/private/crc64_priv.h>
#include <aws/common/assert.h>
// msvc compilers older than 2019 are missing some intrinsics. Gate those off.
#if defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && !(defined(_MSC_VER) && _MSC_VER < 1920)
#    include <emmintrin.h>
#    include <immintrin.h>
#    include <smmintrin.h>
#    include <wmmintrin.h>

#    define load_xmm(ptr) _mm_loadu_si128((const __m128i *)(const void *)(ptr))

#    define mask_high_bytes(xmm, count)                                                                               \
        _mm_and_si128((xmm), load_xmm(aws_checksums_masks_shifts[3] + (intptr_t)(count)))

#    define cmull_xmm_hi(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x11)
#    define cmull_xmm_lo(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x00)
#    define cmull_xmm_pair(xmm1, xmm2) _mm_xor_si128(cmull_xmm_hi((xmm1), (xmm2)), cmull_xmm_lo((xmm1), (xmm2)))
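// cmull_xmm_pair carry-less multiplies the high 64-bit lanes of its two operands and the low 64-bit lanes,
// then XORs the two 128-bit products. With one operand holding a pair of precomputed folding constants
// (x^N mod P in one half and x^(N + 64) mod P in the other), this folds a 128-bit accumulator forward by
// N bits so it can be combined with input that many bits further along.
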
uint64_t aws_checksums_crc64nvme_intel_clmul(const uint8_t *input, int length, uint64_t previous_crc64) {
    // The complexity of handling memory regions smaller than an xmm register with vector instructions
    // does not justify the negligible performance gain on inputs this small, so fall back to the
    // software implementation.
    if (length < 16) {
        return aws_checksums_crc64nvme_sw(input, length, previous_crc64);
    }
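
    // CRC64-NVME uses an all-ones initial value and an all-ones final XOR: inverting the caller's previous
    // CRC (0 for a fresh computation) recovers the raw register state, and the result is inverted again on return.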
    // Invert the previous crc bits and load into the lower half of an xmm register
    __m128i a1 = _mm_cvtsi64_si128((int64_t)(~previous_crc64));

    // There are 16 or more bytes of input - load the first 16 bytes and XOR with the previous crc
    a1 = _mm_xor_si128(a1, load_xmm(input));
    input += 16;
    length -= 16;

    // Load the folding constants x^128 and x^192
    const __m128i x128 = load_xmm(aws_checksums_crc64nvme_constants.x128);

    if (length >= 48) {
        // Load the next 48 bytes
        __m128i b1 = load_xmm(input + 0x00);
        __m128i c1 = load_xmm(input + 0x10);
        __m128i d1 = load_xmm(input + 0x20);
        input += 48;
        length -= 48;

        // Load the folding constants x^512 and x^576
        const __m128i x512 = load_xmm(aws_checksums_crc64nvme_constants.x512);

        if (length >= 64) {
            // Load the next 64 bytes
            __m128i e1 = load_xmm(input + 0x00);
            __m128i f1 = load_xmm(input + 0x10);
            __m128i g1 = load_xmm(input + 0x20);
            __m128i h1 = load_xmm(input + 0x30);
            input += 64;
            length -= 64;

            // Load the folding constants x^1024 and x^1088
            const __m128i x1024 = load_xmm(aws_checksums_crc64nvme_constants.x1024);

            // Spin through 128 bytes and fold in parallel
            int loops = length / 128;
            length &= 127;
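
            // Each pass folds all eight accumulators forward by 1024 bits (128 bytes, via the x^1024/x^1088
            // constants) and XORs in the next 128 bytes of input, keeping eight independent carry-less
            // multiply chains in flight per iteration.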
            while (loops--) {
                a1 = _mm_xor_si128(cmull_xmm_pair(x1024, a1), load_xmm(input + 0x00));
                b1 = _mm_xor_si128(cmull_xmm_pair(x1024, b1), load_xmm(input + 0x10));
                c1 = _mm_xor_si128(cmull_xmm_pair(x1024, c1), load_xmm(input + 0x20));
                d1 = _mm_xor_si128(cmull_xmm_pair(x1024, d1), load_xmm(input + 0x30));
                e1 = _mm_xor_si128(cmull_xmm_pair(x1024, e1), load_xmm(input + 0x40));
                f1 = _mm_xor_si128(cmull_xmm_pair(x1024, f1), load_xmm(input + 0x50));
                g1 = _mm_xor_si128(cmull_xmm_pair(x1024, g1), load_xmm(input + 0x60));
                h1 = _mm_xor_si128(cmull_xmm_pair(x1024, h1), load_xmm(input + 0x70));
                input += 128;
            }

            // Fold 128 to 64 bytes - e1 through h1 fold into a1 through d1
            a1 = _mm_xor_si128(cmull_xmm_pair(x512, a1), e1);
            b1 = _mm_xor_si128(cmull_xmm_pair(x512, b1), f1);
            c1 = _mm_xor_si128(cmull_xmm_pair(x512, c1), g1);
            d1 = _mm_xor_si128(cmull_xmm_pair(x512, d1), h1);
        }

        if (length & 64) {
            a1 = _mm_xor_si128(cmull_xmm_pair(x512, a1), load_xmm(input + 0x00));
            b1 = _mm_xor_si128(cmull_xmm_pair(x512, b1), load_xmm(input + 0x10));
            c1 = _mm_xor_si128(cmull_xmm_pair(x512, c1), load_xmm(input + 0x20));
            d1 = _mm_xor_si128(cmull_xmm_pair(x512, d1), load_xmm(input + 0x30));
            input += 64;
        }
        length &= 63;

        // Load the x^256, x^320, x^384, and x^448 constants
        const __m128i x384 = load_xmm(aws_checksums_crc64nvme_constants.x384);
        const __m128i x256 = load_xmm(aws_checksums_crc64nvme_constants.x256);

        // Fold 64 bytes to 16 bytes
        a1 = _mm_xor_si128(d1, cmull_xmm_pair(x384, a1));
        a1 = _mm_xor_si128(a1, cmull_xmm_pair(x256, b1));
        a1 = _mm_xor_si128(a1, cmull_xmm_pair(x128, c1));
    }

    // Process any remaining chunks of 16 bytes
    int loops = length / 16;
    while (loops--) {
        a1 = _mm_xor_si128(cmull_xmm_pair(a1, x128), load_xmm(input));
        input += 16;
    }

    // The remaining length can be only 0-15 bytes
    length &= 15;
    if (length == 0) {
        // Multiply the lower half of the crc register by x^128 (it's in the upper half)
        __m128i mul_by_x128 = _mm_clmulepi64_si128(a1, x128, 0x10);
        // XOR the result with the upper half of the crc
        a1 = _mm_xor_si128(_mm_bsrli_si128(a1, 8), mul_by_x128);
    } else { // Handle any trailing input from 1-15 bytes
        // Multiply the crc by a pair of trailing length constants in order to fold it into the trailing input
        a1 = cmull_xmm_pair(a1, load_xmm(aws_checksums_crc64nvme_constants.trailing[length - 1]));
        // Safely load (ending at the trailing input) and mask out any leading garbage
        __m128i trailing_input = mask_high_bytes(load_xmm(input + length - 16), length);
        // Multiply the lower half of the trailing input register by x^128 (it's in the upper half)
        __m128i mul_by_x128 = _mm_clmulepi64_si128(trailing_input, x128, 0x10);
        // XOR the results with the upper half of the trailing input
        a1 = _mm_xor_si128(a1, _mm_bsrli_si128(trailing_input, 8));
        a1 = _mm_xor_si128(a1, mul_by_x128);
    }

    // Barrett modular reduction
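    // Two carry-less multiplies, first by the precomputed constant mu (the lower half of mu_poly) and then
    // by the polynomial (the upper half), reduce the 128-bit value in a1 to the final 64-bit remainder
    // without an explicit polynomial division.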
    const __m128i mu_poly = load_xmm(aws_checksums_crc64nvme_constants.mu_poly);
    // Multiply the lower half of the crc (a1) by mu
    __m128i mul_by_mu = _mm_clmulepi64_si128(mu_poly, a1, 0x00);
    // Multiply the lower half of the mul_by_mu result by poly (it's in the upper half)
    __m128i mul_by_poly = _mm_clmulepi64_si128(mu_poly, mul_by_mu, 0x01);
    // Left shift mul_by_mu to get the low half into the upper half and XOR all the upper halves
    __m128i reduced = _mm_xor_si128(_mm_xor_si128(a1, _mm_bslli_si128(mul_by_mu, 8)), mul_by_poly);
    // After the XORs, the CRC falls in the upper half of the register - invert the bits before returning the crc
    return ~(uint64_t)_mm_extract_epi64(reduced, 1);
}
#endif /* defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && !(defined(_MSC_VER) && _MSC_VER < 1920) */