| field | value | date |
|---|---|---|
| author | orivej <[email protected]> | 2022-02-10 16:45:01 +0300 |
| committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:45:01 +0300 |
| commit | 2d37894b1b037cf24231090eda8589bbb44fb6fc | |
| tree | be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/restricted/aws/aws-c-common/source/arch/intel | |
| parent | 718c552901d703c502ccbefdfc3c9028d608b947 | |
Restoring authorship annotation for <[email protected]>. Commit 2 of 2.
Diffstat (limited to 'contrib/restricted/aws/aws-c-common/source/arch/intel')

| file | insertions | deletions |
|---|---|---|
| contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c | 29 | 29 |
| contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c | 103 | 103 |
| contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c | 384 | 384 |

3 files changed, 516 insertions, 516 deletions
diff --git a/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c b/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c
index 07970177799..d2ceab01060 100644
--- a/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c
+++ b/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c
@@ -1,29 +1,29 @@
(the commit deletes and re-adds every line unchanged, so the file content is shown once below)

/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <aws/common/cpuid.h>

void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd) {
    uint32_t ebx = 0;
    uint32_t edx = 0;

#if defined(__i386__) && defined(__PIC__)
    /* in case of PIC under 32-bit EBX cannot be clobbered */
    __asm__ __volatile__("movl %%ebx, %%edi \n\t "
                         "cpuid \n\t "
                         "xchgl %%ebx, %%edi"
                         : "=D"(ebx),
#else
    __asm__ __volatile__("cpuid"
                         : "+b"(ebx),
#endif
                           "+a"(eax),
                           "+c"(ecx),
                           "=d"(edx));
    abcd[0] = eax;
    abcd[1] = ebx;
    abcd[2] = ecx;
    abcd[3] = edx;
}
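As a usage illustration (not part of the diff), the wrapper above can be called directly. A minimal sketch, assuming the file above is linked in: leaf 0 reports the highest supported basic leaf in EAX and the vendor string in EBX, EDX, ECX, and the output array is filled in EAX, EBX, ECX, EDX order as shown above.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Declared in the file above. */
extern void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd);

int main(void) {
    uint32_t abcd[4]; /* filled as EAX, EBX, ECX, EDX */
    char vendor[13] = {0};

    aws_run_cpuid(0, 0, abcd);
    memcpy(vendor + 0, &abcd[1], 4); /* EBX */
    memcpy(vendor + 4, &abcd[3], 4); /* EDX */
    memcpy(vendor + 8, &abcd[2], 4); /* ECX */

    printf("max basic leaf: %u, vendor: %s\n", abcd[0], vendor);
    return 0;
}
```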
diff --git a/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c b/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c
index ddd9640c620..6385c146fb0 100644
--- a/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c
+++ b/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c
@@ -1,103 +1,103 @@
(likewise, all 103 lines are deleted and re-added unchanged; the content is shown once below)

/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

/*
 * MSVC wants us to use the non-portable _dupenv_s instead; since we need
 * to remain portable, tell MSVC to suppress this warning.
 */
#define _CRT_SECURE_NO_WARNINGS

#include <aws/common/cpuid.h>
#include <stdlib.h>

extern void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd);

typedef bool(has_feature_fn)(void);

static bool s_has_clmul(void) {
    uint32_t abcd[4];
    uint32_t clmul_mask = 0x00000002;
    aws_run_cpuid(1, 0, abcd);

    if ((abcd[2] & clmul_mask) != clmul_mask)
        return false;

    return true;
}

static bool s_has_sse41(void) {
    uint32_t abcd[4];
    uint32_t sse41_mask = 0x00080000;
    aws_run_cpuid(1, 0, abcd);

    if ((abcd[2] & sse41_mask) != sse41_mask)
        return false;

    return true;
}

static bool s_has_sse42(void) {
    uint32_t abcd[4];
    uint32_t sse42_mask = 0x00100000;
    aws_run_cpuid(1, 0, abcd);

    if ((abcd[2] & sse42_mask) != sse42_mask)
        return false;

    return true;
}

static bool s_has_avx2(void) {
    uint32_t abcd[4];
    uint32_t avx2_bmi12_mask = (1 << 5) | (1 << 3) | (1 << 8);
    /* CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1 &&
       CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 &&
       CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]==1 */
    aws_run_cpuid(7, 0, abcd);

    if ((abcd[1] & avx2_bmi12_mask) != avx2_bmi12_mask)
        return false;

    return true;
}
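The three leaf-1 checks above all follow the same pattern: run CPUID with EAX=1 and test a single bit of the returned ECX (bit 1 = PCLMULQDQ, bit 19 = SSE4.1, bit 20 = SSE4.2). A minimal sketch of that shared pattern; the helper name is hypothetical and not part of aws-c-common.

```c
#include <stdbool.h>
#include <stdint.h>

extern void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd);

/* Hypothetical helper: test one bit of leaf 1's ECX result. */
static bool s_has_leaf1_ecx_bit(uint32_t bit) {
    uint32_t abcd[4]; /* EAX, EBX, ECX, EDX */
    aws_run_cpuid(1, 0, abcd);
    return (abcd[2] & (UINT32_C(1) << bit)) != 0;
}

/* Equivalent to the masks used above:
 *   PCLMULQDQ: bit 1  -> 0x00000002 (clmul_mask)
 *   SSE4.1:    bit 19 -> 0x00080000 (sse41_mask)
 *   SSE4.2:    bit 20 -> 0x00100000 (sse42_mask)
 */
```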
has_feature_fn *s_check_cpu_feature[AWS_CPU_FEATURE_COUNT] = {
    [AWS_CPU_FEATURE_CLMUL] = s_has_clmul,
    [AWS_CPU_FEATURE_SSE_4_1] = s_has_sse41,
    [AWS_CPU_FEATURE_SSE_4_2] = s_has_sse42,
    [AWS_CPU_FEATURE_AVX2] = s_has_avx2,
};

bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {
    if (s_check_cpu_feature[feature_name])
        return s_check_cpu_feature[feature_name]();
    return false;
}

#define CPUID_AVAILABLE 0
#define CPUID_UNAVAILABLE 1
static int cpuid_state = 2;

bool aws_common_private_has_avx2(void) {
    if (AWS_LIKELY(cpuid_state == 0)) {
        return true;
    }
    if (AWS_LIKELY(cpuid_state == 1)) {
        return false;
    }

    /* Provide a hook for testing fallbacks and benchmarking */
    const char *env_avx2_enabled = getenv("AWS_COMMON_AVX2");
    if (env_avx2_enabled) {
        int is_enabled = atoi(env_avx2_enabled);
        cpuid_state = !is_enabled;
        return is_enabled;
    }

    bool available = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2);
    cpuid_state = available ? CPUID_AVAILABLE : CPUID_UNAVAILABLE;

    return available;
}
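The remainder of the file wires those checks into a dispatch table and a cached AVX2 query; setting the AWS_COMMON_AVX2 environment variable (read via getenv above) forces the result, which is useful for exercising the scalar fallbacks. A small probe program, assuming the aws-c-common headers and library are available at build time (a sketch, not part of the diff):

```c
#include <aws/common/cpuid.h>
#include <stdio.h>

int main(void) {
    /* aws_cpu_has_feature() dispatches through s_check_cpu_feature above. */
    printf("CLMUL:   %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL));
    printf("SSE 4.1: %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_1));
    printf("SSE 4.2: %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2));
    printf("AVX2:    %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2));
    return 0;
}
```

Running such a program with AWS_COMMON_AVX2=0 in the environment makes aws_common_private_has_avx2() report false on the first call and caches that answer, per the getenv hook above.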
diff --git a/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c b/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c
index d4b580f24af..ebae8613810 100644
--- a/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c
+++ b/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c
@@ -1,384 +1,384 @@
(likewise, all 384 lines are deleted and re-added unchanged; the content is shown once below)

/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <emmintrin.h>
#include <immintrin.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <aws/common/common.h>

/***** Decode logic *****/

/*
 * Translates ranges of bytes in place.
 * For each byte of 'in' that is between lo and hi (inclusive), the corresponding
 * result lane holds (byte - lo + offset); every other lane becomes zero.
 */
static inline __m256i translate_range(__m256i in, uint8_t lo, uint8_t hi, uint8_t offset) {
    __m256i lovec = _mm256_set1_epi8(lo);
    __m256i hivec = _mm256_set1_epi8((char)(hi - lo));
    __m256i offsetvec = _mm256_set1_epi8(offset);

    __m256i tmp = _mm256_sub_epi8(in, lovec);
    /*
     * we'll use the unsigned min operator to do our comparison. Note that
     * there's no unsigned compare as a comparison intrinsic.
     */
    __m256i mask = _mm256_min_epu8(tmp, hivec);
    /* if mask = tmp, then keep that byte */
    mask = _mm256_cmpeq_epi8(mask, tmp);

    tmp = _mm256_add_epi8(tmp, offsetvec);
    tmp = _mm256_and_si256(tmp, mask);
    return tmp;
}

/*
 * For each 8-bit element of 'in' that equals match, the corresponding result
 * lane holds the value decode; every other lane becomes zero.
 */
static inline __m256i translate_exact(__m256i in, uint8_t match, uint8_t decode) {
    __m256i mask = _mm256_cmpeq_epi8(in, _mm256_set1_epi8(match));
    return _mm256_and_si256(mask, _mm256_set1_epi8(decode));
}

/*
 * Input: a pointer to a 256-bit vector of base64 characters
 * The pointed-to vector is replaced by a 256-bit vector of 6-bit decoded parts;
 * returns false on decode failure, true on success.
 */
static inline bool decode_vec(__m256i *in) {
    __m256i tmp1, tmp2, tmp3;

    /*
     * Base64 decoding table, see RFC4648
     *
     * Note that we use multiple vector registers to try to allow the CPU to
     * parallelize the merging ORs
     */
    tmp1 = translate_range(*in, 'A', 'Z', 0 + 1);
    tmp2 = translate_range(*in, 'a', 'z', 26 + 1);
    tmp3 = translate_range(*in, '0', '9', 52 + 1);
    tmp1 = _mm256_or_si256(tmp1, translate_exact(*in, '+', 62 + 1));
    tmp2 = _mm256_or_si256(tmp2, translate_exact(*in, '/', 63 + 1));
    tmp3 = _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));

    /*
     * We use 0 to mark decode failures, so everything is decoded to one higher
     * than normal. We'll shift this down now.
     */
    *in = _mm256_sub_epi8(tmp3, _mm256_set1_epi8(1));

    /* If any byte is now zero, we had a decode failure */
    __m256i mask = _mm256_cmpeq_epi8(tmp3, _mm256_set1_epi8(0));
    return _mm256_testz_si256(mask, mask);
}

AWS_ALIGNED_TYPEDEF(uint8_t, aligned256[32], 32);
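The decode table above maps every character to its 6-bit value plus one, so that a zero lane can signal an invalid character; the vector is then shifted back down by one. A scalar sketch of the same trick, for illustration only and not part of aws-c-common:

```c
#include <stdbool.h>
#include <stdint.h>

/* Decode a single base64 character the way decode_vec() does: build the
 * value-plus-one, treat 0 as "invalid", then subtract one at the end. */
static bool scalar_decode_one(uint8_t c, uint8_t *out) {
    uint8_t v = 0; /* 0 = decode failure, mirrors the vector code above */
    if (c >= 'A' && c <= 'Z') v = (uint8_t)(c - 'A' + 0 + 1);
    else if (c >= 'a' && c <= 'z') v = (uint8_t)(c - 'a' + 26 + 1);
    else if (c >= '0' && c <= '9') v = (uint8_t)(c - '0' + 52 + 1);
    else if (c == '+') v = 62 + 1;
    else if (c == '/') v = 63 + 1;

    if (v == 0) {
        return false;
    }
    *out = (uint8_t)(v - 1); /* shift back down, as decode_vec() does */
    return true;
}
```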
/*
 * Input: a 256-bit vector, interpreted as 32 * 6-bit values
 * Output: a 256-bit vector, the lower 24 bytes of which contain the packed version of the input
 */
static inline __m256i pack_vec(__m256i in) {
    /*
     * Our basic strategy is to split the input vector into four vectors, one for each 6-bit
     * component of each 24-bit group, shift the groups into place, then OR the vectors together.
     * Conveniently, we can do this on a (32 bit) dword-by-dword basis.
     *
     * It's important to note that we're interpreting the vector as being little-endian. That is,
     * on entry, we have dwords that look like this:
     *
     * MSB                                 LSB
     * 00DD DDDD 00CC CCCC 00BB BBBB 00AA AAAA
     *
     * And we want to translate to:
     *
     * MSB                                 LSB
     * 0000 0000 AAAA AABB BBBB CCCC CCDD DDDD
     *
     * After which point we can pack these dwords together to produce our final output.
     */
    __m256i maskA = _mm256_set1_epi32(0xFF); // low bits
    __m256i maskB = _mm256_set1_epi32(0xFF00);
    __m256i maskC = _mm256_set1_epi32(0xFF0000);
    __m256i maskD = _mm256_set1_epi32((int)0xFF000000);

    __m256i bitsA = _mm256_slli_epi32(_mm256_and_si256(in, maskA), 18);
    __m256i bitsB = _mm256_slli_epi32(_mm256_and_si256(in, maskB), 4);
    __m256i bitsC = _mm256_srli_epi32(_mm256_and_si256(in, maskC), 10);
    __m256i bitsD = _mm256_srli_epi32(_mm256_and_si256(in, maskD), 24);

    __m256i dwords = _mm256_or_si256(_mm256_or_si256(bitsA, bitsB), _mm256_or_si256(bitsC, bitsD));
    /*
     * Now we have a series of dwords with empty MSBs.
     * We need to pack them together (and shift down) with a shuffle operation.
     * Unfortunately the shuffle operation operates independently within each 128-bit lane,
     * so we'll need to do this in two steps: First we compact dwords within each lane, then
     * we do a dword shuffle to compact the two lanes together.
     *
     * 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 <- byte index (little endian)
     * -- 09 0a 0b -- 06 07 08 -- 03 04 05 -- 00 01 02 <- data index
     *
     * We also reverse the order of 3-byte fragments within each lane; we've constructed
     * those fragments in little endian but the order of fragments within the overall
     * vector is in memory order (big endian)
     */
    const aligned256 shufvec_buf = {
        /* clang-format off */
        /* MSB */
        0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
         2,  1,  0,
         6,  5,  4,
        10,  9,  8,
        14, 13, 12,

        0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
         2,  1,  0,
         6,  5,  4,
        10,  9,  8,
        14, 13, 12
        /* LSB */
        /* clang-format on */
    };
    __m256i shufvec = _mm256_load_si256((__m256i const *)&shufvec_buf);

    dwords = _mm256_shuffle_epi8(dwords, shufvec);
    /*
     * Now shuffle the 32-bit words:
     * A B C 0 D E F 0 -> 0 0 A B C D E F
     */
    __m256i shuf32 = _mm256_set_epi32(0, 0, 7, 6, 5, 3, 2, 1);

    dwords = _mm256_permutevar8x32_epi32(dwords, shuf32);

    return dwords;
}
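Per 32-bit dword, pack_vec performs exactly the shift-and-OR arithmetic described in its comment. A scalar sketch of that single-dword step, using the same masks and shift counts, for illustration only:

```c
#include <stdint.h>

/* Four 6-bit values a, b, c, d stored one per byte (little-endian, so 'a' is
 * the lowest byte) are merged into one 24-bit group laid out as a:b:c:d. */
static uint32_t pack_dword(uint32_t in) {
    uint32_t a = (in & 0x000000FFu) << 18; /* bits  0.. 5 -> bits 18..23 */
    uint32_t b = (in & 0x0000FF00u) << 4;  /* bits  8..13 -> bits 12..17 */
    uint32_t c = (in & 0x00FF0000u) >> 10; /* bits 16..21 -> bits  6..11 */
    uint32_t d = (in & 0xFF000000u) >> 24; /* bits 24..29 -> bits  0.. 5 */
    return a | b | c | d; /* 0000 0000 AAAA AABB BBBB CCCC CCDD DDDD */
}
```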
static inline bool decode(const unsigned char *in, unsigned char *out) {
    __m256i vec = _mm256_loadu_si256((__m256i const *)in);
    if (!decode_vec(&vec)) {
        return false;
    }
    vec = pack_vec(vec);

    /*
     * We'll do overlapping writes to get both the low 128 bits and the high 64 bits written.
     * Input (memory order): 0 1 2 3 4 5 - - (dwords)
     * Input (little endian): - - 5 4 3 2 1 0
     * Output in memory:
     * [0 1 2 3] [4 5]
     */
    __m128i lo = _mm256_extracti128_si256(vec, 0);
    /*
     * Unfortunately some compilers don't support _mm256_extract_epi64,
     * so we'll just copy right out of the vector as a fallback
     */

#ifdef HAVE_MM256_EXTRACT_EPI64
    uint64_t hi = _mm256_extract_epi64(vec, 2);
    const uint64_t *p_hi = &hi;
#else
    const uint64_t *p_hi = (uint64_t *)&vec + 2;
#endif

    _mm_storeu_si128((__m128i *)out, lo);
    memcpy(out + 16, p_hi, sizeof(*p_hi));

    return true;
}

size_t aws_common_private_base64_decode_sse41(const unsigned char *in, unsigned char *out, size_t len) {
    if (len % 4) {
        return (size_t)-1;
    }

    size_t outlen = 0;
    while (len > 32) {
        if (!decode(in, out)) {
            return (size_t)-1;
        }
        len -= 32;
        in += 32;
        out += 24;
        outlen += 24;
    }

    if (len > 0) {
        unsigned char tmp_in[32];
        unsigned char tmp_out[24];

        memset(tmp_out, 0xEE, sizeof(tmp_out));

        /* We need to ensure the vector contains valid b64 characters */
        memset(tmp_in, 'A', sizeof(tmp_in));
        memcpy(tmp_in, in, len);

        size_t final_out = (3 * len) / 4;

        /* Check for end-of-string padding (up to 2 characters) */
        for (int i = 0; i < 2; i++) {
            if (tmp_in[len - 1] == '=') {
                tmp_in[len - 1] = 'A'; /* make sure the inner loop doesn't bail out */
                len--;
                final_out--;
            }
        }

        if (!decode(tmp_in, tmp_out)) {
            return (size_t)-1;
        }

        /* Check that there are no trailing one bits */
        for (size_t i = final_out; i < sizeof(tmp_out); i++) {
            if (tmp_out[i]) {
                return (size_t)-1;
            }
        }

        memcpy(out, tmp_out, final_out);
        outlen += final_out;
    }
    return outlen;
}
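A minimal usage sketch of the decoder above, not part of the library; it assumes this translation unit is linked in and the CPU supports the required instructions:

```c
#include <stdio.h>

extern size_t aws_common_private_base64_decode_sse41(const unsigned char *in, unsigned char *out, size_t len);

int main(void) {
    const unsigned char encoded[] = "aGVsbG8gd29ybGQh"; /* "hello world!" */
    unsigned char decoded[24] = {0};

    /* Input length must be a multiple of 4, per the check above. */
    size_t n = aws_common_private_base64_decode_sse41(encoded, decoded, 16);
    if (n == (size_t)-1) {
        fprintf(stderr, "invalid base64 input\n");
        return 1;
    }
    printf("%zu bytes: %.*s\n", n, (int)n, decoded);
    return 0;
}
```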
/***** Encode logic *****/
static inline __m256i encode_chars(__m256i in) {
    __m256i tmp1, tmp2, tmp3;

    /*
     * Base64 encoding table, see RFC4648
     *
     * We again use fan-in for the ORs here.
     */
    tmp1 = translate_range(in, 0, 25, 'A');
    tmp2 = translate_range(in, 26, 26 + 25, 'a');
    tmp3 = translate_range(in, 52, 61, '0');
    tmp1 = _mm256_or_si256(tmp1, translate_exact(in, 62, '+'));
    tmp2 = _mm256_or_si256(tmp2, translate_exact(in, 63, '/'));

    return _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
}

/*
 * Input: A 256-bit vector, interpreted as 24 bytes (LSB) plus 8 bytes of high-byte padding
 * Output: A 256-bit vector of base64 characters
 */
static inline __m256i encode_stride(__m256i vec) {
    /*
     * First, since byte-shuffle operations operate within 128-bit subvectors, swap around the dwords
     * to balance the amount of actual data between 128-bit subvectors.
     * After this we want the LE representation to look like: -- XX XX XX -- XX XX XX
     */
    __m256i shuf32 = _mm256_set_epi32(7, 5, 4, 3, 6, 2, 1, 0);
    vec = _mm256_permutevar8x32_epi32(vec, shuf32);

    /*
     * Next, within each group of 3 bytes, we need to byteswap into little endian form so our bitshifts
     * will work properly. We also shuffle around so that each dword has one 3-byte group, plus one byte
     * (MSB) of zero-padding.
     * Because this is a byte-shuffle, indexes are within each 128-bit subvector.
     *
     * -- -- -- -- 11 10 09 08 07 06 05 04 03 02 01 00
     */

    const aligned256 shufvec_buf = {
        /* clang-format off */
        /* MSB */
         2,  1,  0, 0xFF,
         5,  4,  3, 0xFF,
         8,  7,  6, 0xFF,
        11, 10,  9, 0xFF,

         2,  1,  0, 0xFF,
         5,  4,  3, 0xFF,
         8,  7,  6, 0xFF,
        11, 10,  9, 0xFF
        /* LSB */
        /* clang-format on */
    };
    vec = _mm256_shuffle_epi8(vec, _mm256_load_si256((__m256i const *)&shufvec_buf));

    /*
     * Now shift and mask to split out 6-bit groups.
     * We'll also do a second byteswap to get back into big-endian
     */
    __m256i mask0 = _mm256_set1_epi32(0x3F);
    __m256i mask1 = _mm256_set1_epi32(0x3F << 6);
    __m256i mask2 = _mm256_set1_epi32(0x3F << 12);
    __m256i mask3 = _mm256_set1_epi32(0x3F << 18);

    __m256i digit0 = _mm256_and_si256(mask0, vec);
    __m256i digit1 = _mm256_and_si256(mask1, vec);
    __m256i digit2 = _mm256_and_si256(mask2, vec);
    __m256i digit3 = _mm256_and_si256(mask3, vec);

    /*
     * Because we want to byteswap, the low-order digit0 goes into the
     * high-order byte
     */
    digit0 = _mm256_slli_epi32(digit0, 24);
    digit1 = _mm256_slli_epi32(digit1, 10);
    digit2 = _mm256_srli_epi32(digit2, 4);
    digit3 = _mm256_srli_epi32(digit3, 18);

    vec = _mm256_or_si256(_mm256_or_si256(digit0, digit1), _mm256_or_si256(digit2, digit3));

    /* Finally translate to the base64 character set */
    return encode_chars(vec);
}

void aws_common_private_base64_encode_sse41(const uint8_t *input, uint8_t *output, size_t inlen) {
    __m256i instride, outstride;

    while (inlen >= 32) {
        /*
         * Where possible, we'll load a full vector at a time and ignore the over-read.
         * However, if we have < 32 bytes left, this would result in a potential read
         * of unreadable pages, so we use bounce buffers below.
         */
        instride = _mm256_loadu_si256((__m256i const *)input);
        outstride = encode_stride(instride);
        _mm256_storeu_si256((__m256i *)output, outstride);

        input += 24;
        output += 32;
        inlen -= 24;
    }

    while (inlen) {
        /*
         * We need to go through a bounce buffer for anything remaining, as we
         * don't want to over-read or over-write the ends of the buffers.
         */
        size_t stridelen = inlen > 24 ? 24 : inlen;
        size_t outlen = ((stridelen + 2) / 3) * 4;

        memset(&instride, 0, sizeof(instride));
        memcpy(&instride, input, stridelen);

        outstride = encode_stride(instride);
        memcpy(output, &outstride, outlen);

        if (inlen < 24) {
            if (inlen % 3 >= 1) {
                /* AA== or AAA= */
                output[outlen - 1] = '=';
            }
            if (inlen % 3 == 1) {
                /* AA== */
                output[outlen - 2] = '=';
            }

            return;
        }

        input += stridelen;
        output += outlen;
        inlen -= stridelen;
    }
}
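And a matching usage sketch for the encoder, again not part of the library; the output buffer needs 4 * ceil(inlen / 3) bytes, and the tail path above writes the '=' padding:

```c
#include <stdint.h>
#include <stdio.h>

extern void aws_common_private_base64_encode_sse41(const uint8_t *input, uint8_t *output, size_t inlen);

int main(void) {
    const uint8_t raw[5] = {'h', 'e', 'l', 'l', 'o'};
    uint8_t encoded[9] = {0}; /* 4 * ceil(5 / 3) = 8 characters, plus a NUL for printing */

    aws_common_private_base64_encode_sse41(raw, encoded, sizeof(raw));
    printf("%s\n", (const char *)encoded); /* expected: aGVsbG8= */
    return 0;
}
```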