author     orivej <[email protected]>  2022-02-10 16:45:01 +0300
committer  Daniil Cherednik <[email protected]>  2022-02-10 16:45:01 +0300
commit     2d37894b1b037cf24231090eda8589bbb44fb6fc (patch)
tree       be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/restricted/aws/aws-c-common/source/arch/intel
parent     718c552901d703c502ccbefdfc3c9028d608b947 (diff)

Restoring authorship annotation for <[email protected]>. Commit 2 of 2.
Diffstat (limited to 'contrib/restricted/aws/aws-c-common/source/arch/intel')
-rw-r--r--  contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c       58
-rw-r--r--  contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c          206
-rw-r--r--  contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c  768
3 files changed, 516 insertions, 516 deletions
diff --git a/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c b/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c
index 07970177799..d2ceab01060 100644
--- a/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c
+++ b/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c
@@ -1,29 +1,29 @@
-/**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
- */
-
-#include <aws/common/cpuid.h>
-
-void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd) {
- uint32_t ebx = 0;
- uint32_t edx = 0;
-
-#if defined(__i386__) && defined(__PIC__)
- /* under 32-bit PIC, EBX holds the GOT pointer and must not be clobbered */
- __asm__ __volatile__("movl %%ebx, %%edi \n\t "
- "cpuid \n\t "
- "xchgl %%ebx, %%edi"
- : "=D"(ebx),
-#else
- __asm__ __volatile__("cpuid"
- : "+b"(ebx),
-#endif
- "+a"(eax),
- "+c"(ecx),
- "=d"(edx));
- abcd[0] = eax;
- abcd[1] = ebx;
- abcd[2] = ecx;
- abcd[3] = edx;
-}
+/**
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
+ */
+
+#include <aws/common/cpuid.h>
+
+void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd) {
+ uint32_t ebx = 0;
+ uint32_t edx = 0;
+
+#if defined(__i386__) && defined(__PIC__)
+ /* under 32-bit PIC, EBX holds the GOT pointer and must not be clobbered */
+ __asm__ __volatile__("movl %%ebx, %%edi \n\t "
+ "cpuid \n\t "
+ "xchgl %%ebx, %%edi"
+ : "=D"(ebx),
+#else
+ __asm__ __volatile__("cpuid"
+ : "+b"(ebx),
+#endif
+ "+a"(eax),
+ "+c"(ecx),
+ "=d"(edx));
+ abcd[0] = eax;
+ abcd[1] = ebx;
+ abcd[2] = ecx;
+ abcd[3] = edx;
+}
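
For reference, a minimal sketch of driving aws_run_cpuid directly, assuming it is compiled and linked together with the file above (this test program is not part of the library). CPUID leaf 0 returns the maximum supported leaf in EAX and the vendor string split across EBX, EDX and ECX:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Normally declared in aws/common/cpuid.h; defined in the file above. */
extern void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd);

int main(void) {
    uint32_t abcd[4];

    aws_run_cpuid(0, 0, abcd);

    /* Vendor string order is EBX, EDX, ECX (abcd[1], abcd[3], abcd[2]). */
    char vendor[13];
    memcpy(vendor + 0, &abcd[1], 4);
    memcpy(vendor + 4, &abcd[3], 4);
    memcpy(vendor + 8, &abcd[2], 4);
    vendor[12] = '\0';

    printf("max leaf: %u, vendor: %s\n", abcd[0], vendor);
    return 0;
}

On Intel hardware this should print "GenuineIntel"; on AMD, "AuthenticAMD".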
diff --git a/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c b/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c
index ddd9640c620..6385c146fb0 100644
--- a/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c
+++ b/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c
@@ -1,103 +1,103 @@
-/**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
- */
-
-/*
- * MSVC wants us to use the non-portable _dupenv_s instead; since we need
- * to remain portable, tell MSVC to suppress this warning.
- */
-#define _CRT_SECURE_NO_WARNINGS
-
-#include <aws/common/cpuid.h>
-#include <stdlib.h>
-
-extern void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd);
-
-typedef bool(has_feature_fn)(void);
-
-static bool s_has_clmul(void) {
- uint32_t abcd[4];
- uint32_t clmul_mask = 0x00000002;
- aws_run_cpuid(1, 0, abcd);
-
- if ((abcd[2] & clmul_mask) != clmul_mask)
- return false;
-
- return true;
-}
-
-static bool s_has_sse41(void) {
- uint32_t abcd[4];
- uint32_t sse41_mask = 0x00080000;
- aws_run_cpuid(1, 0, abcd);
-
- if ((abcd[2] & sse41_mask) != sse41_mask)
- return false;
-
- return true;
-}
-
-static bool s_has_sse42(void) {
- uint32_t abcd[4];
- uint32_t sse42_mask = 0x00100000;
- aws_run_cpuid(1, 0, abcd);
-
- if ((abcd[2] & sse42_mask) != sse42_mask)
- return false;
-
- return true;
-}
-
-static bool s_has_avx2(void) {
- uint32_t abcd[4];
- uint32_t avx2_bmi12_mask = (1 << 5) | (1 << 3) | (1 << 8);
- /* CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1 &&
- CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 &&
- CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]==1 */
- aws_run_cpuid(7, 0, abcd);
-
- if ((abcd[1] & avx2_bmi12_mask) != avx2_bmi12_mask)
- return false;
-
- return true;
-}
-
-has_feature_fn *s_check_cpu_feature[AWS_CPU_FEATURE_COUNT] = {
- [AWS_CPU_FEATURE_CLMUL] = s_has_clmul,
- [AWS_CPU_FEATURE_SSE_4_1] = s_has_sse41,
- [AWS_CPU_FEATURE_SSE_4_2] = s_has_sse42,
- [AWS_CPU_FEATURE_AVX2] = s_has_avx2,
-};
-
-bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {
- if (s_check_cpu_feature[feature_name])
- return s_check_cpu_feature[feature_name]();
- return false;
-}
-
-#define CPUID_AVAILABLE 0
-#define CPUID_UNAVAILABLE 1
-static int cpuid_state = 2; /* 2: not yet probed */
-
-bool aws_common_private_has_avx2(void) {
- if (AWS_LIKELY(cpuid_state == CPUID_AVAILABLE)) {
- return true;
- }
- if (AWS_LIKELY(cpuid_state == CPUID_UNAVAILABLE)) {
- return false;
- }
-
- /* Provide a hook for testing fallbacks and benchmarking */
- const char *env_avx2_enabled = getenv("AWS_COMMON_AVX2");
- if (env_avx2_enabled) {
- int is_enabled = atoi(env_avx2_enabled);
- cpuid_state = !is_enabled;
- return is_enabled;
- }
-
- bool available = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2);
- cpuid_state = available ? CPUID_AVAILABLE : CPUID_UNAVAILABLE;
-
- return available;
-}
+/**
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
+ */
+
+/*
+ * MSVC wants us to use the non-portable _dupenv_s instead; since we need
+ * to remain portable, tell MSVC to suppress this warning.
+ */
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <aws/common/cpuid.h>
+#include <stdlib.h>
+
+extern void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd);
+
+typedef bool(has_feature_fn)(void);
+
+static bool s_has_clmul(void) {
+ uint32_t abcd[4];
+ uint32_t clmul_mask = 0x00000002;
+ aws_run_cpuid(1, 0, abcd);
+
+ if ((abcd[2] & clmul_mask) != clmul_mask)
+ return false;
+
+ return true;
+}
+
+static bool s_has_sse41(void) {
+ uint32_t abcd[4];
+ uint32_t sse41_mask = 0x00080000;
+ aws_run_cpuid(1, 0, abcd);
+
+ if ((abcd[2] & sse41_mask) != sse41_mask)
+ return false;
+
+ return true;
+}
+
+static bool s_has_sse42(void) {
+ uint32_t abcd[4];
+ uint32_t sse42_mask = 0x00100000;
+ aws_run_cpuid(1, 0, abcd);
+
+ if ((abcd[2] & sse42_mask) != sse42_mask)
+ return false;
+
+ return true;
+}
+
+static bool s_has_avx2(void) {
+ uint32_t abcd[4];
+ uint32_t avx2_bmi12_mask = (1 << 5) | (1 << 3) | (1 << 8);
+ /* CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1 &&
+ CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 &&
+ CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]==1 */
+ aws_run_cpuid(7, 0, abcd);
+
+ if ((abcd[1] & avx2_bmi12_mask) != avx2_bmi12_mask)
+ return false;
+
+ return true;
+}
+
+has_feature_fn *s_check_cpu_feature[AWS_CPU_FEATURE_COUNT] = {
+ [AWS_CPU_FEATURE_CLMUL] = s_has_clmul,
+ [AWS_CPU_FEATURE_SSE_4_1] = s_has_sse41,
+ [AWS_CPU_FEATURE_SSE_4_2] = s_has_sse42,
+ [AWS_CPU_FEATURE_AVX2] = s_has_avx2,
+};
+
+bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {
+ if (s_check_cpu_feature[feature_name])
+ return s_check_cpu_feature[feature_name]();
+ return false;
+}
+
+#define CPUID_AVAILABLE 0
+#define CPUID_UNAVAILABLE 1
+static int cpuid_state = 2; /* 2: not yet probed */
+
+bool aws_common_private_has_avx2(void) {
+ if (AWS_LIKELY(cpuid_state == CPUID_AVAILABLE)) {
+ return true;
+ }
+ if (AWS_LIKELY(cpuid_state == CPUID_UNAVAILABLE)) {
+ return false;
+ }
+
+ /* Provide a hook for testing fallbacks and benchmarking */
+ const char *env_avx2_enabled = getenv("AWS_COMMON_AVX2");
+ if (env_avx2_enabled) {
+ int is_enabled = atoi(env_avx2_enabled);
+ cpuid_state = !is_enabled;
+ return is_enabled;
+ }
+
+ bool available = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2);
+ cpuid_state = available ? CPUID_AVAILABLE : CPUID_UNAVAILABLE;
+
+ return available;
+}
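
A sketch of exercising the detection path above, assuming the program links against aws-c-common; aws_cpu_has_feature and the AWS_CPU_FEATURE_* values come from aws/common/cpuid.h, while the extern declaration of the private helper is ours:

#include <stdbool.h>
#include <stdio.h>

#include <aws/common/cpuid.h>

extern bool aws_common_private_has_avx2(void);

int main(void) {
    printf("CLMUL:  %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL));
    printf("SSE4.1: %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_1));
    printf("SSE4.2: %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2));
    printf("AVX2:   %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2));

    /* Cached check; honors the AWS_COMMON_AVX2 hook on first call. */
    printf("has_avx2: %d\n", aws_common_private_has_avx2());
    return 0;
}

Because cpuid_state caches the first answer, AWS_COMMON_AVX2 must be set before the first call; any value that atoi parses as nonzero forces the AVX2 path, and "0" forces the fallback, e.g. AWS_COMMON_AVX2=0 ./a.out.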
diff --git a/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c b/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c
index d4b580f24af..ebae8613810 100644
--- a/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c
+++ b/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c
@@ -1,384 +1,384 @@
-/**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
- */
-
-#include <emmintrin.h>
-#include <immintrin.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <aws/common/common.h>
-
-/***** Decode logic *****/
-
-/*
- * Translates ranges of byte values.
- * For each byte of 'in' that is between lo and hi (inclusive), the corresponding result byte is
- * (byte - lo + offset); all other bytes become zero.
- */
-static inline __m256i translate_range(__m256i in, uint8_t lo, uint8_t hi, uint8_t offset) {
- __m256i lovec = _mm256_set1_epi8(lo);
- __m256i hivec = _mm256_set1_epi8((char)(hi - lo));
- __m256i offsetvec = _mm256_set1_epi8(offset);
-
- __m256i tmp = _mm256_sub_epi8(in, lovec);
- /*
- * We use the unsigned min operation for the comparison, since there is
- * no unsigned byte-compare intrinsic.
- */
- __m256i mask = _mm256_min_epu8(tmp, hivec);
- /* if mask = tmp, then keep that byte */
- mask = _mm256_cmpeq_epi8(mask, tmp);
-
- tmp = _mm256_add_epi8(tmp, offsetvec);
- tmp = _mm256_and_si256(tmp, mask);
- return tmp;
-}
-
-/*
- * For each 8-bit element of 'in', the result element is 'decode' if the element equals 'match', else zero.
- */
-static inline __m256i translate_exact(__m256i in, uint8_t match, uint8_t decode) {
- __m256i mask = _mm256_cmpeq_epi8(in, _mm256_set1_epi8(match));
- return _mm256_and_si256(mask, _mm256_set1_epi8(decode));
-}
-
-/*
- * Input: a pointer to a 256-bit vector of base64 characters.
- * The pointed-to vector is replaced by a 256-bit vector of 6-bit decoded values;
- * returns false on decode failure, true on success.
- */
-static inline bool decode_vec(__m256i *in) {
- __m256i tmp1, tmp2, tmp3;
-
- /*
- * Base64 decoding table, see RFC4648
- *
- * Note that we use multiple vector registers to try to allow the CPU to
- * parallelize the merging ORs
- */
- tmp1 = translate_range(*in, 'A', 'Z', 0 + 1);
- tmp2 = translate_range(*in, 'a', 'z', 26 + 1);
- tmp3 = translate_range(*in, '0', '9', 52 + 1);
- tmp1 = _mm256_or_si256(tmp1, translate_exact(*in, '+', 62 + 1));
- tmp2 = _mm256_or_si256(tmp2, translate_exact(*in, '/', 63 + 1));
- tmp3 = _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
-
- /*
- * We use 0 to mark decode failures, so everything is decoded to one higher
- * than normal. We'll shift this down now.
- */
- *in = _mm256_sub_epi8(tmp3, _mm256_set1_epi8(1));
-
- /* If any byte is now zero, we had a decode failure */
- __m256i mask = _mm256_cmpeq_epi8(tmp3, _mm256_set1_epi8(0));
- return _mm256_testz_si256(mask, mask);
-}
-
-AWS_ALIGNED_TYPEDEF(uint8_t, aligned256[32], 32);
-
-/*
- * Input: a 256-bit vector, interpreted as 32 * 6-bit values
- * Output: a 256-bit vector, the lower 24 bytes of which contain the packed version of the input
- */
-static inline __m256i pack_vec(__m256i in) {
- /*
- * Our basic strategy is to split the input vector into three vectors, for each 6-bit component
- * of each 24-bit group, shift the groups into place, then OR the vectors together. Conveniently,
- * we can do this on a (32 bit) dword-by-dword basis.
- *
- * It's important to note that we're interpreting the vector as being little-endian. That is,
- * on entry, we have dwords that look like this:
- *
- * MSB LSB
- * 00DD DDDD 00CC CCCC 00BB BBBB 00AA AAAA
- *
- * And we want to translate to:
- *
- * MSB LSB
- * 0000 0000 AAAA AABB BBBB CCCC CCDD DDDD
- *
- * After which point we can pack these dwords together to produce our final output.
- */
- __m256i maskA = _mm256_set1_epi32(0xFF); // low bits
- __m256i maskB = _mm256_set1_epi32(0xFF00);
- __m256i maskC = _mm256_set1_epi32(0xFF0000);
- __m256i maskD = _mm256_set1_epi32((int)0xFF000000);
-
- __m256i bitsA = _mm256_slli_epi32(_mm256_and_si256(in, maskA), 18);
- __m256i bitsB = _mm256_slli_epi32(_mm256_and_si256(in, maskB), 4);
- __m256i bitsC = _mm256_srli_epi32(_mm256_and_si256(in, maskC), 10);
- __m256i bitsD = _mm256_srli_epi32(_mm256_and_si256(in, maskD), 24);
-
- __m256i dwords = _mm256_or_si256(_mm256_or_si256(bitsA, bitsB), _mm256_or_si256(bitsC, bitsD));
- /*
- * Now we have a series of dwords with empty MSBs.
- * We need to pack them together (and shift down) with a shuffle operation.
- * Unfortunately the shuffle operation operates independently within each 128-bit lane,
- * so we'll need to do this in two steps: First we compact dwords within each lane, then
- * we do a dword shuffle to compact the two lanes together.
-
- * 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 <- byte index (little endian)
- * -- 09 0a 0b -- 06 07 08 -- 03 04 05 -- 00 01 02 <- data index
- *
- * We also reverse the order of 3-byte fragments within each lane; we've constructed
- * those fragments in little endian but the order of fragments within the overall
- * vector is in memory order (big endian)
- */
- const aligned256 shufvec_buf = {
- /* clang-format off */
- /* MSB */
- 0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
- 2, 1, 0,
- 6, 5, 4,
- 10, 9, 8,
- 14, 13, 12,
-
- 0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
- 2, 1, 0,
- 6, 5, 4,
- 10, 9, 8,
- 14, 13, 12
- /* LSB */
- /* clang-format on */
- };
- __m256i shufvec = _mm256_load_si256((__m256i const *)&shufvec_buf);
-
- dwords = _mm256_shuffle_epi8(dwords, shufvec);
- /*
- * Now shuffle the 32-bit words:
- * A B C 0 D E F 0 -> 0 0 A B C D E F
- */
- __m256i shuf32 = _mm256_set_epi32(0, 0, 7, 6, 5, 3, 2, 1);
-
- dwords = _mm256_permutevar8x32_epi32(dwords, shuf32);
-
- return dwords;
-}
-
-static inline bool decode(const unsigned char *in, unsigned char *out) {
- __m256i vec = _mm256_loadu_si256((__m256i const *)in);
- if (!decode_vec(&vec)) {
- return false;
- }
- vec = pack_vec(vec);
-
- /*
- * We'll do two stores to get both the low 128 bits and the following 64 bits written.
- * Input (memory order): 0 1 2 3 4 5 - - (dwords)
- * Input (little endian) - - 5 4 3 2 1 0
- * Output in memory:
- * [0 1 2 3] [4 5]
- */
- __m128i lo = _mm256_extracti128_si256(vec, 0);
- /*
- * Unfortunately some compilers don't support _mm256_extract_epi64,
- * so we'll just copy right out of the vector as a fallback
- */
-
-#ifdef HAVE_MM256_EXTRACT_EPI64
- uint64_t hi = _mm256_extract_epi64(vec, 2);
- const uint64_t *p_hi = &hi;
-#else
- const uint64_t *p_hi = (uint64_t *)&vec + 2;
-#endif
-
- _mm_storeu_si128((__m128i *)out, lo);
- memcpy(out + 16, p_hi, sizeof(*p_hi));
-
- return true;
-}
-
-size_t aws_common_private_base64_decode_sse41(const unsigned char *in, unsigned char *out, size_t len) {
- if (len % 4) {
- return (size_t)-1;
- }
-
- size_t outlen = 0;
- while (len > 32) {
- if (!decode(in, out)) {
- return (size_t)-1;
- }
- len -= 32;
- in += 32;
- out += 24;
- outlen += 24;
- }
-
- if (len > 0) {
- unsigned char tmp_in[32];
- unsigned char tmp_out[24];
-
- memset(tmp_out, 0xEE, sizeof(tmp_out));
-
- /* We need to ensure the vector contains valid b64 characters */
- memset(tmp_in, 'A', sizeof(tmp_in));
- memcpy(tmp_in, in, len);
-
- size_t final_out = (3 * len) / 4;
-
- /* Check for end-of-string padding (up to 2 characters) */
- for (int i = 0; i < 2; i++) {
- if (tmp_in[len - 1] == '=') {
- tmp_in[len - 1] = 'A'; /* replace '=' so decode() doesn't reject it */
- len--;
- final_out--;
- }
- }
-
- if (!decode(tmp_in, tmp_out)) {
- return (size_t)-1;
- }
-
- /* Check that the bytes beyond the output length decoded to zero */
- for (size_t i = final_out; i < sizeof(tmp_out); i++) {
- if (tmp_out[i]) {
- return (size_t)-1;
- }
- }
-
- memcpy(out, tmp_out, final_out);
- outlen += final_out;
- }
- return outlen;
-}
-
-/***** Encode logic *****/
-static inline __m256i encode_chars(__m256i in) {
- __m256i tmp1, tmp2, tmp3;
-
- /*
- * Base64 encoding table, see RFC4648
- *
- * We again use fan-in for the ORs here.
- */
- tmp1 = translate_range(in, 0, 25, 'A');
- tmp2 = translate_range(in, 26, 26 + 25, 'a');
- tmp3 = translate_range(in, 52, 61, '0');
- tmp1 = _mm256_or_si256(tmp1, translate_exact(in, 62, '+'));
- tmp2 = _mm256_or_si256(tmp2, translate_exact(in, 63, '/'));
-
- return _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
-}
-
-/*
- * Input: A 256-bit vector, interpreted as 24 bytes (LSB) plus 8 bytes of high-byte padding
- * Output: A 256-bit vector of base64 characters
- */
-static inline __m256i encode_stride(__m256i vec) {
- /*
- * First, since byte-shuffle operations operate within 128-bit subvectors, swap around the dwords
- * to balance the amount of actual data between 128-bit subvectors.
- * After this we want the LE representation to look like: -- XX XX XX -- XX XX XX
- */
- __m256i shuf32 = _mm256_set_epi32(7, 5, 4, 3, 6, 2, 1, 0);
- vec = _mm256_permutevar8x32_epi32(vec, shuf32);
-
- /*
- * Next, within each group of 3 bytes, we need to byteswap into little endian form so our bitshifts
- * will work properly. We also shuffle around so that each dword has one 3-byte group, plus one byte
- * (MSB) of zero-padding.
- * Because this is a byte-shuffle, indexes are within each 128-bit subvector.
- *
- * -- -- -- -- 11 10 09 08 07 06 05 04 03 02 01 00
- */
-
- const aligned256 shufvec_buf = {
- /* clang-format off */
- /* MSB */
- 2, 1, 0, 0xFF,
- 5, 4, 3, 0xFF,
- 8, 7, 6, 0xFF,
- 11, 10, 9, 0xFF,
-
- 2, 1, 0, 0xFF,
- 5, 4, 3, 0xFF,
- 8, 7, 6, 0xFF,
- 11, 10, 9, 0xFF
- /* LSB */
- /* clang-format on */
- };
- vec = _mm256_shuffle_epi8(vec, _mm256_load_si256((__m256i const *)&shufvec_buf));
-
- /*
- * Now shift and mask to split out 6-bit groups.
- * We'll also do a second byteswap to get back into big-endian
- */
- __m256i mask0 = _mm256_set1_epi32(0x3F);
- __m256i mask1 = _mm256_set1_epi32(0x3F << 6);
- __m256i mask2 = _mm256_set1_epi32(0x3F << 12);
- __m256i mask3 = _mm256_set1_epi32(0x3F << 18);
-
- __m256i digit0 = _mm256_and_si256(mask0, vec);
- __m256i digit1 = _mm256_and_si256(mask1, vec);
- __m256i digit2 = _mm256_and_si256(mask2, vec);
- __m256i digit3 = _mm256_and_si256(mask3, vec);
-
- /*
- * Because we want to byteswap, the low-order digit0 goes into the
- * high-order byte
- */
- digit0 = _mm256_slli_epi32(digit0, 24);
- digit1 = _mm256_slli_epi32(digit1, 10);
- digit2 = _mm256_srli_epi32(digit2, 4);
- digit3 = _mm256_srli_epi32(digit3, 18);
-
- vec = _mm256_or_si256(_mm256_or_si256(digit0, digit1), _mm256_or_si256(digit2, digit3));
-
- /* Finally translate to the base64 character set */
- return encode_chars(vec);
-}
-
-void aws_common_private_base64_encode_sse41(const uint8_t *input, uint8_t *output, size_t inlen) {
- __m256i instride, outstride;
-
- while (inlen >= 32) {
- /*
- * Where possible, we'll load a full vector at a time and ignore the over-read.
- * However, if we have < 32 bytes left, this would result in a potential read
- * of unreadable pages, so we use bounce buffers below.
- */
- instride = _mm256_loadu_si256((__m256i const *)input);
- outstride = encode_stride(instride);
- _mm256_storeu_si256((__m256i *)output, outstride);
-
- input += 24;
- output += 32;
- inlen -= 24;
- }
-
- while (inlen) {
- /*
- * We need to go through a bounce buffer for anything remaining, as we
- * don't want to over-read or over-write the ends of the buffers.
- */
- size_t stridelen = inlen > 24 ? 24 : inlen;
- size_t outlen = ((stridelen + 2) / 3) * 4;
-
- memset(&instride, 0, sizeof(instride));
- memcpy(&instride, input, stridelen);
-
- outstride = encode_stride(instride);
- memcpy(output, &outstride, outlen);
-
- if (inlen < 24) {
- if (inlen % 3 >= 1) {
- /* AA== or AAA= */
- output[outlen - 1] = '=';
- }
- if (inlen % 3 == 1) {
- /* AA== */
- output[outlen - 2] = '=';
- }
-
- return;
- }
-
- input += stridelen;
- output += outlen;
- inlen -= stridelen;
- }
-}
+/**
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
+ */
+
+#include <emmintrin.h>
+#include <immintrin.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <aws/common/common.h>
+
+/***** Decode logic *****/
+
+/*
+ * Translates ranges of byte values.
+ * For each byte of 'in' that is between lo and hi (inclusive), the corresponding result byte is
+ * (byte - lo + offset); all other bytes become zero.
+ */
+static inline __m256i translate_range(__m256i in, uint8_t lo, uint8_t hi, uint8_t offset) {
+ __m256i lovec = _mm256_set1_epi8(lo);
+ __m256i hivec = _mm256_set1_epi8((char)(hi - lo));
+ __m256i offsetvec = _mm256_set1_epi8(offset);
+
+ __m256i tmp = _mm256_sub_epi8(in, lovec);
+ /*
+ * We use the unsigned min operation for the comparison, since there is
+ * no unsigned byte-compare intrinsic.
+ */
+ __m256i mask = _mm256_min_epu8(tmp, hivec);
+ /* if mask = tmp, then keep that byte */
+ mask = _mm256_cmpeq_epi8(mask, tmp);
+
+ tmp = _mm256_add_epi8(tmp, offsetvec);
+ tmp = _mm256_and_si256(tmp, mask);
+ return tmp;
+}
+
+/*
+ * For each 8-bit element of 'in', the result element is 'decode' if the element equals 'match', else zero.
+ */
+static inline __m256i translate_exact(__m256i in, uint8_t match, uint8_t decode) {
+ __m256i mask = _mm256_cmpeq_epi8(in, _mm256_set1_epi8(match));
+ return _mm256_and_si256(mask, _mm256_set1_epi8(decode));
+}
+
+/*
+ * Input: a pointer to a 256-bit vector of base64 characters.
+ * The pointed-to vector is replaced by a 256-bit vector of 6-bit decoded values;
+ * returns false on decode failure, true on success.
+ */
+static inline bool decode_vec(__m256i *in) {
+ __m256i tmp1, tmp2, tmp3;
+
+ /*
+ * Base64 decoding table, see RFC4648
+ *
+ * Note that we use multiple vector registers to try to allow the CPU to
+ * parallelize the merging ORs
+ */
+ tmp1 = translate_range(*in, 'A', 'Z', 0 + 1);
+ tmp2 = translate_range(*in, 'a', 'z', 26 + 1);
+ tmp3 = translate_range(*in, '0', '9', 52 + 1);
+ tmp1 = _mm256_or_si256(tmp1, translate_exact(*in, '+', 62 + 1));
+ tmp2 = _mm256_or_si256(tmp2, translate_exact(*in, '/', 63 + 1));
+ tmp3 = _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
+
+ /*
+ * We use 0 to mark decode failures, so everything is decoded to one higher
+ * than normal. We'll shift this down now.
+ */
+ *in = _mm256_sub_epi8(tmp3, _mm256_set1_epi8(1));
+
+ /* If any byte is now zero, we had a decode failure */
+ __m256i mask = _mm256_cmpeq_epi8(tmp3, _mm256_set1_epi8(0));
+ return _mm256_testz_si256(mask, mask);
+}
+
+AWS_ALIGNED_TYPEDEF(uint8_t, aligned256[32], 32);
+
+/*
+ * Input: a 256-bit vector, interpreted as 32 * 6-bit values
+ * Output: a 256-bit vector, the lower 24 bytes of which contain the packed version of the input
+ */
+static inline __m256i pack_vec(__m256i in) {
+ /*
+ * Our basic strategy is to split the input vector into three vectors, for each 6-bit component
+ * of each 24-bit group, shift the groups into place, then OR the vectors together. Conveniently,
+ * we can do this on a (32 bit) dword-by-dword basis.
+ *
+ * It's important to note that we're interpreting the vector as being little-endian. That is,
+ * on entry, we have dwords that look like this:
+ *
+ * MSB LSB
+ * 00DD DDDD 00CC CCCC 00BB BBBB 00AA AAAA
+ *
+ * And we want to translate to:
+ *
+ * MSB LSB
+ * 0000 0000 AAAA AABB BBBB CCCC CCDD DDDD
+ *
+ * After which point we can pack these dwords together to produce our final output.
+ */
+ __m256i maskA = _mm256_set1_epi32(0xFF); // low bits
+ __m256i maskB = _mm256_set1_epi32(0xFF00);
+ __m256i maskC = _mm256_set1_epi32(0xFF0000);
+ __m256i maskD = _mm256_set1_epi32((int)0xFF000000);
+
+ __m256i bitsA = _mm256_slli_epi32(_mm256_and_si256(in, maskA), 18);
+ __m256i bitsB = _mm256_slli_epi32(_mm256_and_si256(in, maskB), 4);
+ __m256i bitsC = _mm256_srli_epi32(_mm256_and_si256(in, maskC), 10);
+ __m256i bitsD = _mm256_srli_epi32(_mm256_and_si256(in, maskD), 24);
+
+ __m256i dwords = _mm256_or_si256(_mm256_or_si256(bitsA, bitsB), _mm256_or_si256(bitsC, bitsD));
+ /*
+ * Now we have a series of dwords with empty MSBs.
+ * We need to pack them together (and shift down) with a shuffle operation.
+ * Unfortunately the shuffle operation operates independently within each 128-bit lane,
+ * so we'll need to do this in two steps: First we compact dwords within each lane, then
+ * we do a dword shuffle to compact the two lanes together.
+
+ * 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 <- byte index (little endian)
+ * -- 09 0a 0b -- 06 07 08 -- 03 04 05 -- 00 01 02 <- data index
+ *
+ * We also reverse the order of 3-byte fragments within each lane; we've constructed
+ * those fragments in little endian but the order of fragments within the overall
+ * vector is in memory order (big endian)
+ */
+ const aligned256 shufvec_buf = {
+ /* clang-format off */
+ /* MSB */
+ 0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
+ 2, 1, 0,
+ 6, 5, 4,
+ 10, 9, 8,
+ 14, 13, 12,
+
+ 0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
+ 2, 1, 0,
+ 6, 5, 4,
+ 10, 9, 8,
+ 14, 13, 12
+ /* LSB */
+ /* clang-format on */
+ };
+ __m256i shufvec = _mm256_load_si256((__m256i const *)&shufvec_buf);
+
+ dwords = _mm256_shuffle_epi8(dwords, shufvec);
+ /*
+ * Now shuffle the 32-bit words:
+ * A B C 0 D E F 0 -> 0 0 A B C D E F
+ */
+ __m256i shuf32 = _mm256_set_epi32(0, 0, 7, 6, 5, 3, 2, 1);
+
+ dwords = _mm256_permutevar8x32_epi32(dwords, shuf32);
+
+ return dwords;
+}
+
+static inline bool decode(const unsigned char *in, unsigned char *out) {
+ __m256i vec = _mm256_loadu_si256((__m256i const *)in);
+ if (!decode_vec(&vec)) {
+ return false;
+ }
+ vec = pack_vec(vec);
+
+ /*
+ * We'll do two stores to get both the low 128 bits and the following 64 bits written.
+ * Input (memory order): 0 1 2 3 4 5 - - (dwords)
+ * Input (little endian) - - 5 4 3 2 1 0
+ * Output in memory:
+ * [0 1 2 3] [4 5]
+ */
+ __m128i lo = _mm256_extracti128_si256(vec, 0);
+ /*
+ * Unfortunately some compilers don't support _mm256_extract_epi64,
+ * so we'll just copy right out of the vector as a fallback
+ */
+
+#ifdef HAVE_MM256_EXTRACT_EPI64
+ uint64_t hi = _mm256_extract_epi64(vec, 2);
+ const uint64_t *p_hi = &hi;
+#else
+ const uint64_t *p_hi = (uint64_t *)&vec + 2;
+#endif
+
+ _mm_storeu_si128((__m128i *)out, lo);
+ memcpy(out + 16, p_hi, sizeof(*p_hi));
+
+ return true;
+}
+
+size_t aws_common_private_base64_decode_sse41(const unsigned char *in, unsigned char *out, size_t len) {
+ if (len % 4) {
+ return (size_t)-1;
+ }
+
+ size_t outlen = 0;
+ while (len > 32) {
+ if (!decode(in, out)) {
+ return (size_t)-1;
+ }
+ len -= 32;
+ in += 32;
+ out += 24;
+ outlen += 24;
+ }
+
+ if (len > 0) {
+ unsigned char tmp_in[32];
+ unsigned char tmp_out[24];
+
+ memset(tmp_out, 0xEE, sizeof(tmp_out));
+
+ /* We need to ensure the vector contains valid b64 characters */
+ memset(tmp_in, 'A', sizeof(tmp_in));
+ memcpy(tmp_in, in, len);
+
+ size_t final_out = (3 * len) / 4;
+
+ /* Check for end-of-string padding (up to 2 characters) */
+ for (int i = 0; i < 2; i++) {
+ if (tmp_in[len - 1] == '=') {
+ tmp_in[len - 1] = 'A'; /* replace '=' so decode() doesn't reject it */
+ len--;
+ final_out--;
+ }
+ }
+
+ if (!decode(tmp_in, tmp_out)) {
+ return (size_t)-1;
+ }
+
+ /* Check that the bytes beyond the output length decoded to zero */
+ for (size_t i = final_out; i < sizeof(tmp_out); i++) {
+ if (tmp_out[i]) {
+ return (size_t)-1;
+ }
+ }
+
+ memcpy(out, tmp_out, final_out);
+ outlen += final_out;
+ }
+ return outlen;
+}
+
+/***** Encode logic *****/
+static inline __m256i encode_chars(__m256i in) {
+ __m256i tmp1, tmp2, tmp3;
+
+ /*
+ * Base64 encoding table, see RFC4648
+ *
+ * We again use fan-in for the ORs here.
+ */
+ tmp1 = translate_range(in, 0, 25, 'A');
+ tmp2 = translate_range(in, 26, 26 + 25, 'a');
+ tmp3 = translate_range(in, 52, 61, '0');
+ tmp1 = _mm256_or_si256(tmp1, translate_exact(in, 62, '+'));
+ tmp2 = _mm256_or_si256(tmp2, translate_exact(in, 63, '/'));
+
+ return _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
+}
+
+/*
+ * Input: A 256-bit vector, interpreted as 24 bytes (LSB) plus 8 bytes of high-byte padding
+ * Output: A 256-bit vector of base64 characters
+ */
+static inline __m256i encode_stride(__m256i vec) {
+ /*
+ * First, since byte-shuffle operations operate within 128-bit subvectors, swap around the dwords
+ * to balance the amount of actual data between 128-bit subvectors.
+ * After this we want the LE representation to look like: -- XX XX XX -- XX XX XX
+ */
+ __m256i shuf32 = _mm256_set_epi32(7, 5, 4, 3, 6, 2, 1, 0);
+ vec = _mm256_permutevar8x32_epi32(vec, shuf32);
+
+ /*
+ * Next, within each group of 3 bytes, we need to byteswap into little endian form so our bitshifts
+ * will work properly. We also shuffle around so that each dword has one 3-byte group, plus one byte
+ * (MSB) of zero-padding.
+ * Because this is a byte-shuffle, indexes are within each 128-bit subvector.
+ *
+ * -- -- -- -- 11 10 09 08 07 06 05 04 03 02 01 00
+ */
+
+ const aligned256 shufvec_buf = {
+ /* clang-format off */
+ /* MSB */
+ 2, 1, 0, 0xFF,
+ 5, 4, 3, 0xFF,
+ 8, 7, 6, 0xFF,
+ 11, 10, 9, 0xFF,
+
+ 2, 1, 0, 0xFF,
+ 5, 4, 3, 0xFF,
+ 8, 7, 6, 0xFF,
+ 11, 10, 9, 0xFF
+ /* LSB */
+ /* clang-format on */
+ };
+ vec = _mm256_shuffle_epi8(vec, _mm256_load_si256((__m256i const *)&shufvec_buf));
+
+ /*
+ * Now shift and mask to split out 6-bit groups.
+ * We'll also do a second byteswap to get back into big-endian
+ */
+ __m256i mask0 = _mm256_set1_epi32(0x3F);
+ __m256i mask1 = _mm256_set1_epi32(0x3F << 6);
+ __m256i mask2 = _mm256_set1_epi32(0x3F << 12);
+ __m256i mask3 = _mm256_set1_epi32(0x3F << 18);
+
+ __m256i digit0 = _mm256_and_si256(mask0, vec);
+ __m256i digit1 = _mm256_and_si256(mask1, vec);
+ __m256i digit2 = _mm256_and_si256(mask2, vec);
+ __m256i digit3 = _mm256_and_si256(mask3, vec);
+
+ /*
+ * Because we want to byteswap, the low-order digit0 goes into the
+ * high-order byte
+ */
+ digit0 = _mm256_slli_epi32(digit0, 24);
+ digit1 = _mm256_slli_epi32(digit1, 10);
+ digit2 = _mm256_srli_epi32(digit2, 4);
+ digit3 = _mm256_srli_epi32(digit3, 18);
+
+ vec = _mm256_or_si256(_mm256_or_si256(digit0, digit1), _mm256_or_si256(digit2, digit3));
+
+ /* Finally translate to the base64 character set */
+ return encode_chars(vec);
+}
+
+void aws_common_private_base64_encode_sse41(const uint8_t *input, uint8_t *output, size_t inlen) {
+ __m256i instride, outstride;
+
+ while (inlen >= 32) {
+ /*
+ * Where possible, we'll load a full vector at a time and ignore the over-read.
+ * However, if we have < 32 bytes left, this would result in a potential read
+ * of unreadable pages, so we use bounce buffers below.
+ */
+ instride = _mm256_loadu_si256((__m256i const *)input);
+ outstride = encode_stride(instride);
+ _mm256_storeu_si256((__m256i *)output, outstride);
+
+ input += 24;
+ output += 32;
+ inlen -= 24;
+ }
+
+ while (inlen) {
+ /*
+ * We need to go through a bounce buffer for anything remaining, as we
+ * don't want to over-read or over-write the ends of the buffers.
+ */
+ size_t stridelen = inlen > 24 ? 24 : inlen;
+ size_t outlen = ((stridelen + 2) / 3) * 4;
+
+ memset(&instride, 0, sizeof(instride));
+ memcpy(&instride, input, stridelen);
+
+ outstride = encode_stride(instride);
+ memcpy(output, &outstride, outlen);
+
+ if (inlen < 24) {
+ if (inlen % 3 >= 1) {
+ /* AA== or AAA= */
+ output[outlen - 1] = '=';
+ }
+ if (inlen % 3 == 1) {
+ /* AA== */
+ output[outlen - 2] = '=';
+ }
+
+ return;
+ }
+
+ input += stridelen;
+ output += outlen;
+ inlen -= stridelen;
+ }
+}
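
As a cross-check of the shift arithmetic in pack_vec, a scalar model of its per-dword step; the helper name pack_dword and the test values are ours, purely illustrative:

#include <assert.h>
#include <stdint.h>

/* Scalar model of pack_vec's per-dword step: four 6-bit values stored
 * little-endian as 00DDDDDD 00CCCCCC 00BBBBBB 00AAAAAA become the
 * 24-bit group 00000000 AAAAAABB BBBBCCCC CCDDDDDD. */
static uint32_t pack_dword(uint32_t in) {
    uint32_t a = (in & 0x000000FFu) << 18; /* matches bitsA: slli 18 */
    uint32_t b = (in & 0x0000FF00u) << 4;  /* matches bitsB: slli 4 */
    uint32_t c = (in & 0x00FF0000u) >> 10; /* matches bitsC: srli 10 */
    uint32_t d = (in & 0xFF000000u) >> 24; /* matches bitsD: srli 24 */
    return a | b | c | d;
}

int main(void) {
    /* "TWFu" decodes to the 6-bit values 19, 22, 5, 46; packed they
     * spell "Man" (0x4D 0x61 0x6E). */
    uint32_t in = (46u << 24) | (5u << 16) | (22u << 8) | 19u;
    assert(pack_dword(in) == 0x004D616Eu);
    return 0;
}

Note that the packed group sits big-endian inside the dword ('M' in the high byte), which is why pack_vec follows these shifts with the byte shuffle that reverses each 3-byte fragment into memory order.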