| field | value | date |
|---|---|---|
| author | orivej <[email protected]> | 2022-02-10 16:45:01 +0300 |
| committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:45:01 +0300 |
| commit | 2d37894b1b037cf24231090eda8589bbb44fb6fc | |
| tree | be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/restricted/aws/aws-c-common/source/arch/intel | |
| parent | 718c552901d703c502ccbefdfc3c9028d608b947 | |
Restoring authorship annotation for <[email protected]>. Commit 2 of 2.
Diffstat (limited to 'contrib/restricted/aws/aws-c-common/source/arch/intel')

| file | insertions | deletions |
|---|---|---|
| contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c | 29 | 29 |
| contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c | 103 | 103 |
| contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c | 384 | 384 |

3 files changed, 516 insertions, 516 deletions
diff --git a/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c b/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c
index 07970177799..d2ceab01060 100644
--- a/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c
+++ b/contrib/restricted/aws/aws-c-common/source/arch/intel/asm/cpuid.c
@@ -1,29 +1,29 @@
(the commit deletes and re-adds every line unchanged, so the file content is shown once below)

/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <aws/common/cpuid.h>

void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd) {
    uint32_t ebx = 0;
    uint32_t edx = 0;

#if defined(__i386__) && defined(__PIC__)
    /* in case of PIC under 32-bit EBX cannot be clobbered */
    __asm__ __volatile__("movl %%ebx, %%edi \n\t "
                         "cpuid \n\t "
                         "xchgl %%ebx, %%edi"
                         : "=D"(ebx),
#else
    __asm__ __volatile__("cpuid"
                         : "+b"(ebx),
#endif
                           "+a"(eax),
                           "+c"(ecx),
                           "=d"(edx));
    abcd[0] = eax;
    abcd[1] = ebx;
    abcd[2] = ecx;
    abcd[3] = edx;
}
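As a usage illustration (not part of the diff), the wrapper above can be called directly. A minimal sketch, assuming the file above is linked in: leaf 0 reports the highest supported basic leaf in EAX and the vendor string in EBX, EDX, ECX, and the output array is filled in EAX, EBX, ECX, EDX order as shown above.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Declared in the file above. */
extern void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd);

int main(void) {
    uint32_t abcd[4]; /* filled as EAX, EBX, ECX, EDX */
    char vendor[13] = {0};

    aws_run_cpuid(0, 0, abcd);
    memcpy(vendor + 0, &abcd[1], 4); /* EBX */
    memcpy(vendor + 4, &abcd[3], 4); /* EDX */
    memcpy(vendor + 8, &abcd[2], 4); /* ECX */

    printf("max basic leaf: %u, vendor: %s\n", abcd[0], vendor);
    return 0;
}
```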
diff --git a/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c b/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c
index ddd9640c620..6385c146fb0 100644
--- a/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c
+++ b/contrib/restricted/aws/aws-c-common/source/arch/intel/cpuid.c
@@ -1,103 +1,103 @@
(likewise, all 103 lines are deleted and re-added unchanged; the content is shown once below)

/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

/*
 * MSVC wants us to use the non-portable _dupenv_s instead; since we need
 * to remain portable, tell MSVC to suppress this warning.
 */
#define _CRT_SECURE_NO_WARNINGS

#include <aws/common/cpuid.h>
#include <stdlib.h>

extern void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd);

typedef bool(has_feature_fn)(void);

static bool s_has_clmul(void) {
    uint32_t abcd[4];
    uint32_t clmul_mask = 0x00000002;
    aws_run_cpuid(1, 0, abcd);

    if ((abcd[2] & clmul_mask) != clmul_mask)
        return false;

    return true;
}

static bool s_has_sse41(void) {
    uint32_t abcd[4];
    uint32_t sse41_mask = 0x00080000;
    aws_run_cpuid(1, 0, abcd);

    if ((abcd[2] & sse41_mask) != sse41_mask)
        return false;

    return true;
}

static bool s_has_sse42(void) {
    uint32_t abcd[4];
    uint32_t sse42_mask = 0x00100000;
    aws_run_cpuid(1, 0, abcd);

    if ((abcd[2] & sse42_mask) != sse42_mask)
        return false;

    return true;
}

static bool s_has_avx2(void) {
    uint32_t abcd[4];
    uint32_t avx2_bmi12_mask = (1 << 5) | (1 << 3) | (1 << 8);
    /* CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1 &&
       CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 &&
       CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]==1 */
    aws_run_cpuid(7, 0, abcd);

    if ((abcd[1] & avx2_bmi12_mask) != avx2_bmi12_mask)
        return false;

    return true;
}
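The three leaf-1 checks above all follow the same pattern: run CPUID with EAX=1 and test a single bit of the returned ECX (bit 1 = PCLMULQDQ, bit 19 = SSE4.1, bit 20 = SSE4.2). A minimal sketch of that shared pattern; the helper name is hypothetical and not part of aws-c-common.

```c
#include <stdbool.h>
#include <stdint.h>

extern void aws_run_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd);

/* Hypothetical helper: test one bit of leaf 1's ECX result. */
static bool s_has_leaf1_ecx_bit(uint32_t bit) {
    uint32_t abcd[4]; /* EAX, EBX, ECX, EDX */
    aws_run_cpuid(1, 0, abcd);
    return (abcd[2] & (UINT32_C(1) << bit)) != 0;
}

/* Equivalent to the masks used above:
 *   PCLMULQDQ: bit 1  -> 0x00000002 (clmul_mask)
 *   SSE4.1:    bit 19 -> 0x00080000 (sse41_mask)
 *   SSE4.2:    bit 20 -> 0x00100000 (sse42_mask)
 */
```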
has_feature_fn *s_check_cpu_feature[AWS_CPU_FEATURE_COUNT] = {
    [AWS_CPU_FEATURE_CLMUL] = s_has_clmul,
    [AWS_CPU_FEATURE_SSE_4_1] = s_has_sse41,
    [AWS_CPU_FEATURE_SSE_4_2] = s_has_sse42,
    [AWS_CPU_FEATURE_AVX2] = s_has_avx2,
};

bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {
    if (s_check_cpu_feature[feature_name])
        return s_check_cpu_feature[feature_name]();
    return false;
}

#define CPUID_AVAILABLE 0
#define CPUID_UNAVAILABLE 1
static int cpuid_state = 2;

bool aws_common_private_has_avx2(void) {
    if (AWS_LIKELY(cpuid_state == 0)) {
        return true;
    }
    if (AWS_LIKELY(cpuid_state == 1)) {
        return false;
    }

    /* Provide a hook for testing fallbacks and benchmarking */
    const char *env_avx2_enabled = getenv("AWS_COMMON_AVX2");
    if (env_avx2_enabled) {
        int is_enabled = atoi(env_avx2_enabled);
        cpuid_state = !is_enabled;
        return is_enabled;
    }

    bool available = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2);
    cpuid_state = available ? CPUID_AVAILABLE : CPUID_UNAVAILABLE;

    return available;
}
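The remainder of the file wires those checks into a dispatch table and a cached AVX2 query; setting the AWS_COMMON_AVX2 environment variable (read via getenv above) forces the result, which is useful for exercising the scalar fallbacks. A small probe program, assuming the aws-c-common headers and library are available at build time (a sketch, not part of the diff):

```c
#include <aws/common/cpuid.h>
#include <stdio.h>

int main(void) {
    /* aws_cpu_has_feature() dispatches through s_check_cpu_feature above. */
    printf("CLMUL:   %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL));
    printf("SSE 4.1: %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_1));
    printf("SSE 4.2: %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2));
    printf("AVX2:    %d\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2));
    return 0;
}
```

Running such a program with AWS_COMMON_AVX2=0 in the environment makes aws_common_private_has_avx2() report false on the first call and caches that answer, per the getenv hook above.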
diff --git a/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c b/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c
index d4b580f24af..ebae8613810 100644
--- a/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c
+++ b/contrib/restricted/aws/aws-c-common/source/arch/intel/encoding_avx2.c
@@ -1,384 +1,384 @@
(likewise, all 384 lines are deleted and re-added unchanged; the content is shown once below)

/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <emmintrin.h>
#include <immintrin.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <aws/common/common.h>

/***** Decode logic *****/

/*
 * Translates ranges of bytes in place.
 * For each byte of 'in' that is between lo and hi (inclusive), the corresponding
 * result lane holds (byte - lo + offset); every other lane becomes zero.
 */
static inline __m256i translate_range(__m256i in, uint8_t lo, uint8_t hi, uint8_t offset) {
    __m256i lovec = _mm256_set1_epi8(lo);
    __m256i hivec = _mm256_set1_epi8((char)(hi - lo));
    __m256i offsetvec = _mm256_set1_epi8(offset);

    __m256i tmp = _mm256_sub_epi8(in, lovec);
    /*
     * we'll use the unsigned min operator to do our comparison. Note that
     * there's no unsigned compare as a comparison intrinsic.
     */
    __m256i mask = _mm256_min_epu8(tmp, hivec);
    /* if mask = tmp, then keep that byte */
    mask = _mm256_cmpeq_epi8(mask, tmp);

    tmp = _mm256_add_epi8(tmp, offsetvec);
    tmp = _mm256_and_si256(tmp, mask);
    return tmp;
}

/*
 * For each 8-bit element of 'in' that equals match, the corresponding result
 * lane holds the value decode; every other lane becomes zero.
 */
static inline __m256i translate_exact(__m256i in, uint8_t match, uint8_t decode) {
    __m256i mask = _mm256_cmpeq_epi8(in, _mm256_set1_epi8(match));
    return _mm256_and_si256(mask, _mm256_set1_epi8(decode));
}

/*
 * Input: a pointer to a 256-bit vector of base64 characters
 * The pointed-to vector is replaced by a 256-bit vector of 6-bit decoded parts;
 * returns false on decode failure, true on success.
 */
static inline bool decode_vec(__m256i *in) {
    __m256i tmp1, tmp2, tmp3;

    /*
     * Base64 decoding table, see RFC4648
     *
     * Note that we use multiple vector registers to try to allow the CPU to
     * parallelize the merging ORs
     */
    tmp1 = translate_range(*in, 'A', 'Z', 0 + 1);
    tmp2 = translate_range(*in, 'a', 'z', 26 + 1);
    tmp3 = translate_range(*in, '0', '9', 52 + 1);
    tmp1 = _mm256_or_si256(tmp1, translate_exact(*in, '+', 62 + 1));
    tmp2 = _mm256_or_si256(tmp2, translate_exact(*in, '/', 63 + 1));
    tmp3 = _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));

    /*
     * We use 0 to mark decode failures, so everything is decoded to one higher
     * than normal. We'll shift this down now.
     */
    *in = _mm256_sub_epi8(tmp3, _mm256_set1_epi8(1));

    /* If any byte is now zero, we had a decode failure */
    __m256i mask = _mm256_cmpeq_epi8(tmp3, _mm256_set1_epi8(0));
    return _mm256_testz_si256(mask, mask);
}

AWS_ALIGNED_TYPEDEF(uint8_t, aligned256[32], 32);
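The decode table above maps every character to its 6-bit value plus one, so that a zero lane can signal an invalid character; the vector is then shifted back down by one. A scalar sketch of the same trick, for illustration only and not part of aws-c-common:

```c
#include <stdbool.h>
#include <stdint.h>

/* Decode a single base64 character the way decode_vec() does: build the
 * value-plus-one, treat 0 as "invalid", then subtract one at the end. */
static bool scalar_decode_one(uint8_t c, uint8_t *out) {
    uint8_t v = 0; /* 0 = decode failure, mirrors the vector code above */
    if (c >= 'A' && c <= 'Z') v = (uint8_t)(c - 'A' + 0 + 1);
    else if (c >= 'a' && c <= 'z') v = (uint8_t)(c - 'a' + 26 + 1);
    else if (c >= '0' && c <= '9') v = (uint8_t)(c - '0' + 52 + 1);
    else if (c == '+') v = 62 + 1;
    else if (c == '/') v = 63 + 1;

    if (v == 0) {
        return false;
    }
    *out = (uint8_t)(v - 1); /* shift back down, as decode_vec() does */
    return true;
}
```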
/*
 * Input: a 256-bit vector, interpreted as 32 * 6-bit values
 * Output: a 256-bit vector, the lower 24 bytes of which contain the packed version of the input
 */
static inline __m256i pack_vec(__m256i in) {
    /*
     * Our basic strategy is to split the input vector into four vectors, one for each 6-bit
     * component of each 24-bit group, shift the groups into place, then OR the vectors together.
     * Conveniently, we can do this on a (32 bit) dword-by-dword basis.
     *
     * It's important to note that we're interpreting the vector as being little-endian. That is,
     * on entry, we have dwords that look like this:
     *
     * MSB                                 LSB
     * 00DD DDDD 00CC CCCC 00BB BBBB 00AA AAAA
     *
     * And we want to translate to:
     *
     * MSB                                 LSB
     * 0000 0000 AAAA AABB BBBB CCCC CCDD DDDD
     *
     * After which point we can pack these dwords together to produce our final output.
     */
    __m256i maskA = _mm256_set1_epi32(0xFF); // low bits
    __m256i maskB = _mm256_set1_epi32(0xFF00);
    __m256i maskC = _mm256_set1_epi32(0xFF0000);
    __m256i maskD = _mm256_set1_epi32((int)0xFF000000);

    __m256i bitsA = _mm256_slli_epi32(_mm256_and_si256(in, maskA), 18);
    __m256i bitsB = _mm256_slli_epi32(_mm256_and_si256(in, maskB), 4);
    __m256i bitsC = _mm256_srli_epi32(_mm256_and_si256(in, maskC), 10);
    __m256i bitsD = _mm256_srli_epi32(_mm256_and_si256(in, maskD), 24);

    __m256i dwords = _mm256_or_si256(_mm256_or_si256(bitsA, bitsB), _mm256_or_si256(bitsC, bitsD));
    /*
     * Now we have a series of dwords with empty MSBs.
     * We need to pack them together (and shift down) with a shuffle operation.
     * Unfortunately the shuffle operation operates independently within each 128-bit lane,
     * so we'll need to do this in two steps: First we compact dwords within each lane, then
     * we do a dword shuffle to compact the two lanes together.
     *
     * 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 <- byte index (little endian)
     * -- 09 0a 0b -- 06 07 08 -- 03 04 05 -- 00 01 02 <- data index
     *
     * We also reverse the order of 3-byte fragments within each lane; we've constructed
     * those fragments in little endian but the order of fragments within the overall
     * vector is in memory order (big endian)
     */
    const aligned256 shufvec_buf = {
        /* clang-format off */
        /* MSB */
        0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
         2,  1,  0,
         6,  5,  4,
        10,  9,  8,
        14, 13, 12,

        0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
         2,  1,  0,
         6,  5,  4,
        10,  9,  8,
        14, 13, 12
        /* LSB */
        /* clang-format on */
    };
    __m256i shufvec = _mm256_load_si256((__m256i const *)&shufvec_buf);

    dwords = _mm256_shuffle_epi8(dwords, shufvec);
    /*
     * Now shuffle the 32-bit words:
     * A B C 0 D E F 0 -> 0 0 A B C D E F
     */
    __m256i shuf32 = _mm256_set_epi32(0, 0, 7, 6, 5, 3, 2, 1);

    dwords = _mm256_permutevar8x32_epi32(dwords, shuf32);

    return dwords;
}
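Per 32-bit dword, pack_vec performs exactly the shift-and-OR arithmetic described in its comment. A scalar sketch of that single-dword step, using the same masks and shift counts, for illustration only:

```c
#include <stdint.h>

/* Four 6-bit values a, b, c, d stored one per byte (little-endian, so 'a' is
 * the lowest byte) are merged into one 24-bit group laid out as a:b:c:d. */
static uint32_t pack_dword(uint32_t in) {
    uint32_t a = (in & 0x000000FFu) << 18; /* bits  0.. 5 -> bits 18..23 */
    uint32_t b = (in & 0x0000FF00u) << 4;  /* bits  8..13 -> bits 12..17 */
    uint32_t c = (in & 0x00FF0000u) >> 10; /* bits 16..21 -> bits  6..11 */
    uint32_t d = (in & 0xFF000000u) >> 24; /* bits 24..29 -> bits  0.. 5 */
    return a | b | c | d; /* 0000 0000 AAAA AABB BBBB CCCC CCDD DDDD */
}
```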
static inline bool decode(const unsigned char *in, unsigned char *out) {
    __m256i vec = _mm256_loadu_si256((__m256i const *)in);
    if (!decode_vec(&vec)) {
        return false;
    }
    vec = pack_vec(vec);

    /*
     * We'll do overlapping writes to get both the low 128 bits and the high 64 bits written.
     * Input (memory order): 0 1 2 3 4 5 - - (dwords)
     * Input (little endian): - - 5 4 3 2 1 0
     * Output in memory:
     * [0 1 2 3] [4 5]
     */
    __m128i lo = _mm256_extracti128_si256(vec, 0);
    /*
     * Unfortunately some compilers don't support _mm256_extract_epi64,
     * so we'll just copy right out of the vector as a fallback
     */

#ifdef HAVE_MM256_EXTRACT_EPI64
    uint64_t hi = _mm256_extract_epi64(vec, 2);
    const uint64_t *p_hi = &hi;
#else
    const uint64_t *p_hi = (uint64_t *)&vec + 2;
#endif

    _mm_storeu_si128((__m128i *)out, lo);
    memcpy(out + 16, p_hi, sizeof(*p_hi));

    return true;
}

size_t aws_common_private_base64_decode_sse41(const unsigned char *in, unsigned char *out, size_t len) {
    if (len % 4) {
        return (size_t)-1;
    }

    size_t outlen = 0;
    while (len > 32) {
        if (!decode(in, out)) {
            return (size_t)-1;
        }
        len -= 32;
        in += 32;
        out += 24;
        outlen += 24;
    }

    if (len > 0) {
        unsigned char tmp_in[32];
        unsigned char tmp_out[24];

        memset(tmp_out, 0xEE, sizeof(tmp_out));

        /* We need to ensure the vector contains valid b64 characters */
        memset(tmp_in, 'A', sizeof(tmp_in));
        memcpy(tmp_in, in, len);

        size_t final_out = (3 * len) / 4;

        /* Check for end-of-string padding (up to 2 characters) */
        for (int i = 0; i < 2; i++) {
            if (tmp_in[len - 1] == '=') {
                tmp_in[len - 1] = 'A'; /* make sure the inner loop doesn't bail out */
                len--;
                final_out--;
            }
        }

        if (!decode(tmp_in, tmp_out)) {
            return (size_t)-1;
        }

        /* Check that there are no trailing one bits */
        for (size_t i = final_out; i < sizeof(tmp_out); i++) {
            if (tmp_out[i]) {
                return (size_t)-1;
            }
        }

        memcpy(out, tmp_out, final_out);
        outlen += final_out;
    }
    return outlen;
}
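A minimal usage sketch of the decoder above, not part of the library; it assumes this translation unit is linked in and the CPU supports the required instructions:

```c
#include <stdio.h>

extern size_t aws_common_private_base64_decode_sse41(const unsigned char *in, unsigned char *out, size_t len);

int main(void) {
    const unsigned char encoded[] = "aGVsbG8gd29ybGQh"; /* "hello world!" */
    unsigned char decoded[24] = {0};

    /* Input length must be a multiple of 4, per the check above. */
    size_t n = aws_common_private_base64_decode_sse41(encoded, decoded, 16);
    if (n == (size_t)-1) {
        fprintf(stderr, "invalid base64 input\n");
        return 1;
    }
    printf("%zu bytes: %.*s\n", n, (int)n, decoded);
    return 0;
}
```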
/***** Encode logic *****/
static inline __m256i encode_chars(__m256i in) {
    __m256i tmp1, tmp2, tmp3;

    /*
     * Base64 encoding table, see RFC4648
     *
     * We again use fan-in for the ORs here.
     */
    tmp1 = translate_range(in, 0, 25, 'A');
    tmp2 = translate_range(in, 26, 26 + 25, 'a');
    tmp3 = translate_range(in, 52, 61, '0');
    tmp1 = _mm256_or_si256(tmp1, translate_exact(in, 62, '+'));
    tmp2 = _mm256_or_si256(tmp2, translate_exact(in, 63, '/'));

    return _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
}

/*
 * Input: A 256-bit vector, interpreted as 24 bytes (LSB) plus 8 bytes of high-byte padding
 * Output: A 256-bit vector of base64 characters
 */
static inline __m256i encode_stride(__m256i vec) {
    /*
     * First, since byte-shuffle operations operate within 128-bit subvectors, swap around the dwords
     * to balance the amount of actual data between 128-bit subvectors.
     * After this we want the LE representation to look like: -- XX XX XX -- XX XX XX
     */
    __m256i shuf32 = _mm256_set_epi32(7, 5, 4, 3, 6, 2, 1, 0);
    vec = _mm256_permutevar8x32_epi32(vec, shuf32);

    /*
     * Next, within each group of 3 bytes, we need to byteswap into little endian form so our bitshifts
     * will work properly. We also shuffle around so that each dword has one 3-byte group, plus one byte
     * (MSB) of zero-padding.
     * Because this is a byte-shuffle, indexes are within each 128-bit subvector.
     *
     * -- -- -- -- 11 10 09 08 07 06 05 04 03 02 01 00
     */

    const aligned256 shufvec_buf = {
        /* clang-format off */
        /* MSB */
         2,  1,  0, 0xFF,
         5,  4,  3, 0xFF,
         8,  7,  6, 0xFF,
        11, 10,  9, 0xFF,

         2,  1,  0, 0xFF,
         5,  4,  3, 0xFF,
         8,  7,  6, 0xFF,
        11, 10,  9, 0xFF
        /* LSB */
        /* clang-format on */
    };
    vec = _mm256_shuffle_epi8(vec, _mm256_load_si256((__m256i const *)&shufvec_buf));

    /*
     * Now shift and mask to split out 6-bit groups.
     * We'll also do a second byteswap to get back into big-endian
     */
    __m256i mask0 = _mm256_set1_epi32(0x3F);
    __m256i mask1 = _mm256_set1_epi32(0x3F << 6);
    __m256i mask2 = _mm256_set1_epi32(0x3F << 12);
    __m256i mask3 = _mm256_set1_epi32(0x3F << 18);

    __m256i digit0 = _mm256_and_si256(mask0, vec);
    __m256i digit1 = _mm256_and_si256(mask1, vec);
    __m256i digit2 = _mm256_and_si256(mask2, vec);
    __m256i digit3 = _mm256_and_si256(mask3, vec);

    /*
     * Because we want to byteswap, the low-order digit0 goes into the
     * high-order byte
     */
    digit0 = _mm256_slli_epi32(digit0, 24);
    digit1 = _mm256_slli_epi32(digit1, 10);
    digit2 = _mm256_srli_epi32(digit2, 4);
    digit3 = _mm256_srli_epi32(digit3, 18);

    vec = _mm256_or_si256(_mm256_or_si256(digit0, digit1), _mm256_or_si256(digit2, digit3));

    /* Finally translate to the base64 character set */
    return encode_chars(vec);
}

void aws_common_private_base64_encode_sse41(const uint8_t *input, uint8_t *output, size_t inlen) {
    __m256i instride, outstride;

    while (inlen >= 32) {
        /*
         * Where possible, we'll load a full vector at a time and ignore the over-read.
         * However, if we have < 32 bytes left, this would result in a potential read
         * of unreadable pages, so we use bounce buffers below.
         */
        instride = _mm256_loadu_si256((__m256i const *)input);
        outstride = encode_stride(instride);
        _mm256_storeu_si256((__m256i *)output, outstride);

        input += 24;
        output += 32;
        inlen -= 24;
    }

    while (inlen) {
        /*
         * We need to go through a bounce buffer for anything remaining, as we
         * don't want to over-read or over-write the ends of the buffers.
         */
        size_t stridelen = inlen > 24 ? 24 : inlen;
        size_t outlen = ((stridelen + 2) / 3) * 4;

        memset(&instride, 0, sizeof(instride));
        memcpy(&instride, input, stridelen);

        outstride = encode_stride(instride);
        memcpy(output, &outstride, outlen);

        if (inlen < 24) {
            if (inlen % 3 >= 1) {
                /* AA== or AAA= */
                output[outlen - 1] = '=';
            }
            if (inlen % 3 == 1) {
                /* AA== */
                output[outlen - 2] = '=';
            }

            return;
        }

        input += stridelen;
        output += outlen;
        inlen -= stridelen;
    }
}
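And a matching usage sketch for the encoder, again not part of the library; the output buffer needs 4 * ceil(inlen / 3) bytes, and the tail path above writes the '=' padding:

```c
#include <stdint.h>
#include <stdio.h>

extern void aws_common_private_base64_encode_sse41(const uint8_t *input, uint8_t *output, size_t inlen);

int main(void) {
    const uint8_t raw[5] = {'h', 'e', 'l', 'l', 'o'};
    uint8_t encoded[9] = {0}; /* 4 * ceil(5 / 3) = 8 characters, plus a NUL for printing */

    aws_common_private_base64_encode_sse41(raw, encoded, sizeof(raw));
    printf("%s\n", (const char *)encoded); /* expected: aGVsbG8= */
    return 0;
}
```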