author | yazevnul <yazevnul@yandex-team.ru> | 2022-02-10 16:46:48 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:48 +0300
commit | 9abfb1a53b7f7b791444d1378e645d8fad9b06ed (patch)
tree | 49e222ea1c5804306084bb3ae065bb702625360f /contrib/libs/base64/neon32/codec_neon32.c
parent | 8cbc307de0221f84c80c42dcbe07d40727537e2c (diff)
download | ydb-9abfb1a53b7f7b791444d1378e645d8fad9b06ed.tar.gz
Restoring authorship annotation for <yazevnul@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/base64/neon32/codec_neon32.c')
-rw-r--r-- | contrib/libs/base64/neon32/codec_neon32.c | 320
1 file changed, 160 insertions, 160 deletions
diff --git a/contrib/libs/base64/neon32/codec_neon32.c b/contrib/libs/base64/neon32/codec_neon32.c
index 05fcfc3e63..2c9ae02f75 100644
--- a/contrib/libs/base64/neon32/codec_neon32.c
+++ b/contrib/libs/base64/neon32/codec_neon32.c
@@ -1,160 +1,160 @@

Since this commit only restores authorship annotations, the 160 deleted lines are byte-for-byte identical to the 160 inserted lines; the restored file reads as follows:

#if (defined(__ARM_NEON) && !defined(__ARM_NEON__))
#define __ARM_NEON__
#endif

#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif

#include "libbase64.h"
#include "codecs.h"

#if (defined(__arm__) && defined(__ARM_NEON__))

#define CMPGT(s,n)   vcgtq_u8((s), vdupq_n_u8(n))
#define CMPEQ(s,n)   vceqq_u8((s), vdupq_n_u8(n))
#define REPLACE(s,n) vandq_u8((s), vdupq_n_u8(n))
#define RANGE(s,a,b) vandq_u8(vcgeq_u8((s), vdupq_n_u8(a)), vcleq_u8((s), vdupq_n_u8(b)))

static inline uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
	uint8x16x4_t out;

	// Divide bits of three input bytes over four output bytes:
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vorrq_u8(vshrq_n_u8(in.val[1], 4), vshlq_n_u8(in.val[0], 4));
	out.val[2] = vorrq_u8(vshrq_n_u8(in.val[2], 6), vshlq_n_u8(in.val[1], 2));
	out.val[3] = in.val[2];

	// Clear top two bits:
	out.val[0] = vandq_u8(out.val[0], vdupq_n_u8(0x3F));
	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
	out.val[3] = vandq_u8(out.val[3], vdupq_n_u8(0x3F));

	return out;
}

static inline uint8x16x4_t
enc_translate (uint8x16x4_t in)
{
	uint8x16x4_t mask1, mask2, mask3, mask4, out;

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To        Abs   Delta  Characters
	// 0  [0..25]   [65..90]  +65   +65    ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122] +71   +6     abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]  -4    -75    0123456789
	// 3  [62]      [43]      -19   -15    +
	// 4  [63]      [47]      -16   +3     /

	// Create cumulative masks for characters in sets [1,2,3,4], [2,3,4],
	// [3,4], and [4]:
	mask1.val[0] = CMPGT(in.val[0], 25);
	mask1.val[1] = CMPGT(in.val[1], 25);
	mask1.val[2] = CMPGT(in.val[2], 25);
	mask1.val[3] = CMPGT(in.val[3], 25);

	mask2.val[0] = CMPGT(in.val[0], 51);
	mask2.val[1] = CMPGT(in.val[1], 51);
	mask2.val[2] = CMPGT(in.val[2], 51);
	mask2.val[3] = CMPGT(in.val[3], 51);

	mask3.val[0] = CMPGT(in.val[0], 61);
	mask3.val[1] = CMPGT(in.val[1], 61);
	mask3.val[2] = CMPGT(in.val[2], 61);
	mask3.val[3] = CMPGT(in.val[3], 61);

	mask4.val[0] = CMPEQ(in.val[0], 63);
	mask4.val[1] = CMPEQ(in.val[1], 63);
	mask4.val[2] = CMPEQ(in.val[2], 63);
	mask4.val[3] = CMPEQ(in.val[3], 63);

	// All characters are at least in cumulative set 0, so add 'A':
	out.val[0] = vaddq_u8(in.val[0], vdupq_n_u8(65));
	out.val[1] = vaddq_u8(in.val[1], vdupq_n_u8(65));
	out.val[2] = vaddq_u8(in.val[2], vdupq_n_u8(65));
	out.val[3] = vaddq_u8(in.val[3], vdupq_n_u8(65));

	// For inputs which are also in any of the other cumulative sets,
	// add delta values against the previous set(s) to correct the shift:
	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask1.val[0], 6));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask1.val[1], 6));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask1.val[2], 6));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask1.val[3], 6));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask2.val[0], 75));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask2.val[1], 75));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask2.val[2], 75));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask2.val[3], 75));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask3.val[0], 15));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask3.val[1], 15));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask3.val[2], 15));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask3.val[3], 15));

	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask4.val[0], 3));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask4.val[1], 3));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask4.val[2], 3));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask4.val[3], 3));

	return out;
}

#endif

// Stride size is so large on these NEON 32-bit functions
// (48 bytes encode, 32 bytes decode) that we inline the
// uint32 codec to stay performant on smaller inputs.

void
neon32_base64_stream_encode
	( struct neon32_base64_state *state
	, const char *src
	, size_t srclen
	, char *out
	, size_t *outlen
	)
{
#if (defined(__arm__) && defined(__ARM_NEON__))
	#include "enc_head.c"
	#include "enc_neon.c"
	#include "enc_uint32.c"
	#include "enc_tail.c"
#else
	(void)state;
	(void)src;
	(void)srclen;
	(void)out;
	(void)outlen;
	abort();
#endif
}

int
neon32_base64_stream_decode
	( struct neon32_base64_state *state
	, const char *src
	, size_t srclen
	, char *out
	, size_t *outlen
	)
{
#if (defined(__arm__) && defined(__ARM_NEON__))
	#include "dec_head.c"
	#include "dec_neon.c"
	#include "dec_uint32.c"
	#include "dec_tail.c"
#else
	(void)state;
	(void)src;
	(void)srclen;
	(void)out;
	(void)outlen;
	abort();
#endif
}
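For readers unpacking the intrinsics above: each of the 16 NEON lanes performs ordinary per-byte arithmetic. The following is a minimal scalar sketch of that math, not part of libbase64; the names scalar_reshuffle and scalar_translate are illustrative only. It splits one 3-byte group into four 6-bit values the same way enc_reshuffle does across lanes, then applies the same cumulative deltas (+65, +6, -75, -15, +3) as enc_translate.

#include <stdint.h>
#include <stdio.h>

// Step 1: split three 8-bit bytes into four 6-bit indices, mirroring
// the shift/or/mask sequence in enc_reshuffle().
static void
scalar_reshuffle (const uint8_t in[3], uint8_t out[4])
{
	out[0] = in[0] >> 2;                           // top 6 bits of byte 0
	out[1] = ((in[0] << 4) | (in[1] >> 4)) & 0x3F; // low 2 of byte 0, top 4 of byte 1
	out[2] = ((in[1] << 2) | (in[2] >> 6)) & 0x3F; // low 4 of byte 1, top 2 of byte 2
	out[3] = in[2] & 0x3F;                         // low 6 bits of byte 2
}

// Step 2: map a 6-bit index to its Base64 character with the same
// cumulative corrections as enc_translate(): start at 'A' (+65), then
// adjust as the index crosses each of the five character sets.
static uint8_t
scalar_translate (uint8_t v)
{
	uint8_t c = v + 65;   // set 0: 'A'..'Z'
	if (v > 25) c += 6;   // sets 1+: 'a' starts at 97 = 65 + 26 + 6
	if (v > 51) c -= 75;  // sets 2+: '0'..'9'
	if (v > 61) c -= 15;  // sets 3+: '+'
	if (v == 63) c += 3;  // set 4: '/'
	return c;
}

int
main (void)
{
	const uint8_t in[3] = { 'M', 'a', 'n' };
	uint8_t idx[4];
	scalar_reshuffle(in, idx);
	for (int i = 0; i < 4; i++) {
		putchar(scalar_translate(idx[i]));
	}
	putchar('\n'); // prints "TWFu"
	return 0;
}

On the classic input "Man" this prints TWFu, consistent with the five-set table: indices 19, 22, and 5 fall in set 0 ('T', 'W', 'F'), and index 46 falls in set 1 ('u').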