diff options
author | yazevnul <yazevnul@yandex-team.ru> | 2022-02-10 16:46:48 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:48 +0300 |
commit | 9abfb1a53b7f7b791444d1378e645d8fad9b06ed (patch) | |
tree | 49e222ea1c5804306084bb3ae065bb702625360f /contrib/libs/base64/neon64/enc_neon.c | |
parent | 8cbc307de0221f84c80c42dcbe07d40727537e2c (diff) | |
download | ydb-9abfb1a53b7f7b791444d1378e645d8fad9b06ed.tar.gz |
Restoring authorship annotation for <yazevnul@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/base64/neon64/enc_neon.c')
-rw-r--r-- | contrib/libs/base64/neon64/enc_neon.c | 74 |
1 files changed, 37 insertions, 37 deletions
```diff
diff --git a/contrib/libs/base64/neon64/enc_neon.c b/contrib/libs/base64/neon64/enc_neon.c
index 2ba5a561e9..9cf28a11f8 100644
--- a/contrib/libs/base64/neon64/enc_neon.c
+++ b/contrib/libs/base64/neon64/enc_neon.c
@@ -1,37 +1,37 @@
-// If we have ARM NEON support, pick off 48 bytes at a time:
-while (srclen >= 48)
-{
- uint8x16x3_t str;
- uint8x16x4_t res;
-
- // Load 48 bytes and deinterleave:
- str = vld3q_u8((uint8_t *)c);
-
- // Divide bits of three input bytes over four output bytes:
- res.val[0] = vshrq_n_u8(str.val[0], 2);
- res.val[1] = vshrq_n_u8(str.val[1], 4) | vshlq_n_u8(str.val[0], 4);
- res.val[2] = vshrq_n_u8(str.val[2], 6) | vshlq_n_u8(str.val[1], 2);
- res.val[3] = str.val[2];
-
- // Clear top two bits:
- res.val[0] &= vdupq_n_u8(0x3F);
- res.val[1] &= vdupq_n_u8(0x3F);
- res.val[2] &= vdupq_n_u8(0x3F);
- res.val[3] &= vdupq_n_u8(0x3F);
-
- // The bits have now been shifted to the right locations;
- // translate their values 0..63 to the Base64 alphabet.
- // Use a 64-byte table lookup:
- res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
- res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
- res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
- res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);
-
- // Interleave and store result:
- vst4q_u8((uint8_t *)o, res);
-
- c += 48; // 3 * 16 bytes of input
- o += 64; // 4 * 16 bytes of output
- outl += 64;
- srclen -= 48;
-}
+// If we have ARM NEON support, pick off 48 bytes at a time:
+while (srclen >= 48)
+{
+ uint8x16x3_t str;
+ uint8x16x4_t res;
+
+ // Load 48 bytes and deinterleave:
+ str = vld3q_u8((uint8_t *)c);
+
+ // Divide bits of three input bytes over four output bytes:
+ res.val[0] = vshrq_n_u8(str.val[0], 2);
+ res.val[1] = vshrq_n_u8(str.val[1], 4) | vshlq_n_u8(str.val[0], 4);
+ res.val[2] = vshrq_n_u8(str.val[2], 6) | vshlq_n_u8(str.val[1], 2);
+ res.val[3] = str.val[2];
+
+ // Clear top two bits:
+ res.val[0] &= vdupq_n_u8(0x3F);
+ res.val[1] &= vdupq_n_u8(0x3F);
+ res.val[2] &= vdupq_n_u8(0x3F);
+ res.val[3] &= vdupq_n_u8(0x3F);
+
+ // The bits have now been shifted to the right locations;
+ // translate their values 0..63 to the Base64 alphabet.
+ // Use a 64-byte table lookup:
+ res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
+ res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
+ res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
+ res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);
+
+ // Interleave and store result:
+ vst4q_u8((uint8_t *)o, res);
+
+ c += 48; // 3 * 16 bytes of input
+ o += 64; // 4 * 16 bytes of output
+ outl += 64;
+ srclen -= 48;
+}
```