diff options
author | yazevnul <yazevnul@yandex-team.ru> | 2022-02-10 16:46:46 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:46 +0300 |
commit | 8cbc307de0221f84c80c42dcbe07d40727537e2c (patch) | |
tree | 625d5a673015d1df891e051033e9fcde5c7be4e5 /contrib/libs/base64/neon64/enc_neon.c | |
parent | 30d1ef3941e0dc835be7609de5ebee66958f215a (diff) | |
download | ydb-8cbc307de0221f84c80c42dcbe07d40727537e2c.tar.gz |
Restoring authorship annotation for <yazevnul@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/base64/neon64/enc_neon.c')
-rw-r--r-- | contrib/libs/base64/neon64/enc_neon.c | 74 |
1 files changed, 37 insertions, 37 deletions
```diff
diff --git a/contrib/libs/base64/neon64/enc_neon.c b/contrib/libs/base64/neon64/enc_neon.c
index 9cf28a11f8..2ba5a561e9 100644
--- a/contrib/libs/base64/neon64/enc_neon.c
+++ b/contrib/libs/base64/neon64/enc_neon.c
@@ -1,37 +1,37 @@
-// If we have ARM NEON support, pick off 48 bytes at a time:
-while (srclen >= 48)
-{
-	uint8x16x3_t str;
-	uint8x16x4_t res;
-
-	// Load 48 bytes and deinterleave:
-	str = vld3q_u8((uint8_t *)c);
-
-	// Divide bits of three input bytes over four output bytes:
-	res.val[0] = vshrq_n_u8(str.val[0], 2);
-	res.val[1] = vshrq_n_u8(str.val[1], 4) | vshlq_n_u8(str.val[0], 4);
-	res.val[2] = vshrq_n_u8(str.val[2], 6) | vshlq_n_u8(str.val[1], 2);
-	res.val[3] = str.val[2];
-
-	// Clear top two bits:
-	res.val[0] &= vdupq_n_u8(0x3F);
-	res.val[1] &= vdupq_n_u8(0x3F);
-	res.val[2] &= vdupq_n_u8(0x3F);
-	res.val[3] &= vdupq_n_u8(0x3F);
-
-	// The bits have now been shifted to the right locations;
-	// translate their values 0..63 to the Base64 alphabet.
-	// Use a 64-byte table lookup:
-	res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
-	res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
-	res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
-	res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);
-
-	// Interleave and store result:
-	vst4q_u8((uint8_t *)o, res);
-
-	c += 48; // 3 * 16 bytes of input
-	o += 64; // 4 * 16 bytes of output
-	outl += 64;
-	srclen -= 48;
-}
+// If we have ARM NEON support, pick off 48 bytes at a time:
+while (srclen >= 48)
+{
+	uint8x16x3_t str;
+	uint8x16x4_t res;
+
+	// Load 48 bytes and deinterleave:
+	str = vld3q_u8((uint8_t *)c);
+
+	// Divide bits of three input bytes over four output bytes:
+	res.val[0] = vshrq_n_u8(str.val[0], 2);
+	res.val[1] = vshrq_n_u8(str.val[1], 4) | vshlq_n_u8(str.val[0], 4);
+	res.val[2] = vshrq_n_u8(str.val[2], 6) | vshlq_n_u8(str.val[1], 2);
+	res.val[3] = str.val[2];
+
+	// Clear top two bits:
+	res.val[0] &= vdupq_n_u8(0x3F);
+	res.val[1] &= vdupq_n_u8(0x3F);
+	res.val[2] &= vdupq_n_u8(0x3F);
+	res.val[3] &= vdupq_n_u8(0x3F);
+
+	// The bits have now been shifted to the right locations;
+	// translate their values 0..63 to the Base64 alphabet.
+	// Use a 64-byte table lookup:
+	res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
+	res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
+	res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
+	res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);
+
+	// Interleave and store result:
+	vst4q_u8((uint8_t *)o, res);
+
+	c += 48; // 3 * 16 bytes of input
+	o += 64; // 4 * 16 bytes of output
+	outl += 64;
+	srclen -= 48;
+}
```