diff options
author | yazevnul <yazevnul@yandex-team.ru> | 2022-02-10 16:46:46 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:46 +0300 |
commit | 8cbc307de0221f84c80c42dcbe07d40727537e2c (patch) | |
tree | 625d5a673015d1df891e051033e9fcde5c7be4e5 /contrib/libs/base64/neon32/dec_neon.c | |
parent | 30d1ef3941e0dc835be7609de5ebee66958f215a (diff) | |
download | ydb-8cbc307de0221f84c80c42dcbe07d40727537e2c.tar.gz |
Restoring authorship annotation for <yazevnul@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/base64/neon32/dec_neon.c')
-rw-r--r-- | contrib/libs/base64/neon32/dec_neon.c | 154 |
1 files changed, 77 insertions, 77 deletions
diff --git a/contrib/libs/base64/neon32/dec_neon.c b/contrib/libs/base64/neon32/dec_neon.c index 713d8ca9a4..30d846a916 100644 --- a/contrib/libs/base64/neon32/dec_neon.c +++ b/contrib/libs/base64/neon32/dec_neon.c @@ -1,77 +1,77 @@ -// If we have NEON support, pick off 64 bytes at a time for as long as we can. -// Unlike the SSE codecs, we don't write trailing zero bytes to output, so we -// don't need to check if we have enough remaining input to cover them: -while (srclen >= 64) -{ - uint8x16x4_t set1, set2, set3, set4, set5, set6, set7, delta; - uint8x16x3_t dec; - - // Load 64 bytes and deinterleave: - uint8x16x4_t str = vld4q_u8((uint8_t *)c); - - // The input consists of six character sets in the Base64 alphabet, - // which we need to map back to the 6-bit values they represent. - // There are three ranges, two singles, and then there's the rest. - // - // # From To Add Characters - // 1 [43] [62] +19 + - // 2 [47] [63] +16 / - // 3 [48..57] [52..61] +4 0..9 - // 4 [65..90] [0..25] -65 A..Z - // 5 [97..122] [26..51] -71 a..z - // (6) Everything else => invalid input - - // Benchmarking on the Raspberry Pi 2B and Clang shows that looping - // generates slightly faster code than explicit unrolling: - for (int i = 0; i < 4; i++) { - set1.val[i] = CMPEQ(str.val[i], '+'); - set2.val[i] = CMPEQ(str.val[i], '/'); - set3.val[i] = RANGE(str.val[i], '0', '9'); - set4.val[i] = RANGE(str.val[i], 'A', 'Z'); - set5.val[i] = RANGE(str.val[i], 'a', 'z'); - set6.val[i] = CMPEQ(str.val[i], '-'); - set7.val[i] = CMPEQ(str.val[i], '_'); - - delta.val[i] = REPLACE(set1.val[i], 19); - delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set2.val[i], 16)); - delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set3.val[i], 4)); - delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set4.val[i], -65)); - delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set5.val[i], -71)); - delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set6.val[i], 17)); - delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set7.val[i], -32)); - } - - // Check for invalid input: if any of the delta values are zero, - // fall back on bytewise code to do error checking and reporting: - uint8x16_t classified = CMPEQ(delta.val[0], 0); - classified = vorrq_u8(classified, CMPEQ(delta.val[1], 0)); - classified = vorrq_u8(classified, CMPEQ(delta.val[2], 0)); - classified = vorrq_u8(classified, CMPEQ(delta.val[3], 0)); - - // Extract both 32-bit halves; check that all bits are zero: - if (vgetq_lane_u32((uint32x4_t)classified, 0) != 0 - || vgetq_lane_u32((uint32x4_t)classified, 1) != 0 - || vgetq_lane_u32((uint32x4_t)classified, 2) != 0 - || vgetq_lane_u32((uint32x4_t)classified, 3) != 0) { - break; - } - - // Now simply add the delta values to the input: - str.val[0] = vaddq_u8(str.val[0], delta.val[0]); - str.val[1] = vaddq_u8(str.val[1], delta.val[1]); - str.val[2] = vaddq_u8(str.val[2], delta.val[2]); - str.val[3] = vaddq_u8(str.val[3], delta.val[3]); - - // Compress four bytes into three: - dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4); - dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2); - dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3]; - - // Interleave and store decoded result: - vst3q_u8((uint8_t *)o, dec); - - c += 64; - o += 48; - outl += 48; - srclen -= 64; -} +// If we have NEON support, pick off 64 bytes at a time for as long as we can. +// Unlike the SSE codecs, we don't write trailing zero bytes to output, so we +// don't need to check if we have enough remaining input to cover them: +while (srclen >= 64) +{ + uint8x16x4_t set1, set2, set3, set4, set5, set6, set7, delta; + uint8x16x3_t dec; + + // Load 64 bytes and deinterleave: + uint8x16x4_t str = vld4q_u8((uint8_t *)c); + + // The input consists of six character sets in the Base64 alphabet, + // which we need to map back to the 6-bit values they represent. + // There are three ranges, two singles, and then there's the rest. + // + // # From To Add Characters + // 1 [43] [62] +19 + + // 2 [47] [63] +16 / + // 3 [48..57] [52..61] +4 0..9 + // 4 [65..90] [0..25] -65 A..Z + // 5 [97..122] [26..51] -71 a..z + // (6) Everything else => invalid input + + // Benchmarking on the Raspberry Pi 2B and Clang shows that looping + // generates slightly faster code than explicit unrolling: + for (int i = 0; i < 4; i++) { + set1.val[i] = CMPEQ(str.val[i], '+'); + set2.val[i] = CMPEQ(str.val[i], '/'); + set3.val[i] = RANGE(str.val[i], '0', '9'); + set4.val[i] = RANGE(str.val[i], 'A', 'Z'); + set5.val[i] = RANGE(str.val[i], 'a', 'z'); + set6.val[i] = CMPEQ(str.val[i], '-'); + set7.val[i] = CMPEQ(str.val[i], '_'); + + delta.val[i] = REPLACE(set1.val[i], 19); + delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set2.val[i], 16)); + delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set3.val[i], 4)); + delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set4.val[i], -65)); + delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set5.val[i], -71)); + delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set6.val[i], 17)); + delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set7.val[i], -32)); + } + + // Check for invalid input: if any of the delta values are zero, + // fall back on bytewise code to do error checking and reporting: + uint8x16_t classified = CMPEQ(delta.val[0], 0); + classified = vorrq_u8(classified, CMPEQ(delta.val[1], 0)); + classified = vorrq_u8(classified, CMPEQ(delta.val[2], 0)); + classified = vorrq_u8(classified, CMPEQ(delta.val[3], 0)); + + // Extract both 32-bit halves; check that all bits are zero: + if (vgetq_lane_u32((uint32x4_t)classified, 0) != 0 + || vgetq_lane_u32((uint32x4_t)classified, 1) != 0 + || vgetq_lane_u32((uint32x4_t)classified, 2) != 0 + || vgetq_lane_u32((uint32x4_t)classified, 3) != 0) { + break; + } + + // Now simply add the delta values to the input: + str.val[0] = vaddq_u8(str.val[0], delta.val[0]); + str.val[1] = vaddq_u8(str.val[1], delta.val[1]); + str.val[2] = vaddq_u8(str.val[2], delta.val[2]); + str.val[3] = vaddq_u8(str.val[3], delta.val[3]); + + // Compress four bytes into three: + dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4); + dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2); + dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3]; + + // Interleave and store decoded result: + vst3q_u8((uint8_t *)o, dec); + + c += 64; + o += 48; + outl += 48; + srclen -= 64; +} |