blob: 9cf28a11f8c6ccb04c1d5c3a883a21a6836aa997 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
// ARM NEON (AArch64) bulk path: while at least 48 input bytes remain, turn
// 48 input bytes into 64 Base64 characters per iteration.
//
// Uses these names from the enclosing scope:
//   c       - input cursor, advanced by 48 per iteration
//   o       - output cursor, advanced by 64 per iteration
//   srclen  - input bytes remaining, reduced by 48 per iteration
//   outl    - running output byte count, increased by 64 per iteration
//   tbl_enc - four-register table handed to vqtbl4q_u8 (presumably the
//             64-character Base64 alphabet; defined outside this fragment)
//
// Only NEON intrinsics are used for the bitwise operations (no `|` / `&`
// operators on vector types), so the code does not depend on the GCC/Clang
// vector extension.
while (srclen >= 48)
{
	// Mask that keeps the low six bits of every lane:
	const uint8x16_t mask6 = vdupq_n_u8(0x3F);

	uint8x16x3_t str;
	uint8x16x4_t res;

	// Load 48 bytes and deinterleave: val[0], val[1] and val[2] receive
	// the first, second and third byte of each of the 16 three-byte groups.
	str = vld3q_u8((const uint8_t *)c);

	// Divide bits of three input bytes over four output bytes.
	// val[0]: top six bits of byte 0. The right shift by two already
	// zeroes the two high bits, so no mask is needed for this one.
	res.val[0] = vshrq_n_u8(str.val[0], 2);

	// val[1]: low two bits of byte 0 followed by top four bits of byte 1.
	res.val[1] = vorrq_u8(vshrq_n_u8(str.val[1], 4),
	                      vshlq_n_u8(str.val[0], 4));

	// val[2]: low four bits of byte 1 followed by top two bits of byte 2.
	res.val[2] = vorrq_u8(vshrq_n_u8(str.val[2], 6),
	                      vshlq_n_u8(str.val[1], 2));

	// Clear the top two bits where the shifts left stray bits behind;
	// val[3] is simply the low six bits of byte 2.
	res.val[1] = vandq_u8(res.val[1], mask6);
	res.val[2] = vandq_u8(res.val[2], mask6);
	res.val[3] = vandq_u8(str.val[2], mask6);

	// The bits have now been shifted to the right locations;
	// translate their values 0..63 through the 64-byte lookup table:
	res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
	res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
	res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
	res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);

	// Interleave the four result registers and store 64 bytes:
	vst4q_u8((uint8_t *)o, res);

	c += 48;	// 3 * 16 bytes of input
	o += 64;	// 4 * 16 bytes of output
	outl += 64;
	srclen -= 48;
}
|