contrib/libs/base64/neon64/enc_neon.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

// NEON fast path: while at least 48 input bytes remain, encode them in
// one step into 64 Base64 output characters.
// Relies on `c` (input cursor), `o` (output cursor), `outl` (running
// output length), `srclen` (remaining input length) and `tbl_enc` (the
// 64-byte lookup table), all provided by the enclosing code.
while (srclen >= 48)
{
	// Mask that keeps only the low six bits of every lane:
	const uint8x16_t low6 = vdupq_n_u8(0x3F);

	// Deinterleaving load of 48 bytes: in.val[k] receives the k-th byte
	// of each consecutive three-byte group.
	const uint8x16x3_t in = vld3q_u8((uint8_t *)c);
	uint8x16x4_t out;

	// Spread the 24 bits of every three-byte group over four 6-bit
	// indices, clearing the top two bits of each lane as we go:
	//   out0 = top six bits of byte 0
	//   out1 = low two bits of byte 0, top four bits of byte 1
	//   out2 = low four bits of byte 1, top two bits of byte 2
	//   out3 = low six bits of byte 2
	out.val[0] = vandq_u8(vshrq_n_u8(in.val[0], 2), low6);
	out.val[1] = vandq_u8(vorrq_u8(vshrq_n_u8(in.val[1], 4),
	                               vshlq_n_u8(in.val[0], 4)), low6);
	out.val[2] = vandq_u8(vorrq_u8(vshrq_n_u8(in.val[2], 6),
	                               vshlq_n_u8(in.val[1], 2)), low6);
	out.val[3] = vandq_u8(in.val[2], low6);

	// Every lane now holds an index in 0..63; map each one to its
	// Base64 alphabet character through the 64-byte table:
	out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]);
	out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]);
	out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]);
	out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]);

	// Interleaving store: writes the four lanes back in output order.
	vst4q_u8((uint8_t *)o, out);

	// Account for the chunk just processed.
	srclen -= 48;	// 3 * 16 bytes of input consumed
	c += 48;
	o += 64;	// 4 * 16 bytes of output produced
	outl += 64;
}