// Some compilers define only the ACLE macro __ARM_NEON; the guards below
// test the older __ARM_NEON__ spelling, so map one onto the other:
#if (defined(__ARM_NEON) && !defined(__ARM_NEON__))
#define __ARM_NEON__
#endif

#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif

#include "libbase64.h"
#include "codecs.h"

#if (defined(__arm__) && defined(__ARM_NEON__))

// Byte-wise helpers: CMPGT/CMPEQ compare every byte of s against n and
// yield 0xFF/0x00 lane masks, REPLACE keeps the constant n only in the
// lanes selected by such a mask, and RANGE tests a <= byte <= b:
#define CMPGT(s,n)	vcgtq_u8((s), vdupq_n_u8(n))
#define CMPEQ(s,n)	vceqq_u8((s), vdupq_n_u8(n))
#define REPLACE(s,n)	vandq_u8((s), vdupq_n_u8(n))
#define RANGE(s,a,b)	vandq_u8(vcgeq_u8((s), vdupq_n_u8(a)), vcleq_u8((s), vdupq_n_u8(b)))

static inline uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
	uint8x16x4_t out;

	// Divide bits of three input bytes over four output bytes:
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vorrq_u8(vshrq_n_u8(in.val[1], 4), vshlq_n_u8(in.val[0], 4));
	out.val[2] = vorrq_u8(vshrq_n_u8(in.val[2], 6), vshlq_n_u8(in.val[1], 2));
	out.val[3] = in.val[2];

	// Clear top two bits:
	out.val[0] = vandq_u8(out.val[0], vdupq_n_u8(0x3F));
	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
	out.val[3] = vandq_u8(out.val[3], vdupq_n_u8(0x3F));

	return out;
}
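
// Illustrative scalar sketch (not used by the codec; the function name is
// ours, added for clarity): the same bit split applied to one 3-byte group.
// The NEON version above does this for 16 groups at once on lanes that the
// loader has de-interleaved (vld3q_u8 in upstream libbase64), so val[0],
// val[1] and val[2] carry the first, second and third byte of each group.
static inline void
enc_reshuffle_scalar_sketch (const uint8_t in[3], uint8_t out[4])
{
	out[0] =   in[0] >> 2;
	out[1] = ((in[0] << 4) | (in[1] >> 4)) & 0x3F;
	out[2] = ((in[1] << 2) | (in[2] >> 6)) & 0x3F;
	out[3] =   in[2]       & 0x3F;
}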

static inline uint8x16x4_t
enc_translate (uint8x16x4_t in)
{
	uint8x16x4_t mask1, mask2, mask3, mask4, out;

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs  Delta  Characters
	// 0  [0..25]   [65..90]   +65  +65    ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71   +6    abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  -75    0123456789
	// 3  [62]      [43]       -19  -15    +
	// 4  [63]      [47]       -16   +3    /
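	//
	// Worked examples (illustrative): 52 is in cumulative sets 1 and 2,
	// so it becomes 52 + 65 + 6 - 75 = 48 = '0'; 63 is in all four sets,
	// so it becomes 63 + 65 + 6 - 75 - 15 + 3 = 47 = '/'.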

	// Create cumulative masks for characters in sets [1,2,3,4], [2,3,4],
	// [3,4], and [4]:
	mask1.val[0] = CMPGT(in.val[0], 25);
	mask1.val[1] = CMPGT(in.val[1], 25);
	mask1.val[2] = CMPGT(in.val[2], 25);
	mask1.val[3] = CMPGT(in.val[3], 25);

	mask2.val[0] = CMPGT(in.val[0], 51);
	mask2.val[1] = CMPGT(in.val[1], 51);
	mask2.val[2] = CMPGT(in.val[2], 51);
	mask2.val[3] = CMPGT(in.val[3], 51);

	mask3.val[0] = CMPGT(in.val[0], 61);
	mask3.val[1] = CMPGT(in.val[1], 61);
	mask3.val[2] = CMPGT(in.val[2], 61);
	mask3.val[3] = CMPGT(in.val[3], 61);

	mask4.val[0] = CMPEQ(in.val[0], 63);
	mask4.val[1] = CMPEQ(in.val[1], 63);
	mask4.val[2] = CMPEQ(in.val[2], 63);
	mask4.val[3] = CMPEQ(in.val[3], 63);

	// All characters are at least in cumulative set 0, so add 'A':
	out.val[0] = vaddq_u8(in.val[0], vdupq_n_u8(65));
	out.val[1] = vaddq_u8(in.val[1], vdupq_n_u8(65));
	out.val[2] = vaddq_u8(in.val[2], vdupq_n_u8(65));
	out.val[3] = vaddq_u8(in.val[3], vdupq_n_u8(65));

	// For inputs that also fall into later cumulative sets, apply each
	// set's delta relative to the previous set to correct the offset:
	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask1.val[0], 6));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask1.val[1], 6));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask1.val[2], 6));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask1.val[3], 6));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask2.val[0], 75));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask2.val[1], 75));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask2.val[2], 75));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask2.val[3], 75));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask3.val[0], 15));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask3.val[1], 15));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask3.val[2], 15));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask3.val[3], 15));

	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask4.val[0], 3));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask4.val[1], 3));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask4.val[2], 3));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask4.val[3], 3));

	return out;
}

#endif

// Stride size is so large on these NEON 32-bit functions
// (48 bytes encode, 32 bytes decode) that we inline the
// uint32 codec to stay performant on smaller inputs.
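// In other words, each NEON pass consumes 48 input bytes (16 groups of 3)
// and emits 64 characters when encoding, and consumes 32 characters
// (8 groups of 4) to emit 24 bytes when decoding; shorter inputs fall
// through to the inlined uint32 path and the byte-wise tail included below.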

void
neon32_base64_stream_encode
	( struct neon32_base64_state	*state
	, const char		*src
	, size_t		 srclen
	, char			*out
	, size_t		*outlen
	)
{
#if (defined(__arm__) && defined(__ARM_NEON__))
	#include "enc_head.c"
	#include "enc_neon.c"
	#include "enc_uint32.c"
	#include "enc_tail.c"
#else
	(void)state;
	(void)src;
	(void)srclen;
	(void)out;
	(void)outlen;
	abort();
#endif
}

int
neon32_base64_stream_decode
	( struct neon32_base64_state	*state
	, const char		*src
	, size_t		 srclen
	, char			*out
	, size_t		*outlen
	)
{
#if (defined(__arm__) && defined(__ARM_NEON__))
	#include "dec_head.c"
	#include "dec_neon.c"
	#include "dec_uint32.c"
	#include "dec_tail.c"
#else
	(void)state;
	(void)src;
	(void)srclen;
	(void)out;
	(void)outlen;
	abort();
#endif
}
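
// A hypothetical usage sketch of the streaming interface (the init/final
// companion names below are assumed to mirror the upstream libbase64 API;
// check libbase64.h for the exact declarations shipped with this codec):
//
//	struct neon32_base64_state st;
//	size_t written = 0, flushed = 0;
//
//	neon32_base64_stream_encode_init(&st, 0);                         // assumed name
//	neon32_base64_stream_encode(&st, src, srclen, out, &written);
//	neon32_base64_stream_encode_final(&st, out + written, &flushed);  // assumed name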