1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
|
// Declared here, defined in another translation unit.
//   in, inlen : input buffer and its length in bytes
//   out       : output buffer
// NOTE(review): judging by the name this is the scalar decoder and the result is
// presumably the number of bytes written to out - confirm against the definition.
size_t _tb64xdec(const unsigned char *in, size_t inlen, unsigned char *out);

// testing only
size_t tb64memcpy(const unsigned char *in, size_t inlen, unsigned char *out);
// Prefetch hint for the address _ip_ + _i_, forwarded to the compiler builtin.
// _rw_ is passed through as the builtin's second argument and must be a
// compile-time constant (0 = read, 1 = write per the GCC documentation).
// Every argument is parenthesized so that a compound pointer expression
// (conditional, assignment, ...) is offset as a whole.
#define PREFETCH(_ip_,_i_,_rw_) __builtin_prefetch((_ip_)+(_i_),(_rw_))
// BSWAP32/BSWAP64: identity when the compiler reports a big-endian target,
// otherwise forwarded to bswap32()/bswap64() (defined elsewhere).
// The identity form is parenthesized so an expression argument stays one operand.
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define BSWAP32(a) (a)
#define BSWAP64(a) (a)
#else
#define BSWAP32(a) bswap32(a)
#define BSWAP64(a) bswap64(a)
#endif
// CHECK0(a) / CHECK1(a) keep or drop the code fragment `a` at compile time:
//   NB64CHECK defined          -> both expand to nothing
//   B64CHECK defined (only)    -> both expand to `a`
//   neither defined            -> CHECK0 expands to `a`, CHECK1 to nothing
// `a` is pasted verbatim (it may be a statement), so it is deliberately not parenthesized.
#if defined(NB64CHECK)
#define CHECK0(a)
#define CHECK1(a)
#elif defined(B64CHECK)
#define CHECK0(a) a
#define CHECK1(a) a
#else
#define CHECK0(a) a
#define CHECK1(a)
#endif
//--------------------- Encoding ----------------------------------------------------------
// Byte lookup table defined elsewhere; this header only indexes it with 6-bit values (0..63).
// NOTE(review): presumably the base64 alphabet - confirm at the definition.
extern unsigned char tb64lutse[];

// SU32(_u_): translate the four 6-bit fields held in bits 31..8 of _u_ through
// tb64lutse and pack the results into one 32-bit word. The topmost field
// (bits 31..26) lands in the lowest result byte, the field in bits 13..8 in the highest.
// _u_ is evaluated four times; it is parenthesized so expression arguments work,
// and each table byte is widened to unsigned before shifting.
#define SU32(_u_) ((unsigned)tb64lutse[((_u_)>> 8) & 0x3f] << 24 |\
                   (unsigned)tb64lutse[((_u_)>>14) & 0x3f] << 16 |\
                   (unsigned)tb64lutse[((_u_)>>20) & 0x3f] <<  8 |\
                   (unsigned)tb64lutse[((_u_)>>26) & 0x3f])
// ETAIL(): encode the final (in+inlen)-ip input bytes at ip and append the result at op.
// Expects in, inlen, ip and op in the enclosing scope and declares _l there,
// so it can be expanded only once per scope.
//   3 bytes left : 4 characters via SU32/stou32; op advances by 4, ip by 3
//   2 bytes left : 3 characters followed by one '='  (ip is not advanced)
//   other non-zero count : 2 characters followed by "==" (ip is not advanced)
//   0 bytes left : nothing is written
// The input bytes are widened to unsigned before shifting: ip[0]<<24 on a promoted
// int overflows for bytes >= 0x80.
#define ETAIL()\
  unsigned _l = (unsigned)((in+inlen) - ip);\
  if(_l == 3) {\
    unsigned _u = (unsigned)ip[0]<<24 | (unsigned)ip[1]<<16 | (unsigned)ip[2]<<8;\
    stou32(op, SU32(_u)); op+=4; ip+=3;\
  } else if(_l) {\
    *op++ = tb64lutse[(ip[0]>>2)&0x3f];\
    if(_l == 2) {\
      *op++ = tb64lutse[(ip[0] & 0x3) << 4 | (ip[1] & 0xf0) >> 4];\
      *op++ = tb64lutse[(ip[1] & 0xf) << 2];\
    } else {\
      *op++ = tb64lutse[(ip[0] & 0x3) << 4];\
      *op++ = '=';\
    }\
    *op++ = '=';\
  }
// 16-bit lookup table defined elsewhere; indexed here with 12-bit values (0..0xfff).
// NOTE(review): presumably each entry holds two output characters - confirm at the definition.
extern const unsigned short tb64lutxe[];

// XU32(_u_): translate the two 12-bit fields held in bits 31..8 of _u_ through
// tb64lutxe. The entry for the upper field (bits 31..20) fills the low 16 bits of
// the result, the entry for bits 19..8 the high 16 bits.
// _u_ is evaluated twice; it is parenthesized so expression arguments work,
// and each table entry is widened to unsigned before shifting.
#define XU32(_u_) ((unsigned)tb64lutxe[((_u_) >>  8) & 0xfff] << 16 |\
                   (unsigned)tb64lutxe[ (_u_) >> 20])
// EXTAIL(): encode with the 12-bit table while more than 4 output bytes remain
// before out+outlen, then hand the rest to ETAIL().
// Expects in, inlen, out, outlen, ip and op in the enclosing scope.
// Each iteration passes ctou32(ip) through BSWAP32 and XU32, stores the result
// with stou32 and advances ip by 3 and op by 4.
// NOTE(review): ctou32 presumably loads 4 bytes although only 3 are consumed - confirm.
// The bound is written as a pointer difference so that no pointer before `out`
// is formed when outlen is smaller than 4.
#define EXTAIL()\
  for(; (out+outlen) - op > 4; op += 4, ip += 3) {\
    unsigned _u = BSWAP32(ctou32(ip));\
    stou32(op, XU32(_u));\
  }\
  ETAIL()
//--------------------- Decoding ----------------------------------------------------------
// Four 32-bit lookup tables defined elsewhere, one per input byte position.
// Each is indexed here with a single byte value (0..255).
extern const unsigned tb64lutxd0[];
extern const unsigned tb64lutxd1[];
extern const unsigned tb64lutxd2[];
extern const unsigned tb64lutxd3[];

// DU32(_u_): look up each of the four bytes of _u_ in its own table (lowest byte
// in tb64lutxd0 ... highest byte in tb64lutxd3) and OR the four entries together.
// _u_ is evaluated four times; it is parenthesized so expression arguments work,
// and the top lane is reduced to a byte like the other three lanes.
#define DU32(_u_) (tb64lutxd0[(unsigned char)((_u_)      )] |\
                   tb64lutxd1[(unsigned char)((_u_)>>  8)] |\
                   tb64lutxd2[(unsigned char)((_u_)>> 16)] |\
                   tb64lutxd3[(unsigned char)((_u_)>> 24)] )
#if 0
// Scalar table-driven decoder, variant without validity tracking (currently compiled out).
//   in, inlen : encoded input and its length in bytes
//   out       : destination buffer
// Returns the number of bytes produced (op-out).
// The bulk loop counts the remaining input bytes instead of comparing ip against
// (in+inlen)-4, which would form a pointer before `in` for inputs shorter than 4 bytes.
static ALWAYS_INLINE size_t _tb64xd(const unsigned char *in, size_t inlen, unsigned char *out) {
  const unsigned char *ip = in;
  unsigned char       *op = out;
  size_t               n  = inlen;                 // input bytes not yet consumed

  // Bulk: 4 input bytes -> 3 output bytes while more than 4 input bytes remain.
  // NOTE(review): stou32 presumably stores 4 bytes; the extra byte would then be
  // overwritten by the next group or by the tail below - confirm.
  for(; n > 4; n -= 4, ip += 4, op += 3) {
    unsigned w = ctou32(ip);
    w = DU32(w);
    stou32(op, w);
  }

  unsigned u = 0, l = (unsigned)n;                 // l: 0..4 bytes left
  if(l == 4 && ip[3] == '=') {                     // trailing '=' shorten the last group
    l = 3;
    if(ip[2] == '=') {
      l = 2;
      if(ip[1] == '=') l = 1;
    }
  }

  const unsigned char *up = (const unsigned char *)&u;   // byte view of the decoded word
  switch(l) {
    case 4: u = ctou32(ip); u = DU32(u);                                     // 4->3 bytes
            op[0] = up[0]; op[1] = up[1]; op[2] = up[2]; op += 3; break;
    case 3: u = tb64lutxd0[ip[0]] | tb64lutxd1[ip[1]] | tb64lutxd2[ip[2]];   // 3->2 bytes
            op[0] = up[0]; op[1] = up[1]; op += 2; break;
    case 2: u = tb64lutxd0[ip[0]] | tb64lutxd1[ip[1]];                       // 2->1 byte
            *op++ = up[0]; break;
    case 1: u = tb64lutxd0[ip[0]];                                           // 1->1 byte
            *op++ = up[0]; break;
  }
  return (size_t)(op - out);
}
#else
// Scalar table-driven decoder with validity tracking.
//   in, inlen : encoded input and its length in bytes
//   out       : destination buffer
// Returns the number of bytes produced (op-out), or 0 when the OR of all decoded
// words has every bit set.
// NOTE(review): presumably the tables return 0xffffffff for characters outside the
// alphabet, which is what makes the accumulated OR all-ones - confirm at the tables.
// The bulk loop counts the remaining input bytes instead of comparing ip against
// (in+inlen)-4, which would form a pointer before `in` for inputs shorter than 4 bytes.
static ALWAYS_INLINE size_t _tb64xd(const unsigned char *in, size_t inlen, unsigned char *out) {
  const unsigned char *ip = in;
  unsigned char       *op = out;
  size_t               n  = inlen;                 // input bytes not yet consumed
  unsigned             cu = 0;                     // OR of every decoded word

  // Bulk: 4 input bytes -> 3 output bytes while more than 4 input bytes remain.
  // NOTE(review): stou32 presumably stores 4 bytes; the extra byte would then be
  // overwritten by the next group or by the tail below - confirm.
  for(; n > 4; n -= 4, ip += 4, op += 3) {
    unsigned w = ctou32(ip);
    w = DU32(w);
    stou32(op, w);
    cu |= w;
  }

  unsigned u = 0, l = (unsigned)n;                 // l: 0..4 bytes left
  if(l == 4 && ip[3] == '=') {                     // trailing '=' shorten the last group
    l = 3;
    if(ip[2] == '=') {
      l = 2;
      if(ip[1] == '=') l = 1;
    }
  }

  const unsigned char *up = (const unsigned char *)&u;   // byte view of the decoded word
  switch(l) {
    case 4: u = ctou32(ip); u = DU32(u);                                     // 4->3 bytes
            op[0] = up[0]; op[1] = up[1]; op[2] = up[2]; op += 3; break;
    case 3: u = tb64lutxd0[ip[0]] | tb64lutxd1[ip[1]] | tb64lutxd2[ip[2]];   // 3->2 bytes
            op[0] = up[0]; op[1] = up[1]; op += 2; break;
    case 2: u = tb64lutxd0[ip[0]] | tb64lutxd1[ip[1]];                       // 2->1 byte
            *op++ = up[0]; break;
    case 1: u = tb64lutxd0[ip[0]];                                           // 1->1 byte
            *op++ = up[0]; break;
  }
  cu |= u;                                         // u stays 0 when no tail bytes were left

  return (cu == ~0u) ? 0 : (size_t)(op - out);
}
#endif
//--------------------------- sse -----------------------------------------------------------------
#if defined(__SSSE3__)
#include <tmmintrin.h>
// MM_PACK8TO6(v, cpv): in-place repacking of v (dec_reshuffle, https://arxiv.org/abs/1704.00605 p.17).
//   step 1: _mm_maddubs_epi16 with byte weights 0x40,0x01 -> per byte pair: first*64 + second
//   step 2: _mm_madd_epi16 with word weights 0x1000,0x0001 -> per word pair: first*4096 + second
//   step 3: byte shuffle of the result with the control vector cpv
// v is both input and output; cpv is read after v has been updated by step 2.
#define MM_PACK8TO6(v, cpv) {\
  const __m128i pack8to6_pairs_ = _mm_maddubs_epi16(v, _mm_set1_epi32(0x01400140));\
  v = _mm_madd_epi16(pack8to6_pairs_, _mm_set1_epi32(0x00011000));\
  v = _mm_shuffle_epi8(v, cpv);\
}
// MM_MAP8TO6(iv, shifted, delta_asso, delta_values, ov): map 8-bit ascii to 6-bit binary.
//   shifted (output) : iv shifted right by 3 within each 32-bit lane
//   ov      (output) : iv plus a per-byte delta taken from delta_values
// The delta is selected by a hash: the byte-wise rounded average of
// shuffle(delta_asso, iv) and shifted, used as the shuffle index into delta_values.
#define MM_MAP8TO6(iv, shifted, delta_asso, delta_values, ov) {\
  shifted = _mm_srli_epi32(iv, 3);\
  const __m128i map8to6_asso_  = _mm_shuffle_epi8(delta_asso, iv);\
  const __m128i map8to6_hash_  = _mm_avg_epu8(map8to6_asso_, shifted);\
  const __m128i map8to6_delta_ = _mm_shuffle_epi8(delta_values, map8to6_hash_);\
  ov = _mm_add_epi8(map8to6_delta_, iv);\
}
// MM_B64CHK(iv, shifted, check_asso, check_values, vx): accumulate a check value for iv into vx.
// A hash (byte-wise rounded average of shuffle(check_asso, iv) and shifted) selects an
// entry of check_values, which is added to iv with signed saturation and OR-ed into vx.
// NOTE(review): `shifted` is presumably the value produced by MM_MAP8TO6 for the same iv,
// and callers presumably inspect vx afterwards to detect invalid input - confirm at call sites.
#define MM_B64CHK(iv, shifted, check_asso, check_values, vx) {\
  const __m128i b64chk_asso_ = _mm_shuffle_epi8(check_asso, iv);\
  const __m128i b64chk_hash_ = _mm_avg_epu8(b64chk_asso_, shifted);\
  const __m128i b64chk_val_  = _mm_shuffle_epi8(check_values, b64chk_hash_);\
  vx = _mm_or_si128(vx, _mm_adds_epi8(b64chk_val_, iv));\
}
// Add a range-dependent offset to every byte of v.
// For byte values 0..63 the selected table index and offset are:
//   0..25  -> index 0,     +65
//   26..51 -> index 1,     +71
//   52..61 -> index 2..11,  -4
//   62     -> index 12,    -19
//   63     -> index 13,    -16
static ALWAYS_INLINE __m128i mm_map6to8(const __m128i v) {
  // Offsets listed from index 0 upward (indices 14 and 15 hold 0).
  const __m128i offsets = _mm_setr_epi8(65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

  const __m128i above51 = _mm_subs_epu8(v, _mm_set1_epi8(51));   // saturating: 0 up to 51, v-51 beyond
  const __m128i above25 = _mm_cmpgt_epi8(v, _mm_set1_epi8(25));  // all-ones (-1) where v > 25
  const __m128i index   = _mm_sub_epi8(above51, above25);        // subtracting -1 adds 1 where v > 25

  return _mm_add_epi8(v, _mm_shuffle_epi8(offsets, index));
}
// Spread four 6-bit fields of every 32-bit lane of v into the four bytes of that lane.
// Two masked copies are shifted with 16-bit multiplies and then merged:
//   fields at bits 10..15 and 22..27 move down (high half of an unsigned multiply),
//   fields at bits 4..9 and 16..21 move up (low half of a multiply).
static ALWAYS_INLINE __m128i mm_unpack6to8(__m128i v) {
  const __m128i down_fields = _mm_and_si128(v, _mm_set1_epi32(0x0fc0fc00));
  const __m128i up_fields   = _mm_and_si128(v, _mm_set1_epi32(0x003f03f0));

  // mulhi by 0x0040 / 0x0400: low word >> 10, high word >> 6
  const __m128i moved_down = _mm_mulhi_epu16(down_fields, _mm_set1_epi32(0x04000040));
  // mullo by 0x0010 / 0x0100: low word << 4, high word << 8
  const __m128i moved_up   = _mm_mullo_epi16(up_fields, _mm_set1_epi32(0x01000010));

  return _mm_or_si128(moved_down, moved_up);
}
#endif
|