1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
|
#include <Poco/UTF8Encoding.h>
#include <IO/WriteBufferValidUTF8.h>
#include <base/types.h>
#include <base/simd.h>
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#if defined(__aarch64__) && defined(__ARM_NEON)
# include <arm_neon.h>
# pragma clang diagnostic ignored "-Wreserved-identifier"
#endif
namespace DB
{
/// Out-of-class definition of the static member DEFAULT_SIZE (4096).
/// NOTE(review): presumably the default for the constructor's `size` argument;
/// the declaration lives in the header, which is not visible here - confirm.
const size_t WriteBufferValidUTF8::DEFAULT_SIZE = 4096;
/** Index into the table below with the first byte of a UTF-8 sequence to
* get the total length in bytes of the sequence that byte starts (the first
* byte itself included, not just the trailing bytes that follow it).
* Bytes 0x00..0xBF map to 1: that covers ASCII as well as the continuation
* bytes 0x80..0xBF, so a stray continuation byte is looked at as a one-byte
* sequence. 0xC0..0xDF map to 2, 0xE0..0xEF to 3, 0xF0..0xF7 to 4,
* 0xF8..0xFB to 5 and 0xFC..0xFF to 6.
* Note that *legal* UTF-8 sequences can't be 5 or 6 bytes long. Those entries
* are left as-is for anyone who may want to do such conversion, which was
* allowed in earlier algorithms.
*/
extern const UInt8 length_of_utf8_sequence[256] =
{
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
};
/** Builds a validating buffer that writes into `output_buffer_`.
  *
  * output_buffer_      - buffer that receives the validated bytes and the replacements.
  * group_replacements_ - stored in `group_replacements`; putReplacement() consults it
  *                       to skip a replacement right after another one.
  * replacement_        - text stored in `replacement` and written in place of invalid bytes.
  * size                - value handed to the BufferWithOwnMemory base; never less than 32.
  */
WriteBufferValidUTF8::WriteBufferValidUTF8(
    WriteBuffer & output_buffer_, bool group_replacements_, const char * replacement_, size_t size)
    : BufferWithOwnMemory<WriteBuffer>(std::max<size_t>(32, size))
    , output_buffer(output_buffer_)
    , group_replacements(group_replacements_)
    , replacement(replacement_)
{
}
/// Writes `replacement` to the output buffer and records that fact in `just_put_replacement`.
/// Nothing is written when the replacement is empty, or when replacements are grouped
/// and the previous thing written was already a replacement.
inline void WriteBufferValidUTF8::putReplacement()
{
    const bool already_grouped = group_replacements && just_put_replacement;

    if (!replacement.empty() && !already_grouped)
    {
        just_put_replacement = true;
        output_buffer.write(replacement.data(), replacement.size());
    }
}
/// Forwards `len` bytes starting at `data` to the output buffer and clears
/// `just_put_replacement`, so that the next replacement is not treated as a repeat.
/// An empty range is ignored and leaves `just_put_replacement` as it was.
inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
{
    if (len != 0)
    {
        just_put_replacement = false;
        output_buffer.write(data, len);
    }
}
void WriteBufferValidUTF8::nextImpl()
{
char * p = memory.data();
char * valid_start = p;
while (p < pos)
{
#ifdef __SSE2__
/// Fast skip of ASCII for x86.
static constexpr size_t SIMD_BYTES = 16;
const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
p += SIMD_BYTES;
if (!(p < pos))
break;
#elif defined(__aarch64__) && defined(__ARM_NEON)
/// Fast skip of ASCII for aarch64.
static constexpr size_t SIMD_BYTES = 16;
const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
/// Other options include
/// vmaxvq_u8(input) < 0b10000000;
/// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
/// SIMDJSON uses it for 64 byte masks, so it's a little different.
/// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
/// shrn version has universally <=3 cycles, on servers 2 cycles.
while (p < simd_end && getNibbleMask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
p += SIMD_BYTES;
if (!(p < pos))
break;
#endif
UInt8 len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
if (len > 4)
{ // NOLINT
/// Invalid start of sequence. Skip one byte.
putValid(valid_start, p - valid_start);
putReplacement();
++p;
valid_start = p;
}
else if (p + len > pos)
{
/// Sequence was not fully written to this buffer.
break;
}
else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len))
{
/// Valid sequence.
p += len;
}
else
{
/// Invalid sequence. Skip just first byte.
putValid(valid_start, p - valid_start);
putReplacement();
++p;
valid_start = p;
}
}
putValid(valid_start, p - valid_start);
size_t cnt = pos - p;
/// Shift unfinished sequence to start of buffer.
for (size_t i = 0; i < cnt; ++i)
memory[i] = p[i];
working_buffer = Buffer(&memory[cnt], memory.data() + memory.size());
}
/// Finalizes the buffer on destruction by calling finalize().
/// NOTE(review): finalize() is inherited and not visible here; if it can throw,
/// an exception escaping this destructor terminates the program - confirm.
WriteBufferValidUTF8::~WriteBufferValidUTF8()
{
    this->finalize();
}
void WriteBufferValidUTF8::finalizeImpl()
{
/// Write all complete sequences from buffer.
nextImpl();
/// If unfinished sequence at end, then write replacement.
if (working_buffer.begin() != memory.data())
putReplacement();
}
}
|