/// Source path: root/contrib/clickhouse/src/Functions/toValidUTF8.cpp
/// Blob: 41d29d9c494a4f5829c2dc7571cddfa2dac900be
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <IO/WriteBufferFromVector.h>
#include <IO/WriteHelpers.h>
#include <Poco/UTF8Encoding.h>

#include <string_view>

#include <base/simd.h>

#ifdef __SSE2__
#    include <emmintrin.h>
#endif

#if defined(__aarch64__) && defined(__ARM_NEON)
#    include <arm_neon.h>
#      pragma clang diagnostic ignored "-Wreserved-identifier"
#endif

namespace DB
{

namespace ErrorCodes
{
    /// Declared here, defined elsewhere. Used below as the error code of the exception
    /// thrown by ToValidUTF8Impl::vectorFixed.
    extern const int ILLEGAL_COLUMN;
}

/// Lookup table defined in another translation unit. Below it is indexed by the first byte
/// of a sequence to obtain that sequence's expected length in bytes; a value above 4 is
/// treated as an invalid start of sequence.
extern const UInt8 length_of_utf8_sequence[256];

namespace
{

struct ToValidUTF8Impl
{
    static void toValidUTF8One(const char * begin, const char * end, WriteBuffer & write_buffer)
    {
        static constexpr std::string_view replacement = "\xEF\xBF\xBD";

        const char * p = begin;
        const char * valid_start = begin;

        /// The last recorded character was `replacement`.
        bool just_put_replacement = false;

        auto put_valid = [&write_buffer, &just_put_replacement](const char * data, size_t len)
        {
            if (len == 0)
                return;
            just_put_replacement = false;
            write_buffer.write(data, len);
        };

        auto put_replacement = [&write_buffer, &just_put_replacement]()
        {
            if (just_put_replacement)
                return;
            just_put_replacement = true;
            write_buffer.write(replacement.data(), replacement.size());
        };

        while (p < end)
        {
#ifdef __SSE2__
            /// Fast skip of ASCII
            static constexpr size_t SIMD_BYTES = 16;
            const char * simd_end = p + (end - p) / SIMD_BYTES * SIMD_BYTES;

            while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(p))))
                p += SIMD_BYTES;

            if (!(p < end))
                break;
#elif defined(__aarch64__) && defined(__ARM_NEON)
            /// Fast skip of ASCII for aarch64.
            static constexpr size_t SIMD_BYTES = 16;
            const char * simd_end = p + (end - p) / SIMD_BYTES * SIMD_BYTES;
            /// Other options include
            /// vmaxvq_u8(input) < 0b10000000;
            /// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
            /// SIMDJSON uses it for 64 byte masks, so it's a little different.
            /// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
            /// shrn version has universally <=3 cycles, on servers 2 cycles.
            while (p < simd_end && getNibbleMask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
                p += SIMD_BYTES;

            if (!(p < end))
                break;
#endif

            size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];

            if (len > 4)
            {
                /// Invalid start of sequence. Skip one byte.
                put_valid(valid_start, p - valid_start);
                put_replacement();
                ++p;
                valid_start = p;
            }
            else if (p + len > end)
            {
                /// Sequence was not fully written to this buffer.
                break;
            }
            else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(p), static_cast<int>(len)))
            {
                /// Valid sequence.
                p += len;
            }
            else
            {
                /// Invalid sequence. Skip just first byte.
                put_valid(valid_start, p - valid_start);
                put_replacement();
                ++p;
                valid_start = p;
            }
        }

        put_valid(valid_start, p - valid_start);

        if (p != end)
            put_replacement();
    }

    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        const size_t offsets_size = offsets.size();
        /// It can be larger than that, but we believe it is unlikely to happen.
        res_data.resize(data.size());
        res_offsets.resize(offsets_size);

        size_t prev_offset = 0;
        WriteBufferFromVector<ColumnString::Chars> write_buffer(res_data);
        for (size_t i = 0; i < offsets_size; ++i)
        {
            const char * haystack_data = reinterpret_cast<const char *>(&data[prev_offset]);
            const size_t haystack_size = offsets[i] - prev_offset - 1;
            toValidUTF8One(haystack_data, haystack_data + haystack_size, write_buffer);
            writeChar(0, write_buffer);
            res_offsets[i] = write_buffer.count();
            prev_offset = offsets[i];
        }
        write_buffer.finalize();
    }

    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
    {
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by toValidUTF8 function");
    }
};

/// Holds the function's name as a compile-time string constant.
struct NameToValidUTF8
{
    static constexpr const char * name = "toValidUTF8";
};
/// The function type: FunctionStringToString instantiated with the implementation struct
/// (ToValidUTF8Impl) and the name struct (NameToValidUTF8).
using FunctionToValidUTF8 = FunctionStringToString<ToValidUTF8Impl, NameToValidUTF8>;

}

/// Registration hook: adds FunctionToValidUTF8 to the function factory.
/// NOTE(review): `factory` is presumably introduced by the REGISTER_FUNCTION macro - confirm against its definition.
REGISTER_FUNCTION(ToValidUTF8)
{
    factory.registerFunction<FunctionToValidUTF8>();
}

}