contrib/clickhouse/src/Common/StringUtils/StringUtils.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325

#pragma once

#include <algorithm>
#include <string>
#include <cstring>
#include <cstddef>
#include <cstdint>
#include <type_traits>


/// Helpers that the public startsWith / endsWith overloads in this header forward to.
/// Only the declarations are visible here; the definitions live outside this header.
namespace detail
{
    /// Takes the candidate prefix as a pointer plus an explicit byte count.
    /// NOTE(review): presumably returns true when `s` begins with those bytes — confirm against the definition.
    bool startsWith(const std::string & s, const char * prefix, size_t prefix_size);
    /// Takes the candidate suffix as a pointer plus an explicit byte count.
    /// NOTE(review): presumably returns true when `s` ends with those bytes — confirm against the definition.
    bool endsWith(const std::string & s, const char * suffix, size_t suffix_size);
}


/// Forwards to detail::startsWith with the bytes and length of `prefix`.
inline bool startsWith(const std::string & s, const std::string & prefix)
{
    const char * const prefix_bytes = prefix.data();
    const size_t prefix_length = prefix.size();
    return detail::startsWith(s, prefix_bytes, prefix_length);
}

/// Forwards to detail::endsWith with the bytes and length of `suffix`.
inline bool endsWith(const std::string & s, const std::string & suffix)
{
    const char * const suffix_bytes = suffix.data();
    const size_t suffix_length = suffix.size();
    return detail::endsWith(s, suffix_bytes, suffix_length);
}


/// C-string overload: the length of `prefix` is obtained with strlen and
/// forwarded to detail::startsWith.
/// With GCC, strlen is evaluated at compile time when it is given a constant
/// string that is known at compile time.
inline bool startsWith(const std::string & s, const char * prefix)
{
    const size_t prefix_length = strlen(prefix);
    return detail::startsWith(s, prefix, prefix_length);
}

/// C-string overload: the length of `suffix` is obtained with strlen and
/// forwarded to detail::endsWith.
inline bool endsWith(const std::string & s, const char * suffix)
{
    const size_t suffix_length = strlen(suffix);
    return detail::endsWith(s, suffix, suffix_length);
}

/// Returns the English ordinal suffix ("st", "nd", "rd" or "th") that goes
/// after the unsigned integer `n` when it is printed as an ordinal number.
/// Only unsigned integral types are accepted (enforced at compile time).
template <typename T>
std::string getOrdinalSuffix(T n)
{
    static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>,
        "Unsigned integer value required");

    /// Numbers whose tens digit is 1 (11, 12, 13, 111, ...) always take "th".
    const bool tens_digit_is_one = (n > 10) && (((n / 10) % 10) == 1);
    if (tens_digit_is_one)
        return "th";

    const auto units = n % 10;

    if (units == 1)
        return "st";
    if (units == 2)
        return "nd";
    if (units == 3)
        return "rd";

    return "th";
}

/// These helpers are more efficient than their libc counterparts because they do not respect the locale. For some of them a table-based implementation could be even better.

/// True when the byte `c` has its high bit clear, i.e. its unsigned value is below 0x80.
inline bool isASCII(char c)
{
    const auto byte = static_cast<unsigned char>(c);
    return (byte & 0x80) == 0;
}

/// True when `c` lies in the range 'a'..'z'.
inline bool isLowerAlphaASCII(char c)
{
    if (c < 'a')
        return false;
    return c <= 'z';
}

/// True when `c` lies in the range 'A'..'Z'.
inline bool isUpperAlphaASCII(char c)
{
    if (c < 'A')
        return false;
    return c <= 'Z';
}

/// True when `c` is an ASCII letter of either case.
inline bool isAlphaASCII(char c)
{
    if (isLowerAlphaASCII(c))
        return true;
    return isUpperAlphaASCII(c);
}

/// True when `c` lies in the range '0'..'9'.
inline bool isNumericASCII(char c)
{
    /// The plain two-sided comparison is kept on purpose: it is faster than
    /// return UInt8(UInt8(c) - UInt8('0')) < UInt8(10);
    /// on Intel CPUs when compiled by gcc 8.
    if (c < '0')
        return false;
    return c <= '9';
}

/// True when `c` is a hexadecimal digit: 0-9, a-f or A-F.
inline bool isHexDigit(char c)
{
    if (isNumericASCII(c))
        return true;

    const bool is_lower_hex_letter = 'a' <= c && c <= 'f';
    const bool is_upper_hex_letter = 'A' <= c && c <= 'F';
    return is_lower_hex_letter || is_upper_hex_letter;
}

/// True when `c` is an ASCII letter or an ASCII decimal digit.
inline bool isAlphaNumericASCII(char c)
{
    if (isAlphaASCII(c))
        return true;
    return isNumericASCII(c);
}

/// True when `c` is an ASCII letter, an ASCII decimal digit or an underscore.
inline bool isWordCharASCII(char c)
{
    if (c == '_')
        return true;
    return isAlphaNumericASCII(c);
}

/// True when `c` may start an identifier: an ASCII letter or an underscore (digits are not accepted).
inline bool isValidIdentifierBegin(char c)
{
    if (c == '_')
        return true;
    return isAlphaASCII(c);
}

/// True when `c` is one of: space, tab, line feed, carriage return, form feed, vertical tab.
inline bool isWhitespaceASCII(char c)
{
    switch (c)
    {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\f':
        case '\v':
            return true;
        default:
            return false;
    }
}

/// Same as |isWhitespaceASCII()| but without the line-breaking characters '\n' and '\r'.
/// Since |isWhitespaceASCII()| is used inside algorithms, it is easier to provide
/// a separate function than to add an extra argument.
inline bool isWhitespaceASCIIOneLine(char c)
{
    switch (c)
    {
        case ' ':
        case '\t':
        case '\f':
        case '\v':
            return true;
        default:
            return false;
    }
}

/// True when the unsigned value of `c` is in 0..31. DEL (127) is not included.
inline bool isControlASCII(char c)
{
    const auto byte = static_cast<unsigned char>(c);
    return byte < 32;
}

/// True when the unsigned value of `c` is in 32..126 (space through '~').
inline bool isPrintableASCII(char c)
{
    const uint8_t code = c;
    if (code < 32)
        return false;
    return code < 127;   /// 127 is ASCII DEL.
}

/// True when `c` is an ASCII punctuation character, i.e. a printable ASCII
/// character that is neither a space, a letter nor a digit:
///   33..47   !"#$%&'()*+,-./
///   58..64   :;<=>?@
///   91..96   [\]^_`
///   123..126 {|}~
/// The last range previously stopped at 125 and so left out '~' (126).
inline bool isPunctuationASCII(char c)
{
    uint8_t uc = c;
    return (uc >= 33 && uc <= 47)
        || (uc >= 58 && uc <= 64)
        || (uc >= 91 && uc <= 96)
        || (uc >= 123 && uc <= 126);   /// 127 is ASCII DEL, not punctuation.
}


/// True when `str` is non-empty, starts with a character accepted by
/// isValidIdentifierBegin, continues only with characters accepted by
/// isWordCharASCII, and is not the word "null" in any letter case.
inline bool isValidIdentifier(std::string_view str)
{
    if (str.empty())
        return false;

    if (!isValidIdentifierBegin(str[0]))
        return false;

    if (!std::all_of(str.begin() + 1, str.end(), isWordCharASCII))
        return false;

    /// NULL is not a valid identifier in SQL, any case.
    const bool is_null_keyword = str.size() == strlen("null")
        && 0 == strncasecmp(str.data(), "null", strlen("null"));

    return !is_null_keyword;
}


/// Decides whether the character at `pos` is an underscore acting as a digit separator inside a numeric literal.
/// `is_start_of_block` - the caller is at the beginning of a digit block, where a separator is not allowed.
/// `is_hex` - the following character is checked with isHexDigit instead of isNumericASCII.
/// `pos` is dereferenced unconditionally, so it must point at a readable character; `end` bounds the look-ahead.
inline bool isNumberSeparator(bool is_start_of_block, bool is_hex, const char * pos, const char * end)
{
    if (*pos != '_')
        return false;

    if (is_start_of_block)
        return false; // e.g. _123, 12e_3

    const char * const next = pos + 1;

    if (next == end)
        return false; // e.g. 12_

    if (next < end)
    {
        const bool next_is_digit = is_hex ? isHexDigit(*next) : isNumericASCII(*next);
        if (!next_is_digit)
            return false; // e.g. 1__2, 1_., 1_e, 1_p, 1_;
    }

    return true;
}

/// Sets the 0x20 bit of `c`, which maps an ASCII letter to its lower-case form.
/// The result is only meaningful when isAlphaASCII(c) holds; no check is made.
inline char toLowerIfAlphaASCII(char c)
{
    constexpr int case_bit = 0x20;
    return static_cast<char>(c | case_bit);
}

/// Clears the 0x20 bit of `c`, which maps an ASCII letter to its upper-case form.
/// No check is made that `c` is actually a letter.
inline char toUpperIfAlphaASCII(char c)
{
    constexpr int case_bit = 0x20;
    return static_cast<char>(c & ~case_bit);
}

/// Flips the 0x20 bit of `c`, which switches an ASCII letter to the opposite case.
/// No check is made that `c` is actually a letter.
inline char alternateCaseIfAlphaASCII(char c)
{
    constexpr int case_bit = 0x20;
    return static_cast<char>(c ^ case_bit);
}

/// Advances `pos` over a run of whitespace in the byte range [pos, end) and returns
/// the first position that is not whitespace (or `end` if everything was skipped).
/// Recognized whitespace is everything accepted by isWhitespaceASCII plus the
/// UTF-8 encoded code points listed below. Bytes are never read at or past `end`:
/// a multi-byte sequence cut off by `end` is not skipped.
inline const char * skipWhitespacesUTF8(const char * pos, const char * end)
{
    /// https://en.wikipedia.org/wiki/Whitespace_character
    /// with some adjustments.

    /// Code points: 0085 00A0 180E 2000..200A 2028..2029 200B..200D 202F 205F 2060 3000 FEFF
    /// The corresponding UTF-8 is: C285 C2A0 E1A08E E28080..E2808A E280A8..E280A9 E2808B..E2808D E280AF E2819F E281A0 E38080 EFBBBF

    /// We check for these bytes directly in UTF8 for simplicity reasons.

    /** C2
      *    85
      *    A0
      * E1 A0 8E
      * E2
      *    80
      *       80..8A
      *       A8..A9
      *       8B..8D
      *       AF
      *    81
      *       9F
      *       A0
      * E3 80 80
      * EF BB BF
      */

    while (pos < end)
    {
        if (isWhitespaceASCII(*pos))
        {
            /// Single-byte ASCII whitespace.
            ++pos;
        }
        else
        {
            /// View the bytes as unsigned so they can be compared with values >= 0x80.
            const uint8_t * upos = reinterpret_cast<const uint8_t *>(pos);

            /// Two-byte sequences; requires at least two bytes before `end`.
            if (pos + 1 < end && upos[0] == 0xC2 && (upos[1] == 0x85 || upos[1] == 0xA0))
            {
                pos += 2;
            }
            /// Three-byte sequences; requires at least three bytes before `end`.
            else if (pos + 2 < end
                &&    ((upos[0] == 0xE1 && upos[1] == 0xA0 && upos[2] == 0x8E)
                    || (upos[0] == 0xE2
                        &&    ((upos[1] == 0x80
                            &&    ((upos[2] >= 0x80 && upos[2] <= 0x8A)
                                || (upos[2] >= 0xA8 && upos[2] <= 0xA9)
                                || (upos[2] >= 0x8B && upos[2] <= 0x8D)
                                || (upos[2] == 0xAF)))
                            || (upos[1] == 0x81 && (upos[2] == 0x9F || upos[2] == 0xA0))))
                    || (upos[0] == 0xE3 && upos[1] == 0x80 && upos[2] == 0x80)
                    || (upos[0] == 0xEF && upos[1] == 0xBB && upos[2] == 0xBF)))
            {
                pos += 3;
            }
            else
                break;  /// First non-whitespace byte sequence: stop here.
        }
    }

    return pos;
}

/// True when `a` and `b` are the same character, or when `a` is an ASCII letter
/// and `b` is that same letter in the opposite case.
inline bool equalsCaseInsensitive(char a, char b)
{
    if (a == b)
        return true;

    if (!isAlphaASCII(a))
        return false;

    return alternateCaseIfAlphaASCII(a) == b;
}


/// Returns a copy of `str` without the leading and trailing characters for which
/// `predicate` returns true. If every character matches, an empty string is returned.
/// `predicate` is called with single characters: first from the front, then from the back.
template <typename F>
std::string trim(const std::string & str, F && predicate)
{
    const size_t size = str.size();

    /// Index of the first character that is kept.
    size_t first_kept = 0;
    while (first_kept < size && predicate(str[first_kept]))
        ++first_kept;

    if (first_kept == size)
        return {};

    /// One past the index of the last character that is kept.
    size_t past_last_kept = size;
    while (past_last_kept > 0 && predicate(str[past_last_kept - 1]))
        --past_last_kept;

    return str.substr(first_kept, past_last_kept - first_kept);
}

/// Shrinks the view in place so that it no longer starts with the character `c`.
/// A view consisting only of `c` becomes empty.
inline void trimLeft(std::string_view & str, char c = ' ')
{
    const auto first_kept = str.find_first_not_of(c);
    const auto prefix_length = (first_kept == std::string_view::npos) ? str.size() : first_kept;
    str.remove_prefix(prefix_length);
}

/// Erases, in place, every leading occurrence of the character `c` from `str`.
/// A string consisting only of `c` becomes empty.
inline void trimLeft(std::string & str, char c = ' ')
{
    const auto first_kept = std::find_if(str.begin(), str.end(), [c](char ch) { return ch != c; });
    str.erase(str.begin(), first_kept);
}

/// Shrinks the view in place so that it no longer ends with the character `c`.
/// A view consisting only of `c` becomes empty.
inline void trimRight(std::string_view & str, char c = ' ')
{
    const auto last_kept = str.find_last_not_of(c);
    const auto suffix_length = (last_kept == std::string_view::npos) ? str.size() : str.size() - last_kept - 1;
    str.remove_suffix(suffix_length);
}

/// Erases, in place, every trailing occurrence of the character `c` from `str`.
/// A string consisting only of `c` becomes empty.
inline void trimRight(std::string & str, char c = ' ')
{
    const auto last_kept = str.find_last_not_of(c);

    if (last_kept == std::string::npos)
        str.clear();
    else
        str.resize(last_kept + 1);
}

/// Strips the character `c` from both ends of the view, in place:
/// the left side is handled first, then the right side.
inline void trim(std::string_view & str, char c = ' ')
{
    /// Leading occurrences.
    trimLeft(str, c);

    /// Trailing occurrences.
    trimRight(str, c);
}

/// Strips the character `c` from both ends of the string, in place:
/// the right side is handled first, then the left side.
inline void trim(std::string & str, char c = ' ')
{
    /// Trailing occurrences.
    trimRight(str, c);

    /// Leading occurrences.
    trimLeft(str, c);
}

/// True when `str` contains at least one of the characters '*', '?' or '{'.
constexpr bool containsGlobs(const std::string & str)
{
    const auto first_glob_char = str.find_first_of("*?{");
    return std::string::npos != first_glob_char;
}