contrib/clickhouse/src/Common/UTF8Helpers.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145

#pragma once

#include <optional>
#include <base/types.h>
#include <base/simd.h>
#include <Common/BitHelpers.h>
#include <Poco/UTF8Encoding.h>

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#if defined(__aarch64__) && defined(__ARM_NEON)
#    include <arm_neon.h>
#      pragma clang diagnostic ignored "-Wreserved-identifier"
#endif


namespace DB
{


namespace UTF8
{

/// Bit pattern shared by every UTF-8 continuation octet: the two highest bits are `10`.
/// `inline constexpr` gives the header a single definition of each constant across all
/// translation units (instead of one internal-linkage copy per includer) and guarantees
/// they stay usable in constant expressions.
inline constexpr UInt8 CONTINUATION_OCTET_MASK = 0b11000000u;  /// selects the two highest bits
inline constexpr UInt8 CONTINUATION_OCTET = 0b10000000u;       /// value of those bits for a continuation octet

/// Tells whether `octet` is a UTF-8 continuation octet, i.e. its two highest bits are `10`.
inline bool isContinuationOctet(const UInt8 octet)
{
    const auto leading_bits = octet & CONTINUATION_OCTET_MASK;
    return leading_bits == CONTINUATION_OCTET;
}

/// Moves `s` backward until it points at a non-continuation octet or reaches `begin`.
/// The bounds check is evaluated before the dereference, so `*s` is not read once `s`
/// has reached `begin`. This mirrors syncForward and keeps an empty range
/// (where `s == begin` does not point at a readable octet) safe.
inline void syncBackward(const UInt8 * & s, const UInt8 * const begin)
{
    while (s > begin && isContinuationOctet(*s))
        --s;
}

/// Advances `s` past any continuation octets, stopping at the first octet that is not
/// a continuation octet or at `end`, whichever comes first.
inline void syncForward(const UInt8 * & s, const UInt8 * const end)
{
    for (; s < end; ++s)
    {
        if (!isContinuationOctet(*s))
            return;
    }
}

/// Returns the length in bytes of a UTF-8 code point sequence, judging only by its first octet.
inline size_t seqLength(const UInt8 first_octet)
{
    /// Octets below 0x80 (0xxxxxxx) and octets from 0xF8 up (11111xxx) are reported as length 1.
    const bool is_single_octet = first_octet < 0x80 || first_octet >= 0xF8;
    if (is_single_octet)
        return 1;

    /// Position of the first zero bit of the octet, found as a bit scan over its complement.
    const auto highest_zero_bit = bitScanReverse(static_cast<UInt8>(~first_octet));

    /// Index of the top bit of an octet (8 bits - 1).
    constexpr size_t top_bit_index = 7;
    return top_bit_index - highest_zero_bit;
}

/// Returns the number of bytes in [data, data + size) that are not UTF-8 continuation bytes
/// (i.e. whose two highest bits are not `10`). No validation of the input is performed.
/// Full 16-byte chunks are processed with SSE2 or NEON when available; the remaining bytes
/// (or all of them, when neither is available) are handled by the scalar loop at the end.
inline size_t countCodePoints(const UInt8 * data, size_t size)
{
    size_t res = 0;
    const auto * end = data + size;

#ifdef __SSE2__
    constexpr auto bytes_sse = sizeof(__m128i);
    /// End of the longest prefix whose length is a multiple of the vector width.
    const auto * src_end_sse = data + size / bytes_sse * bytes_sse;

    /// 0xBF is the largest continuation byte; as a signed 8-bit value it is -65, so the signed
    /// "greater than" comparison below selects every byte outside the range 0x80..0xBF.
    const auto threshold = _mm_set1_epi8(0xBF);

    /// One mask bit per selected byte; the population count is the number of such bytes in the chunk.
    for (; data < src_end_sse; data += bytes_sse)
        res += __builtin_popcount(_mm_movemask_epi8(
            _mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(data)), threshold)));
#elif defined(__aarch64__) && defined(__ARM_NEON)
    constexpr auto bytes_sse = 16;
    /// End of the longest prefix whose length is a multiple of the vector width.
    const auto * src_end_sse = data + size / bytes_sse * bytes_sse;

    /// Same signed threshold comparison as in the SSE2 branch.
    const auto threshold = vdupq_n_s8(0xBF);

    for (; data < src_end_sse; data += bytes_sse)
        res += std::popcount(getNibbleMask(vcgtq_s8(vld1q_s8(reinterpret_cast<const int8_t *>(data)), threshold)));
    /// NOTE(review): presumably getNibbleMask yields 4 mask bits per byte lane, so the accumulated
    /// popcount is 4x the byte count and is divided by 4 here — confirm against base/simd.h.
    res >>= 2;
#endif

    /// Scalar loop over the bytes not consumed by a vectorized branch.
    for (; data < end; ++data) /// Skip UTF-8 continuation bytes.
        // TODO: this expression was rewritten only because our clang-14 fails to compile the previous
        // one with avx512. Remove this patch once clang-16 is in arcadia.
        // res += static_cast<Int8>(*data) > static_cast<Int8>(0xBF);
        res += (static_cast<UInt8>(*data) >> 6) != 0b10;

    return res;
}


/// Encodes `code_point` as UTF-8 into the buffer `out_bytes` of capacity `out_length`
/// using Poco::UTF8Encoding::convert, and returns the value that call reports.
/// `CharT` may be any one-byte character type.
template <typename CharT>
requires (sizeof(CharT) == 1)
size_t convertCodePointToUTF8(int code_point, CharT * out_bytes, size_t out_length)
{
    static const Poco::UTF8Encoding utf8;

    auto * const destination = reinterpret_cast<uint8_t *>(out_bytes);
    const int capacity = static_cast<int>(out_length);

    const int encoded_length = utf8.convert(code_point, destination, capacity);
    assert(encoded_length >= 0);
    return encoded_length;
}

/// Decodes the UTF-8 sequence at `in_bytes` (at most `in_length` bytes) using
/// Poco::UTF8Encoding::queryConvert. Returns the decoded code point, or an empty
/// optional when the call reports a negative result.
/// `CharT` may be any one-byte character type.
template <typename CharT>
requires (sizeof(CharT) == 1)
std::optional<uint32_t> convertUTF8ToCodePoint(const CharT * in_bytes, size_t in_length)
{
    static const Poco::UTF8Encoding utf8;

    const auto * const source = reinterpret_cast<const uint8_t *>(in_bytes);
    const int available = static_cast<int>(in_length);

    const int decoded = utf8.queryConvert(source, available);
    if (decoded < 0)
        return std::nullopt;

    return static_cast<uint32_t>(decoded);
}


/// Returns the width (as in wcswidth) of the UTF-8 string `data` of `size` bytes.
/// An invalid sequence is treated as a zero-width character.
/// `prefix` is used to compute the width of `\t`: a tab extends the text before it
/// (the tab itself included) to the nearest greater length that is a multiple of eight.
/// NOTE(review): `prefix` is presumably the width already occupied before `data` — confirm in the implementation.
size_t computeWidth(const UInt8 * data, size_t size, size_t prefix = 0) noexcept;


/** Calculates the maximum number of bytes such that a substring of that size fits into the width 'limit'.
  *
  * For example, the string "x你好" has 3 code points, a visible width of 5 and a byte size of 7.
  *
  * Suppose limit = 3.
  * Then 4 is returned as the maximum number of bytes,
  *  and the truncated string will be "x你": two code points, visible width 3, byte size 4.
  *
  * The result is the same for limit = 4, because the last character would not fit.
  *
  * NOTE(review): `prefix` presumably has the same meaning as in computeWidth — confirm in the implementation.
  */
size_t computeBytesBeforeWidth(const UInt8 * data, size_t size, size_t prefix, size_t limit) noexcept;

}


}