contrib/clickhouse/src/Functions/formatString.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

#pragma once

#include <Columns/ColumnString.h>
#include <base/types.h>
#include <Common/Exception.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/format.h>
#include <Common/memcpySmall.h>


#include <algorithm>
#include <optional>
#include <string>
#include <utility>
#include <vector>


namespace DB
{

struct FormatImpl
{
    static constexpr size_t right_padding = 15;

    template <typename... Args>
    static inline void formatExecute(bool possibly_has_column_string, bool possibly_has_column_fixed_string, Args &&... args)
    {
        if (possibly_has_column_string && possibly_has_column_fixed_string)
            format<true, true>(std::forward<Args>(args)...);
        else if (!possibly_has_column_string && possibly_has_column_fixed_string)
            format<false, true>(std::forward<Args>(args)...);
        else if (possibly_has_column_string && !possibly_has_column_fixed_string)
            format<true, false>(std::forward<Args>(args)...);
        else
            format<false, false>(std::forward<Args>(args)...);
    }

    /// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string.
    /// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString.
    /// fixed_string_N for savings N to fixed strings.
    /// constant_strings for constant strings. If data[i] is nullptr, it is constant string.
    /// res_data is result_data, res_offsets is offset result.
    /// input_rows_count is the number of rows processed.
    /// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size().
    template <bool has_column_string, bool has_column_fixed_string>
    static inline void format(
        String pattern,
        const std::vector<const ColumnString::Chars *> & data,
        const std::vector<const ColumnString::Offsets *> & offsets,
        [[maybe_unused]] /* Because sometimes !has_column_fixed_string */ const std::vector<size_t> & fixed_string_N,
        const std::vector<std::optional<String>> & constant_strings,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets,
        size_t input_rows_count)
    {
        const size_t argument_number = offsets.size();

        /// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this
        /// array will be filled with [1, 3, 1, 0] but without constant string indices.
        Format::IndexPositions index_positions;

        /// Vector of substrings of pattern that will be copied to the answer, not string view because of escaping and iterators invalidation.
        /// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``].
        std::vector<String> substrings;

        Format::init(pattern, argument_number, constant_strings, index_positions, substrings);

        UInt64 final_size = 0;

        for (String & str : substrings)
        {
            /// To use memcpySmallAllowReadWriteOverflow15 for substrings we should allocate a bit more to each string.
            /// That was chosen due to performance issues.
            if (!str.empty())
                str.reserve(str.size() + right_padding);
            final_size += str.size();
        }

        /// The substring number is repeated input_rows_times.
        final_size *= input_rows_count;

        /// Strings without null termination.
        for (size_t i = 1; i < substrings.size(); ++i)
        {
            final_size += data[index_positions[i - 1]]->size();
            /// Fixed strings do not have zero terminating character.
            if (offsets[index_positions[i - 1]])
                final_size -= input_rows_count;
        }

        /// Null termination characters.
        final_size += input_rows_count;

        res_data.resize(final_size);
        res_offsets.resize(input_rows_count);

        UInt64 offset = 0;
        for (UInt64 i = 0; i < input_rows_count; ++i)
        {
            memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[0].data(), substrings[0].size());
            offset += substrings[0].size();
            /// All strings are constant, we should have substrings.size() == 1.
            if constexpr (has_column_string || has_column_fixed_string)
            {
                for (size_t j = 1; j < substrings.size(); ++j)
                {
                    UInt64 arg = index_positions[j - 1];
                    const auto * offset_ptr = offsets[arg];
                    UInt64 arg_offset = 0;
                    UInt64 size = 0;

                    if constexpr (has_column_string)
                    {
                        if (!has_column_fixed_string || offset_ptr)
                        {
                            arg_offset = (*offset_ptr)[i - 1];
                            size = (*offset_ptr)[i] - arg_offset - 1;
                        }
                    }

                    if constexpr (has_column_fixed_string)
                    {
                        if (!has_column_string || !offset_ptr)
                        {
                            arg_offset = fixed_string_N[arg] * i;
                            size = fixed_string_N[arg];
                        }
                    }

                    memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, data[arg]->data() + arg_offset, size);
                    offset += size;
                    memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[j].data(), substrings[j].size());
                    offset += substrings[j].size();
                }
            }
            res_data[offset] = '\0';
            ++offset;
            res_offsets[i] = offset;
        }

        /*
         * Invariant of `offset == final_size` must be held.
         *
         * if (offset != final_size)
         *    abort();
         */
    }
};

}