aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Functions/Regexps.h
blob: e2c79a609dcebf603925f4ec6a837db57acb0996 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#pragma once

#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include <Common/Exception.h>
#include <Common/OptimizedRegularExpression.h>
#include <Common/ProfileEvents.h>
#include <Common/likePatternToRegexp.h>
#include <base/defines.h>
#include <base/StringRef.h>
#include <boost/container_hash/hash.hpp>

#include "clickhouse_config.h"

#if USE_VECTORSCAN
#    error #include <hs.h>
#endif

namespace ProfileEvents
{
extern const Event RegexpCreated;
}


namespace DB
{
namespace ErrorCodes
{
    extern const int CANNOT_ALLOCATE_MEMORY;
    extern const int LOGICAL_ERROR;
    extern const int BAD_ARGUMENTS;
}

namespace Regexps
{

using Regexp = OptimizedRegularExpressionSingleThreaded;
using RegexpPtr = std::shared_ptr<Regexp>;

template <bool like, bool no_capture, bool case_insensitive>
inline Regexp createRegexp(const String & pattern)
{
    int flags = OptimizedRegularExpression::RE_DOT_NL;
    if constexpr (no_capture)
        flags |= OptimizedRegularExpression::RE_NO_CAPTURE;
    if constexpr (case_insensitive)
        flags |= OptimizedRegularExpression::RE_CASELESS;

    if constexpr (like)
        return {likePatternToRegexp(pattern), flags};
    else
        return {pattern, flags};
}

/// Caches compiled re2 objects for given string patterns. Intended to support the common situation of a small set of patterns which are
/// evaluated over and over within the same query. In these situations, usage of the cache will save unnecessary pattern re-compilation.
/// However, we must be careful that caching does not add too much static overhead to overall pattern evaluation. Therefore, the cache is
/// intentionally very lightweight: a) no thread-safety/mutexes, b) small & fixed capacity, c) no collision list, d) but also no open
/// addressing, instead collisions simply replace the existing element.
class LocalCacheTable
{
public:
    using RegexpPtr = std::shared_ptr<Regexp>;

    template <bool like, bool no_capture, bool case_insensitive>
    RegexpPtr getOrSet(const String & pattern)
    {
        Bucket & bucket = known_regexps[hasher(pattern) % CACHE_SIZE];

        if (bucket.regexp == nullptr) [[unlikely]]
            /// insert new entry
            bucket = {pattern, std::make_shared<Regexp>(createRegexp<like, no_capture, case_insensitive>(pattern))};
        else
            if (pattern != bucket.pattern)
                /// replace existing entry
                bucket = {pattern, std::make_shared<Regexp>(createRegexp<like, no_capture, case_insensitive>(pattern))};

        return bucket.regexp;
    }

private:
    constexpr static size_t CACHE_SIZE = 100; /// collision probability

    std::hash<String> hasher;
    struct Bucket
    {
        String pattern;   /// key
        RegexpPtr regexp; /// value
    };
    using CacheTable = std::array<Bucket, CACHE_SIZE>;
    CacheTable known_regexps;
};

}

#if USE_VECTORSCAN

namespace MultiRegexps
{
template <typename Deleter, Deleter deleter>
struct HyperscanDeleter
{
    template <typename T>
    void operator()(T * ptr) const
    {
        deleter(ptr);
    }
};

/// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception.
using CompilerErrorPtr = std::unique_ptr<hs_compile_error_t, HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
using ScratchPtr = std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
using DataBasePtr = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;

/// Database is immutable/thread-safe across multiple threads. Scratch is not but we can copy it whenever we use it in the searcher.
class Regexps
{
public:
    Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} { }

    hs_database_t * getDB() const { return db.get(); }
    hs_scratch_t * getScratch() const { return scratch.get(); }

private:
    DataBasePtr db;
    ScratchPtr scratch;
};

class DeferredConstructedRegexps
{
public:
    explicit DeferredConstructedRegexps(std::function<Regexps()> constructor_)
        : constructor(std::move(constructor_))
    {}

    Regexps * get()
    {
        std::lock_guard lock(mutex);
        if (regexps)
            return &*regexps;
        regexps = constructor();
        return &*regexps;
    }

private:
    std::function<Regexps()> constructor TSA_GUARDED_BY(mutex);
    std::optional<Regexps> regexps TSA_GUARDED_BY(mutex);
    std::mutex mutex;
};

using DeferredConstructedRegexpsPtr = std::shared_ptr<DeferredConstructedRegexps>;

template <bool save_indices, bool with_edit_distance>
inline Regexps constructRegexps(const std::vector<String> & str_patterns, [[maybe_unused]] std::optional<UInt32> edit_distance)
{
    /// Common pointers
    std::vector<const char *> patterns;
    std::vector<unsigned int> flags;

    /// Pointer for external edit distance compilation
    std::vector<hs_expr_ext> ext_exprs;
    std::vector<const hs_expr_ext *> ext_exprs_ptrs;

    patterns.reserve(str_patterns.size());
    flags.reserve(str_patterns.size());

    if constexpr (with_edit_distance)
    {
        ext_exprs.reserve(str_patterns.size());
        ext_exprs_ptrs.reserve(str_patterns.size());
    }

    for (std::string_view ref : str_patterns)
    {
        patterns.push_back(ref.data());
        /* Flags below are the pattern matching flags.
         * HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good
         * performance practice according to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode
         * HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match.
         * HS_FLAG_UTF8 is a flag where UTF8 literals are matched.
         * HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. it is a good performance practice
         * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag
         */
        flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8);
        if constexpr (with_edit_distance)
        {
            /// Hyperscan currently does not support UTF8 matching with edit distance.
            flags.back() &= ~HS_FLAG_UTF8;
            ext_exprs.emplace_back();
            /// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenstein distance.
            ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE;
            ext_exprs.back().edit_distance = edit_distance.value();
            ext_exprs_ptrs.push_back(&ext_exprs.back());
        }
    }
    hs_database_t * db = nullptr;
    hs_compile_error_t * compile_error;

    std::unique_ptr<unsigned int[]> ids;

    /// We mark the patterns to provide the callback results.
    if constexpr (save_indices)
    {
        ids.reset(new unsigned int[patterns.size()]);
        for (size_t i = 0; i < patterns.size(); ++i)
            ids[i] = static_cast<unsigned>(i + 1);
    }

    hs_error_t err;
    if constexpr (!with_edit_distance)
        err = hs_compile_multi(
            patterns.data(),
            flags.data(),
            ids.get(),
            static_cast<unsigned>(patterns.size()),
            HS_MODE_BLOCK,
            nullptr,
            &db,
            &compile_error);
    else
        err = hs_compile_ext_multi(
            patterns.data(),
            flags.data(),
            ids.get(),
            ext_exprs_ptrs.data(),
            static_cast<unsigned>(patterns.size()),
            HS_MODE_BLOCK,
            nullptr,
            &db,
            &compile_error);

    if (err != HS_SUCCESS)
    {
        /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
        CompilerErrorPtr error(compile_error);

        if (error->expression < 0)
            throw Exception::createRuntime(ErrorCodes::LOGICAL_ERROR, String(error->message));
        else
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Pattern '{}' failed with error '{}'", str_patterns[error->expression], String(error->message));
    }

    ProfileEvents::increment(ProfileEvents::RegexpCreated);

    /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
    /// function which is faster than allocating scratch space each time in each thread.
    hs_scratch_t * scratch = nullptr;
    err = hs_alloc_scratch(db, &scratch);

    /// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
    if (err != HS_SUCCESS)
        throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not allocate scratch space for vectorscan");

    return {db, scratch};
}

/// Maps string pattern vectors + edit distance to compiled vectorscan regexps. Uses the same eviction mechanism as the LocalCacheTable for
/// re2 patterns. Because vectorscan regexes are overall more heavy-weight (more expensive compilation, regexes can grow up to multiple
/// MBs, usage of scratch space), 1. GlobalCacheTable is a global singleton and, as a result, needs locking 2. the pattern compilation is
/// done outside GlobalCacheTable's lock, at the cost of another level of locking.
struct GlobalCacheTable
{
    constexpr static size_t CACHE_SIZE = 500; /// collision probability

    struct Bucket
    {
        std::vector<String> patterns;          /// key
        std::optional<UInt32> edit_distance;   /// key
        /// The compiled patterns and their state (vectorscan 'database' + scratch space) are wrapped in a shared_ptr. Refcounting guarantees
        /// that eviction of a pattern does not affect parallel threads still using the pattern.
        DeferredConstructedRegexpsPtr regexps; /// value
    };

    std::array<Bucket, CACHE_SIZE> known_regexps TSA_GUARDED_BY(mutex);
    std::mutex mutex;

    static size_t getBucketIndexFor(const std::vector<String> patterns, std::optional<UInt32> edit_distance)
    {
        size_t hash = 0;
        for (const auto & pattern : patterns)
            boost::hash_combine(hash, pattern);
        boost::hash_combine(hash, edit_distance);
        return hash % CACHE_SIZE;
    }
};

/// If with_edit_distance is False, edit_distance must be nullopt. Also, we use templates here because each instantiation of function template
/// has its own copy of local static variables which must not be the same for different hyperscan compilations.
template <bool save_indices, bool with_edit_distance>
inline DeferredConstructedRegexpsPtr getOrSet(const std::vector<std::string_view> & patterns, std::optional<UInt32> edit_distance)
{
    static GlobalCacheTable pool; /// Different variables for different pattern parameters, thread-safe in C++11

    std::vector<String> str_patterns;
    str_patterns.reserve(patterns.size());
    for (const auto & pattern : patterns)
        str_patterns.emplace_back(String(pattern));

    size_t bucket_idx = GlobalCacheTable::getBucketIndexFor(str_patterns, edit_distance);

    /// Lock cache to find compiled regexp for given pattern vector + edit distance.
    std::lock_guard lock(pool.mutex);

    GlobalCacheTable::Bucket & bucket = pool.known_regexps[bucket_idx];

    /// Pattern compilation is expensive and we don't want to block other threads reading from / inserting into the cache while we hold the
    /// cache lock during pattern compilation. Therefore, when a cache entry is created or replaced, only set the regexp constructor method
    /// and compile outside the cache lock.
    /// Note that the string patterns and the edit distance is passed into the constructor lambda by value, i.e. copied - it is not an
    /// option to reference the corresponding string patterns / edit distance key in the cache table bucket because the cache entry may
    /// already be evicted at the time the compilation starts.

    if (bucket.regexps == nullptr) [[unlikely]]
    {
        /// insert new entry
        auto deferred_constructed_regexps = std::make_shared<DeferredConstructedRegexps>(
                [str_patterns, edit_distance]()
                {
                    return constructRegexps<save_indices, with_edit_distance>(str_patterns, edit_distance);
                });
        bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
    }
    else
        if (bucket.patterns != str_patterns || bucket.edit_distance != edit_distance)
        {
            /// replace existing entry
            auto deferred_constructed_regexps = std::make_shared<DeferredConstructedRegexps>(
                    [str_patterns, edit_distance]()
                    {
                        return constructRegexps<save_indices, with_edit_distance>(str_patterns, edit_distance);
                    });
            bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
        }

    return bucket.regexps;
}

}

#endif // USE_VECTORSCAN

}