aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Functions/extract.cpp
blob: 74c5a2fdd366480ae786f29301351d8b315ef214 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#include <Functions/FunctionsStringSearchToString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/Regexps.h>
#include <Common/OptimizedRegularExpression.h>


namespace DB
{
namespace
{

/// Per-row kernel used by the `extract` function: for every input string it writes out
/// the fragment matched by `pattern`, or an empty string when nothing matched.
struct ExtractImpl
{
    /// data, offsets          - source strings; offsets[i] is the end position of row i in `data`.
    /// pattern                - regular expression, compiled once for the whole call.
    /// res_data, res_offsets  - destination strings in the same layout; each result row is
    ///                          followed by a zero byte.
    ///
    /// If the pattern has at least one subpattern, the first subpattern is extracted;
    /// otherwise the whole match is extracted.
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        const std::string & pattern,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        const size_t rows = offsets.size();

        /// Only an initial capacity guess; the buffer is grown row by row below.
        res_data.reserve(data.size() / 5);
        res_offsets.resize(rows);

        const Regexps::Regexp regexp = Regexps::createRegexp<false, false, false>(pattern);

        /// Index of the match slot to extract: 1 (first subpattern) if any exist, else 0 (whole match).
        unsigned group = 0;
        if (regexp.getNumberOfSubpatterns() > 0)
            group = 1;

        /// We never need more match slots than the one we extract plus those before it.
        const unsigned wanted_matches = group + 1;

        OptimizedRegularExpression::MatchVec matches;
        matches.reserve(wanted_matches);

        size_t row_begin = 0;
        size_t write_pos = 0;

        for (size_t row = 0; row < rows; ++row)
        {
            const size_t row_end = offsets[row];

            /// The "- 1" drops the last byte of the row from the searched range
            /// (presumably the trailing zero byte of the source string - confirm against ColumnString).
            const char * haystack = reinterpret_cast<const char *>(&data[row_begin]);
            const unsigned found = regexp.match(haystack, row_end - row_begin - 1, matches, wanted_matches);

            /// The requested slot may exist but be unset (offset == npos), e.g. an unmatched group.
            const bool extracted = found > group && matches[group].offset != std::string::npos;
            const size_t fragment_size = extracted ? matches[group].length : 0;

            /// Room for the fragment (possibly empty) and its terminating zero byte.
            res_data.resize(write_pos + fragment_size + 1);

            if (extracted)
                memcpySmallAllowReadWriteOverflow15(
                    &res_data[write_pos], &data[row_begin + matches[group].offset], fragment_size);

            write_pos += fragment_size;
            res_data[write_pos] = 0;
            ++write_pos;

            res_offsets[row] = write_pos;
            row_begin = row_end;
        }
    }
};

/// Name tag for the function: carries the identifier "extract" as a compile-time constant.
struct NameExtract
{
    static constexpr const char * name = "extract";
};

/// The concrete function type: the FunctionsStringSearchToString template instantiated
/// with the ExtractImpl kernel and the NameExtract name tag.
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;

}

/// Registers FunctionExtract with the function factory.
REGISTER_FUNCTION(Extract)
{
    factory.registerFunction<FunctionExtract>();
}

}