aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Functions/HasTokenImpl.h
blob: ac5303e361ed3e20fd95548009ae4ac175069098 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#pragma once

#include <Columns/ColumnString.h>
#include <Common/StringSearcher.h>
#include <Core/ColumnNumbers.h>


namespace DB
{

namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int ILLEGAL_COLUMN;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int LOGICAL_ERROR;
}

/** Token search the string, means that needle must be surrounded by some separator chars, like whitespace or puctuation.
  */
template <typename Name, typename Searcher, bool negate>
struct HasTokenImpl
{
    using ResultType = UInt8;

    static constexpr bool use_default_implementation_for_constants = true;
    static constexpr bool supports_start_pos = false;
    static constexpr auto name = Name::name;

    static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1, 2}; }

    static void vectorConstant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::string & pattern,
        const ColumnPtr & start_pos,
        PaddedPODArray<UInt8> & res,
        ColumnUInt8 * res_null)
    {
        if (start_pos != nullptr)
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' does not support start_pos argument", name);

        if (haystack_offsets.empty())
            return;

        const UInt8 * const begin = haystack_data.data();
        const UInt8 * const end = haystack_data.data() + haystack_data.size();
        const UInt8 * pos = begin;

        if (const auto has_separator = std::any_of(pattern.cbegin(), pattern.cend(), isTokenSeparator); has_separator || pattern.empty())
        {
            if (res_null)
            {
                std::fill(res.begin(), res.end(), 0);
                std::fill(res_null->getData().begin(), res_null->getData().end(), true);
                return;
            }
            else if (has_separator)
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Needle must not contain whitespace or separator characters");
            else if (pattern.empty())
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Needle cannot be empty, because empty string isn't a token");
            else
                throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected internal state");
        }

        size_t pattern_size = pattern.size();
        Searcher searcher(pattern.data(), pattern_size, end - pos);
        if (res_null)
            std::fill(res_null->getData().begin(), res_null->getData().end(), false);

        /// The current index in the array of strings.
        size_t i = 0;
        /// We will search for the next occurrence in all rows at once.
        while (pos < end && end != (pos = searcher.search(pos, end - pos)))
        {
            /// The found substring is a token
            if ((pos == begin || isTokenSeparator(pos[-1]))
                && (pos + pattern_size == end || isTokenSeparator(pos[pattern_size])))
            {
                /// Let's determine which index it refers to.
                while (begin + haystack_offsets[i] <= pos)
                {
                    res[i] = negate;
                    ++i;
                }

                /// We check that the entry does not pass through the boundaries of strings.
                if (pos + pattern.size() < begin + haystack_offsets[i])
                    res[i] = !negate;
                else
                    res[i] = negate;

                pos = begin + haystack_offsets[i];
                ++i;
            }
            else
            {
                /// Not a token. Jump over it.
                pos += pattern_size;
            }
        }

        /// Tail, in which there can be no substring.
        if (i < res.size())
            memset(&res[i], negate, (res.size() - i) * sizeof(res[0]));
    }

    template <typename... Args>
    static void vectorVector(Args &&...)
    {
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needle argument", name);
    }

    /// Search different needles in single haystack.
    template <typename... Args>
    static void constantVector(Args &&...)
    {
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needle argument", name);
    }

    template <typename... Args>
    static void vectorFixedConstant(Args &&...)
    {
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name);
    }

    template <typename... Args>
    static void vectorFixedVector(Args &&...)
    {
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name);
    }

private:
    static bool isTokenSeparator(UInt8 c)
    {
        return isASCII(c) && !isAlphaNumericASCII(c);
    }
};

}