aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Functions/countMatches.h
blob: 2834b8c764b3b68edb381087e81d39b9693f05fc (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#pragma once

#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/Regexps.h>


namespace DB
{

/// Error codes thrown by the code in this header.
/// They are only declared here (`extern`); the definitions live outside this file.
namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int ILLEGAL_COLUMN;
    extern const int LOGICAL_ERROR;
}

/// Read-only cursor into a character buffer; used below to walk over haystack bytes.
using Pos = const char *;

/** Function that counts how many times a constant regular expression matches in a haystack string.
  *
  * The template parameter CountMatchesBase must provide:
  *  - `name`             - the function name reported by getName();
  *  - `case_insensitive` - compile-time flag forwarded to Regexps::createRegexp().
  *
  * Arguments: (haystack, pattern). The haystack is String or FixedString (constant or not),
  * the pattern must be a constant String/FixedString. The result type is UInt64.
  */
template <class CountMatchesBase>
class FunctionCountMatches : public IFunction
{
public:
    static constexpr auto name = CountMatchesBase::name;
    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionCountMatches<CountMatchesBase>>(); }

    String getName() const override { return name; }
    size_t getNumberOfArguments() const override { return 2; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    /// Validates the argument types and that the pattern is a constant column.
    /// Throws ILLEGAL_TYPE_OF_ARGUMENT if either argument is not String/FixedString,
    /// and ILLEGAL_COLUMN if the pattern is not a constant string.
    /// Always returns UInt64.
    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        if (!isStringOrFixedString(arguments[1].type))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "Illegal type {} of second argument (pattern) of function {}. Must be String/FixedString.",
                arguments[1].type->getName(), getName());
        if (!isStringOrFixedString(arguments[0].type))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "Illegal type {} of first argument (haystack) of function {}. Must be String/FixedString.",
                arguments[0].type->getName(), getName());
        const auto * column = arguments[1].column.get();
        if (!column || !checkAndGetColumnConstStringOrFixedString(column))
            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
                "The second argument of function {} should be a constant string with the pattern",
                getName());

        return std::make_shared<DataTypeUInt64>();
    }

    /// Compiles the constant pattern once and counts its matches in every haystack row.
    /// Returns a constant column when the haystack is constant, otherwise a full UInt64 column
    /// with input_rows_count rows.
    /// Throws ILLEGAL_COLUMN if the pattern column is not a constant string and LOGICAL_ERROR
    /// if the haystack column has a representation not handled here.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
    {
        const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
        /// getReturnTypeImpl() already checks this, but do not dereference a null pointer
        /// if this method is ever reached with a non-constant pattern.
        if (!column_pattern)
            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
                "The second argument of function {} should be a constant string with the pattern",
                getName());

        const Regexps::Regexp re = Regexps::createRegexp</*is_like*/ false, /*no_capture*/ true, CountMatchesBase::case_insensitive>(column_pattern->getValue<String>());
        /// Scratch buffer for match positions, reused for all rows.
        OptimizedRegularExpression::MatchVec matches;

        const IColumn * column_haystack = arguments[0].column.get();

        if (const ColumnString * col_str = checkAndGetColumn<ColumnString>(column_haystack))
        {
            auto result_column = ColumnUInt64::create();

            const ColumnString::Chars & src_chars = col_str->getChars();
            const ColumnString::Offsets & src_offsets = col_str->getOffsets();

            ColumnUInt64::Container & vec_res = result_column->getData();
            vec_res.resize(input_rows_count);

            ColumnString::Offset current_src_offset = 0;

            for (size_t i = 0; i < input_rows_count; ++i)
            {
                /// Row i occupies [previous offset, src_offsets[i]) in src_chars.
                /// The last byte of that range is excluded from the haystack
                /// (presumably the terminating zero byte stored after each string).
                Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]);
                current_src_offset = src_offsets[i];
                Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]) - 1;

                std::string_view str(pos, end - pos);
                vec_res[i] = countMatches(str, re, matches);
            }

            return result_column;
        }
        else if (const ColumnConst * col_const_str = checkAndGetColumnConstStringOrFixedString(column_haystack))
        {
            /// Constant haystack: compute once and return a constant column.
            std::string_view str = col_const_str->getDataColumn().getDataAt(0).toView();
            uint64_t matches_count = countMatches(str, re, matches);
            return result_type->createColumnConst(input_rows_count, matches_count);
        }
        else if (isFixedString(arguments[0].type))
        {
            /// Non-constant FixedString haystack. getReturnTypeImpl() accepts this type,
            /// so it must be handled here instead of falling through to LOGICAL_ERROR.
            /// Each row is passed exactly as returned by IColumn::getDataAt().
            auto result_column = ColumnUInt64::create();

            ColumnUInt64::Container & vec_res = result_column->getData();
            vec_res.resize(input_rows_count);

            for (size_t i = 0; i < input_rows_count; ++i)
                vec_res[i] = countMatches(column_haystack->getDataAt(i).toView(), re, matches);

            return result_column;
        }
        else
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()");
    }

    /// Counts matches of `re` in `src`, scanning left to right and resuming right after each match.
    /// `matches` is a caller-provided scratch buffer that is overwritten on every iteration.
    /// Counting stops at the end of the string, at the first failed match, or at the first
    /// zero-length match (which is not counted).
    static uint64_t countMatches(std::string_view src, const Regexps::Regexp & re, OptimizedRegularExpression::MatchVec & matches)
    {
        /// Only one match is required, no need to copy more.
        static constexpr unsigned matches_limit = 1;

        Pos pos = reinterpret_cast<Pos>(src.data());
        Pos end = reinterpret_cast<Pos>(src.data() + src.size());

        uint64_t match_count = 0;
        while (true)
        {
            if (pos >= end)
                break;
            if (!re.match(pos, end - pos, matches, matches_limit))
                break;
            /// Progress must be made on every iteration, but an empty match would not advance `pos`.
            /// Checking whether the pattern itself is empty is not enough,
            /// since for example "'[f]{0}'" will match zero bytes:
            if (!matches[0].length)
                break;
            /// The match offset is relative to `pos`; continue right after the matched bytes.
            pos += matches[0].offset + matches[0].length;
            match_count++;
        }

        return match_count;
    }
};

}