aboutsummaryrefslogtreecommitdiffstats
path: root/yql/essentials/sql/v1/lexer/regex/generic.h
blob: cde028cc599852703f21bde09a1fcd9bd0fad555 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#pragma once

#include <util/generic/ptr.h>
#include <util/generic/string.h>
#include <util/generic/vector.h>
#include <util/generic/maybe.h>
#include <util/generic/ylimits.h>

#include <functional>

namespace NSQLTranslationV1 {

    // A single lexical token: its name, its text and where it starts.
    //
    // Name and Content are TStringBuf, i.e. non-owning views, so the storage
    // they refer to must outlive the token.
    // NOTE(review): presumably Content points into the tokenized text and Name
    // into the grammar held by the lexer -- confirm against the implementation.
    struct TGenericToken {
        // Reserved token name "<ERROR>".
        // NOTE(review): presumably assigned to Name for input that no rule
        // matches -- confirm against the implementation.
        static constexpr const char* Error = "<ERROR>";

        TStringBuf Name;    // Token name (non-owning view)
        TStringBuf Content; // Token text (non-owning view)
        size_t Begin = 0; // Start position, in bytes (presumably an offset into the input text)
    };

    // Abstract lexer interface.
    //
    // Derives from TThrRefBase and is meant to be held through TPtr
    // (a TIntrusivePtr), i.e. instances are reference-counted.
    class IGenericLexer: public TThrRefBase {
    public:
        using TPtr = TIntrusivePtr<IGenericLexer>;

        // Callback invoked with each produced token; the token is handed over
        // by rvalue reference, so the callback may move from it.
        using TTokenCallback = std::function<void(TGenericToken&& token)>;

        // Default value of maxErrors in Tokenize: the largest size_t value.
        static constexpr size_t MaxErrorsLimit = Max<size_t>();

        virtual ~IGenericLexer() = default;

        // Tokenizes `text`, reporting tokens through `onNext`.
        //
        // The method is const, so implementations must not mutate observable
        // lexer state while tokenizing.
        //
        // NOTE(review): presumably returns true when the text was tokenized
        // without errors and gives up once `maxErrors` errors were seen --
        // confirm against the implementation.
        // NOTE(review): this is a virtual function with a default argument;
        // default arguments are bound by the static type of the call, so
        // overriders should not redeclare a different default.
        virtual bool Tokenize(
            TStringBuf text,
            const TTokenCallback& onNext,
            size_t maxErrors = IGenericLexer::MaxErrorsLimit) const = 0;
    };

    // Matching function of a single token rule: takes a text prefix and
    // returns an optional TStringBuf.
    // NOTE(review): presumably the result is the matched part at the start of
    // `prefix`, and an empty TMaybe means "no match" -- confirm against Compile.
    using TTokenMatcher = std::function<TMaybe<TStringBuf>(TStringBuf prefix)>;

    // One grammar rule: a token name paired with the matcher that recognizes it.
    struct TTokenRule {
        TString TokenName; // Name of the token this rule produces (owned)
        TTokenMatcher Match; // Matcher deciding whether the rule applies to a prefix
    };

    // A lexer grammar: an ordered sequence of token rules.
    // NOTE(review): rule order may affect which rule wins on ambiguous input --
    // confirm against the lexer implementation.
    using TGenericLexerGrammar = TVector<TTokenRule>;

    // Description of a regular-expression pattern to be turned into a
    // TTokenMatcher by Compile.
    struct TRegexPattern {
        TString Body; // Main pattern text
        // Additional pattern, empty by default.
        // NOTE(review): presumably a pattern that must follow Body without
        // being part of the match (lookahead) -- TODO confirm in Compile.
        TString After = "";
        bool IsCaseInsensitive = false; // Case-insensitive matching flag; off by default
    };

    // Builds a token matcher from the given regex pattern description.
    // The definition lives outside this header.
    TTokenMatcher Compile(const TRegexPattern& regex);

    // Creates a lexer for the given grammar and returns it as a
    // reference-counted pointer. The grammar is taken by value, so callers
    // may std::move it in to avoid a copy.
    IGenericLexer::TPtr MakeGenericLexer(TGenericLexerGrammar grammar);

    // Convenience wrapper: tokenizes `text` with `lexer` and returns the
    // tokens as a vector.
    //
    // The returned tokens contain TStringBuf views; keep the storage they
    // refer to alive while the tokens are in use.
    // NOTE(review): presumably that storage is `text` and the lexer's
    // grammar -- confirm against the definition.
    // NOTE(review): `lexer` is taken by non-const reference, so a temporary
    // TPtr cannot be passed; a const reference may suffice -- check the
    // definition before changing the signature.
    TVector<TGenericToken> Tokenize(IGenericLexer::TPtr& lexer, TStringBuf text);

} // namespace NSQLTranslationV1