library/cpp/tokenizer/tokenizer.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

#pragma once

#include <library/cpp/token/nlptypes.h>
#include <library/cpp/langmask/langmask.h>
#include <library/cpp/token/token_structure.h>

#include <util/system/defaults.h>
#include <util/generic/yexception.h>
#include <util/generic/noncopyable.h>

#include <cassert>
#include <cstdlib>

class ITokenHandler {
public:
    // Исключение, которое может кидаться обработчиком из OnToken.
    // Токенайзер проглатывает такое исключение и прекращает токенизацию
    class TAllDoneException: public yexception {
    public:
        TAllDoneException() {
            *this << "Token handler: all done";
        }
    };

    virtual void OnToken(const TWideToken& token, size_t origleng, NLP_TYPE type) = 0;
    virtual ~ITokenHandler() {
    }
};

struct TTokenizerOptions {
    bool SpacePreserve = false;
    TLangMask LangMask = TLangMask();
    bool UrlDecode = true;
    size_t Version = 2;
    bool KeepAffixes = false; // keep prefix/suffix as part of token
};

//! breaks up a text into tokens and calls to @c ITokenHandler::OnToken()
//! @note the tokenizer produces tokens of the following types only:
//!       NLP_WORD, NLP_INTEGER, NLP_FLOAT, NLP_MARK, NLP_SENTBREAK, NLP_PARABREAK, NLP_MISCTEXT.
class TNlpTokenizer: private TNonCopyable {
private:
    ITokenHandler& TokenHandler;
    const bool BackwardCompatible; //!< tokenizer reproduce old tokenization of marks
    TTempArray<wchar16> Buffer;
    const wchar16* TextStart = nullptr;

public:
    explicit TNlpTokenizer(ITokenHandler& handler, bool backwardCompatible = true)
        : TokenHandler(handler)
        , BackwardCompatible(backwardCompatible)
        , Buffer()
    {
    }

    //! the main tokenizing function
    //! @attention zero-character ('\0') considered as word break, so tokenizer does not stop processing
    //!            of text if it meets such character
    //! @attention function isn't thread-safe
    //    in case of spacePreserve==false all whitespaces are replaced with space because
    //    browsers normalize whitespaces: "a \t\n\r b" -> "a b" if tag <pre></pre> isn't used
    //    this change fixes incorrect hyphenations without tag <pre>: "HTML-\nfile" is not "HTMLfile"
    //    browser show this text as: "HTML- file"
    //    in case of urlDecode==true firstly tokenizer tries to decode percent encoded text:
    //    "%D1%82%D0%B5%D0%BA%D1%81%D1%82" -> "текст" and then start tokenization.
    //    By default it's true.
    void Tokenize(const wchar16* text,
                  size_t len,
                  const TTokenizerOptions& opts);

    //! all other Tokenize() functions are for backward compatibility
    void Tokenize(const wchar16* text,
                  size_t len,
                  bool spacePreserve = false,
                  TLangMask langMask = TLangMask());

#ifndef CATBOOST_OPENSOURCE
    //! converts the text from yandex encoding to unicode and calls to the main tokenizing function
    void Tokenize(const char* text,
                  size_t len,
                  bool spacePreserve = false,
                  TLangMask langMask = TLangMask());
#endif

    //! just calls to the main tokenizing function
    void Tokenize(TWtringBuf text,
                  bool spacePreserve = false,
                  TLangMask langMask = TLangMask()) {
        Tokenize(text.begin(),
                 text.size(),
                 spacePreserve,
                 langMask);
    }

    //can point to text, Buffer or whatever
    //set by NlpParser
    //lifetime of data is min(lifetime(text), lifetime(tokenizer))
    const wchar16* GetTextStart() const {
        return TextStart;
    }
};

inline bool IsSpecialTokenizerSymbol(wchar32 ch) {
    return ch >= 128 && NUnicode::CharHasType(ch, (1ULL << Sm_MATH) | (1ULL << Sc_CURRENCY) | (1ULL << So_OTHER));
}

bool IsSpecialTokenizerSymbol(const TWtringBuf s);

inline bool IsAsciiEmojiPart(wchar32 ch) {
    return ch < 128 && !IsAlnum(ch);
}

bool IsAsciiEmojiPart(const TWtringBuf s);

template <class TCallback>
class TCallbackTokenHandler: public ITokenHandler {
    public:
        TCallbackTokenHandler(TCallback callback)
            : Callback(callback)
        {
        }

        virtual void OnToken(const TWideToken& token, size_t origleng, NLP_TYPE type) override {
            Callback(token, origleng, type);
        }
    private:
        TCallback Callback;
};

template <class TCallback>
TCallbackTokenHandler<TCallback> MakeCallbackTokenHandler(const TCallback& callback) {
    return TCallbackTokenHandler<TCallback>(callback);
}