diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/tokenizer.h | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/tokenizer.h')
-rw-r--r-- | library/cpp/tokenizer/tokenizer.h | 133 |
1 files changed, 133 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/tokenizer.h b/library/cpp/tokenizer/tokenizer.h new file mode 100644 index 00000000000..ade09751f55 --- /dev/null +++ b/library/cpp/tokenizer/tokenizer.h @@ -0,0 +1,133 @@ +#pragma once + +#include <library/cpp/token/nlptypes.h> +#include <library/cpp/langmask/langmask.h> +#include <library/cpp/token/token_structure.h> + +#include <util/system/defaults.h> +#include <util/generic/yexception.h> +#include <util/generic/noncopyable.h> + +#include <cassert> +#include <cstdlib> + +class ITokenHandler { +public: + // Исключение, которое может кидаться обработчиком из OnToken. + // Токенайзер проглатывает такое исключение и прекращает токенизацию + class TAllDoneException: public yexception { + public: + TAllDoneException() { + *this << "Token handler: all done"; + } + }; + + virtual void OnToken(const TWideToken& token, size_t origleng, NLP_TYPE type) = 0; + virtual ~ITokenHandler() { + } +}; + +struct TTokenizerOptions { + bool SpacePreserve = false; + TLangMask LangMask = TLangMask(); + bool UrlDecode = true; + size_t Version = 2; + bool KeepAffixes = false; // keep prefix/suffix as part of token +}; + +//! breaks up a text into tokens and calls to @c ITokenHandler::OnToken() +//! @note the tokenizer produces tokens of the following types only: +//! NLP_WORD, NLP_INTEGER, NLP_FLOAT, NLP_MARK, NLP_SENTBREAK, NLP_PARABREAK, NLP_MISCTEXT. +class TNlpTokenizer: private TNonCopyable { +private: + ITokenHandler& TokenHandler; + const bool BackwardCompatible; //!< tokenizer reproduce old tokenization of marks + TTempArray<wchar16> Buffer; + const wchar16* TextStart = nullptr; + +public: + explicit TNlpTokenizer(ITokenHandler& handler, bool backwardCompatible = true) + : TokenHandler(handler) + , BackwardCompatible(backwardCompatible) + , Buffer() + { + } + + //! the main tokenizing function + //! @attention zero-character ('\0') considered as word break, so tokenizer does not stop processing + //! of text if it meets such character + //! @attention function isn't thread-safe + // in case of spacePreserve==false all whitespaces are replaced with space because + // browsers normalize whitespaces: "a \t\n\r b" -> "a b" if tag <pre></pre> isn't used + // this change fixes incorrect hyphenations without tag <pre>: "HTML-\nfile" is not "HTMLfile" + // browser show this text as: "HTML- file" + // in case of urlDecode==true firstly tokenizer tries to decode percent encoded text: + // "%D1%82%D0%B5%D0%BA%D1%81%D1%82" -> "текст" and then start tokenization. + // By default it's true. + void Tokenize(const wchar16* text, + size_t len, + const TTokenizerOptions& opts); + + //! all other Tokenize() functions are for backward compatibility + void Tokenize(const wchar16* text, + size_t len, + bool spacePreserve = false, + TLangMask langMask = TLangMask()); + +#ifndef CATBOOST_OPENSOURCE + //! converts the text from yandex encoding to unicode and calls to the main tokenizing function + void Tokenize(const char* text, + size_t len, + bool spacePreserve = false, + TLangMask langMask = TLangMask()); +#endif + + //! just calls to the main tokenizing function + void Tokenize(TWtringBuf text, + bool spacePreserve = false, + TLangMask langMask = TLangMask()) { + Tokenize(text.begin(), + text.size(), + spacePreserve, + langMask); + } + + //can point to text, Buffer or whatever + //set by NlpParser + //lifetime of data is min(lifetime(text), lifetime(tokenizer)) + const wchar16* GetTextStart() const { + return TextStart; + } +}; + +inline bool IsSpecialTokenizerSymbol(wchar32 ch) { + return ch >= 128 && NUnicode::CharHasType(ch, (1ULL << Sm_MATH) | (1ULL << Sc_CURRENCY) | (1ULL << So_OTHER)); +} + +bool IsSpecialTokenizerSymbol(const TWtringBuf s); + +inline bool IsAsciiEmojiPart(wchar32 ch) { + return ch < 128 && !IsAlnum(ch); +} + +bool IsAsciiEmojiPart(const TWtringBuf s); + +template <class TCallback> +class TCallbackTokenHandler: public ITokenHandler { + public: + TCallbackTokenHandler(TCallback callback) + : Callback(callback) + { + } + + virtual void OnToken(const TWideToken& token, size_t origleng, NLP_TYPE type) override { + Callback(token, origleng, type); + } + private: + TCallback Callback; +}; + +template <class TCallback> +TCallbackTokenHandler<TCallback> MakeCallbackTokenHandler(const TCallback& callback) { + return TCallbackTokenHandler<TCallback>(callback); +} |