diff options
author | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
commit | b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch) | |
tree | 2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/tokenizer/tokenizer.h | |
parent | 559174a9144de40d6bb3997ea4073c82289b4974 (diff) | |
download | ydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz |
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/tokenizer/tokenizer.h')
-rw-r--r-- | library/cpp/tokenizer/tokenizer.h | 133 |
1 files changed, 0 insertions, 133 deletions
diff --git a/library/cpp/tokenizer/tokenizer.h b/library/cpp/tokenizer/tokenizer.h
deleted file mode 100644
index ade09751f55..00000000000
--- a/library/cpp/tokenizer/tokenizer.h
+++ /dev/null
@@ -1,133 +0,0 @@
-#pragma once
-
-#include <library/cpp/token/nlptypes.h>
-#include <library/cpp/langmask/langmask.h>
-#include <library/cpp/token/token_structure.h>
-
-#include <util/system/defaults.h>
-#include <util/generic/yexception.h>
-#include <util/generic/noncopyable.h>
-
-#include <cassert>
-#include <cstdlib>
-
-class ITokenHandler {
-public:
-    // Исключение, которое может кидаться обработчиком из OnToken.
-    // Токенайзер проглатывает такое исключение и прекращает токенизацию
-    class TAllDoneException: public yexception {
-    public:
-        TAllDoneException() {
-            *this << "Token handler: all done";
-        }
-    };
-
-    virtual void OnToken(const TWideToken& token, size_t origleng, NLP_TYPE type) = 0;
-    virtual ~ITokenHandler() {
-    }
-};
-
-struct TTokenizerOptions {
-    bool SpacePreserve = false;
-    TLangMask LangMask = TLangMask();
-    bool UrlDecode = true;
-    size_t Version = 2;
-    bool KeepAffixes = false; // keep prefix/suffix as part of token
-};
-
-//! breaks up a text into tokens and calls to @c ITokenHandler::OnToken()
-//! @note the tokenizer produces tokens of the following types only:
-//! NLP_WORD, NLP_INTEGER, NLP_FLOAT, NLP_MARK, NLP_SENTBREAK, NLP_PARABREAK, NLP_MISCTEXT.
-class TNlpTokenizer: private TNonCopyable {
-private:
-    ITokenHandler& TokenHandler;
-    const bool BackwardCompatible; //!< tokenizer reproduce old tokenization of marks
-    TTempArray<wchar16> Buffer;
-    const wchar16* TextStart = nullptr;
-
-public:
-    explicit TNlpTokenizer(ITokenHandler& handler, bool backwardCompatible = true)
-        : TokenHandler(handler)
-        , BackwardCompatible(backwardCompatible)
-        , Buffer()
-    {
-    }
-
-    //! the main tokenizing function
-    //! @attention zero-character ('\0') considered as word break, so tokenizer does not stop processing
-    //! of text if it meets such character
-    //! @attention function isn't thread-safe
-    // in case of spacePreserve==false all whitespaces are replaced with space because
-    // browsers normalize whitespaces: "a \t\n\r b" -> "a b" if tag <pre></pre> isn't used
-    // this change fixes incorrect hyphenations without tag <pre>: "HTML-\nfile" is not "HTMLfile"
-    // browser show this text as: "HTML- file"
-    // in case of urlDecode==true firstly tokenizer tries to decode percent encoded text:
-    // "%D1%82%D0%B5%D0%BA%D1%81%D1%82" -> "текст" and then start tokenization.
-    // By default it's true.
-    void Tokenize(const wchar16* text,
-                  size_t len,
-                  const TTokenizerOptions& opts);
-
-    //! all other Tokenize() functions are for backward compatibility
-    void Tokenize(const wchar16* text,
-                  size_t len,
-                  bool spacePreserve = false,
-                  TLangMask langMask = TLangMask());
-
-#ifndef CATBOOST_OPENSOURCE
-    //! converts the text from yandex encoding to unicode and calls to the main tokenizing function
-    void Tokenize(const char* text,
-                  size_t len,
-                  bool spacePreserve = false,
-                  TLangMask langMask = TLangMask());
-#endif
-
-    //! just calls to the main tokenizing function
-    void Tokenize(TWtringBuf text,
-                  bool spacePreserve = false,
-                  TLangMask langMask = TLangMask()) {
-        Tokenize(text.begin(),
-                 text.size(),
-                 spacePreserve,
-                 langMask);
-    }
-
-    //can point to text, Buffer or whatever
-    //set by NlpParser
-    //lifetime of data is min(lifetime(text), lifetime(tokenizer))
-    const wchar16* GetTextStart() const {
-        return TextStart;
-    }
-};
-
-inline bool IsSpecialTokenizerSymbol(wchar32 ch) {
-    return ch >= 128 && NUnicode::CharHasType(ch, (1ULL << Sm_MATH) | (1ULL << Sc_CURRENCY) | (1ULL << So_OTHER));
-}
-
-bool IsSpecialTokenizerSymbol(const TWtringBuf s);
-
-inline bool IsAsciiEmojiPart(wchar32 ch) {
-    return ch < 128 && !IsAlnum(ch);
-}
-
-bool IsAsciiEmojiPart(const TWtringBuf s);
-
-template <class TCallback>
-class TCallbackTokenHandler: public ITokenHandler {
-public:
-    TCallbackTokenHandler(TCallback callback)
-        : Callback(callback)
-    {
-    }
-
-    virtual void OnToken(const TWideToken& token, size_t origleng, NLP_TYPE type) override {
-        Callback(token, origleng, type);
-    }
-private:
-    TCallback Callback;
-};
-
-template <class TCallback>
-TCallbackTokenHandler<TCallback> MakeCallbackTokenHandler(const TCallback& callback) {
-    return TCallbackTokenHandler<TCallback>(callback);
-}