diff options
author | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
commit | b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch) | |
tree | 2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/tokenizer/split.cpp | |
parent | 559174a9144de40d6bb3997ea4073c82289b4974 (diff) | |
download | ydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz |
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/tokenizer/split.cpp')
-rw-r--r-- | library/cpp/tokenizer/split.cpp | 80 |
1 files changed, 0 insertions, 80 deletions
diff --git a/library/cpp/tokenizer/split.cpp b/library/cpp/tokenizer/split.cpp
deleted file mode 100644
index d6f1345aad1..00000000000
--- a/library/cpp/tokenizer/split.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-#include "split.h"
-
-#include "tokenizer.h"
-
-namespace {
-    class TSimpleTokenHandler: public ITokenHandler {
-    public:
-        TSimpleTokenHandler(TVector<TUtf16String>* outTokens, const TTokenizerSplitParams& params)
-            : Tokens(outTokens)
-            , Params(params)
-        {
-        }
-
-        void OnToken(const TWideToken& token, size_t, NLP_TYPE type) override {
-            if (!Params.HandledMask.SafeTest(type)) {
-                return;
-            }
-
-            Tokens->push_back(TUtf16String(token.Token, token.Leng));
-        }
-
-    private:
-        TVector<TUtf16String>* Tokens;
-        TTokenizerSplitParams Params;
-    };
-
-    class TSimpleSentenceHandler: public ITokenHandler {
-    public:
-        TSimpleSentenceHandler(TVector<TUtf16String>* sentences)
-            : Sentences(sentences)
-        {
-        }
-
-        void OnToken(const TWideToken& token, size_t, NLP_TYPE type) override {
-            CurToken += token.Text();
-
-            if (type == NLP_SENTBREAK || type == NLP_PARABREAK) {
-                Flush();
-            }
-        }
-
-        void Flush() {
-            if (!CurToken.empty()) {
-                Sentences->push_back(CurToken);
-
-                CurToken = TUtf16String();
-            }
-        }
-
-    private:
-        TUtf16String CurToken;
-        TVector<TUtf16String>* Sentences;
-    };
-}
-
-const TTokenizerSplitParams::THandledMask TTokenizerSplitParams::WORDS(NLP_WORD);
-const TTokenizerSplitParams::THandledMask TTokenizerSplitParams::NOT_PUNCT(NLP_WORD, NLP_INTEGER, NLP_FLOAT, NLP_MARK);
-
-TVector<TUtf16String> SplitIntoTokens(const TUtf16String& text, const TTokenizerSplitParams& params) {
-    TVector<TUtf16String> words;
-
-    TSimpleTokenHandler handler(&words, params);
-    TNlpTokenizer tokenizer(handler, params.BackwardCompatibility);
-    TTokenizerOptions opts { params.SpacePreserve, params.TokenizerLangMask, params.UrlDecode };
-    tokenizer.Tokenize(text.data(), text.size(), opts);
-
-    return words;
-}
-
-TVector<TUtf16String> SplitIntoSentences(const TUtf16String& text, const TTokenizerSplitParams& params) {
-    TVector<TUtf16String> sentences;
-
-    TSimpleSentenceHandler handler(&sentences);
-    TNlpTokenizer tokenizer(handler, params.BackwardCompatibility);
-    TTokenizerOptions opts { params.SpacePreserve, params.TokenizerLangMask, params.UrlDecode };
-    tokenizer.Tokenize(text.data(), text.size(), opts);
-    handler.Flush();
-
-    return sentences;
-}