aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/split.cpp
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-12-02 11:31:25 +0300
committerqrort <qrort@yandex-team.com>2022-12-02 11:31:25 +0300
commitb1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch)
tree2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/tokenizer/split.cpp
parent559174a9144de40d6bb3997ea4073c82289b4974 (diff)
downloadydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/tokenizer/split.cpp')
-rw-r--r--library/cpp/tokenizer/split.cpp80
1 file changed, 0 insertions, 80 deletions
diff --git a/library/cpp/tokenizer/split.cpp b/library/cpp/tokenizer/split.cpp
deleted file mode 100644
index d6f1345aad1..00000000000
--- a/library/cpp/tokenizer/split.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-#include "split.h"
-
-#include "tokenizer.h"
-
-namespace {
- class TSimpleTokenHandler: public ITokenHandler {
- public:
- TSimpleTokenHandler(TVector<TUtf16String>* outTokens, const TTokenizerSplitParams& params)
- : Tokens(outTokens)
- , Params(params)
- {
- }
-
- void OnToken(const TWideToken& token, size_t, NLP_TYPE type) override {
- if (!Params.HandledMask.SafeTest(type)) {
- return;
- }
-
- Tokens->push_back(TUtf16String(token.Token, token.Leng));
- }
-
- private:
- TVector<TUtf16String>* Tokens;
- TTokenizerSplitParams Params;
- };
-
- class TSimpleSentenceHandler: public ITokenHandler {
- public:
- TSimpleSentenceHandler(TVector<TUtf16String>* sentences)
- : Sentences(sentences)
- {
- }
-
- void OnToken(const TWideToken& token, size_t, NLP_TYPE type) override {
- CurToken += token.Text();
-
- if (type == NLP_SENTBREAK || type == NLP_PARABREAK) {
- Flush();
- }
- }
-
- void Flush() {
- if (!CurToken.empty()) {
- Sentences->push_back(CurToken);
-
- CurToken = TUtf16String();
- }
- }
-
- private:
- TUtf16String CurToken;
- TVector<TUtf16String>* Sentences;
- };
-}
-
// Predefined token-type masks for TTokenizerSplitParams:
// WORDS keeps only word tokens; NOT_PUNCT keeps words, integers, floats
// and marks — i.e. everything except punctuation/break tokens.
const TTokenizerSplitParams::THandledMask TTokenizerSplitParams::WORDS(NLP_WORD);
const TTokenizerSplitParams::THandledMask TTokenizerSplitParams::NOT_PUNCT(NLP_WORD, NLP_INTEGER, NLP_FLOAT, NLP_MARK);
-
-TVector<TUtf16String> SplitIntoTokens(const TUtf16String& text, const TTokenizerSplitParams& params) {
- TVector<TUtf16String> words;
-
- TSimpleTokenHandler handler(&words, params);
- TNlpTokenizer tokenizer(handler, params.BackwardCompatibility);
- TTokenizerOptions opts { params.SpacePreserve, params.TokenizerLangMask, params.UrlDecode };
- tokenizer.Tokenize(text.data(), text.size(), opts);
-
- return words;
-}
-
-TVector<TUtf16String> SplitIntoSentences(const TUtf16String& text, const TTokenizerSplitParams& params) {
- TVector<TUtf16String> sentences;
-
- TSimpleSentenceHandler handler(&sentences);
- TNlpTokenizer tokenizer(handler, params.BackwardCompatibility);
- TTokenizerOptions opts { params.SpacePreserve, params.TokenizerLangMask, params.UrlDecode };
- tokenizer.Tokenize(text.data(), text.size(), opts);
- handler.Flush();
-
- return sentences;
-}