author    qrort <qrort@yandex-team.com>  2022-11-30 23:47:12 +0300
committer qrort <qrort@yandex-team.com>  2022-11-30 23:47:12 +0300
commit    22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree      bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/split.cpp
parent    332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download  ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/split.cpp')
-rw-r--r--  library/cpp/tokenizer/split.cpp | 80
1 file changed, 80 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/split.cpp b/library/cpp/tokenizer/split.cpp
new file mode 100644
index 00000000000..d6f1345aad1
--- /dev/null
+++ b/library/cpp/tokenizer/split.cpp
@@ -0,0 +1,80 @@
+#include "split.h"
+
+#include "tokenizer.h"
+
+namespace {
+ class TSimpleTokenHandler: public ITokenHandler {
+ public:
+ TSimpleTokenHandler(TVector<TUtf16String>* outTokens, const TTokenizerSplitParams& params)
+ : Tokens(outTokens)
+ , Params(params)
+ {
+ }
+
+ void OnToken(const TWideToken& token, size_t, NLP_TYPE type) override {
+ if (!Params.HandledMask.SafeTest(type)) {
+ return;
+ }
+
+ Tokens->push_back(TUtf16String(token.Token, token.Leng));
+ }
+
+ private:
+ TVector<TUtf16String>* Tokens;
+ TTokenizerSplitParams Params;
+ };
+
+ class TSimpleSentenceHandler: public ITokenHandler {
+ public:
+ TSimpleSentenceHandler(TVector<TUtf16String>* sentences)
+ : Sentences(sentences)
+ {
+ }
+
+ void OnToken(const TWideToken& token, size_t, NLP_TYPE type) override {
+ CurToken += token.Text();
+
+ if (type == NLP_SENTBREAK || type == NLP_PARABREAK) {
+ Flush();
+ }
+ }
+
+ void Flush() {
+ if (!CurToken.empty()) {
+ Sentences->push_back(CurToken);
+
+ CurToken = TUtf16String();
+ }
+ }
+
+ private:
+ TUtf16String CurToken;
+ TVector<TUtf16String>* Sentences;
+ };
+}
+
+const TTokenizerSplitParams::THandledMask TTokenizerSplitParams::WORDS(NLP_WORD);
+const TTokenizerSplitParams::THandledMask TTokenizerSplitParams::NOT_PUNCT(NLP_WORD, NLP_INTEGER, NLP_FLOAT, NLP_MARK);
+
+TVector<TUtf16String> SplitIntoTokens(const TUtf16String& text, const TTokenizerSplitParams& params) {
+ TVector<TUtf16String> words;
+
+ TSimpleTokenHandler handler(&words, params);
+ TNlpTokenizer tokenizer(handler, params.BackwardCompatibility);
+ TTokenizerOptions opts { params.SpacePreserve, params.TokenizerLangMask, params.UrlDecode };
+ tokenizer.Tokenize(text.data(), text.size(), opts);
+
+ return words;
+}
+
+TVector<TUtf16String> SplitIntoSentences(const TUtf16String& text, const TTokenizerSplitParams& params) {
+ TVector<TUtf16String> sentences;
+
+ TSimpleSentenceHandler handler(&sentences);
+ TNlpTokenizer tokenizer(handler, params.BackwardCompatibility);
+ TTokenizerOptions opts { params.SpacePreserve, params.TokenizerLangMask, params.UrlDecode };
+ tokenizer.Tokenize(text.data(), text.size(), opts);
+ handler.Flush();
+
+ return sentences;
+}
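
Usage note (not part of the commit): a minimal sketch of how the two new helpers could be called. It assumes the declarations in the accompanying split.h (not shown in this diff) accept a default-constructed TTokenizerSplitParams whose public fields match the ones split.cpp reads (HandledMask and friends), and it uses the standard Arcadia util helpers UTF8ToWide/WideToUTF8 and the Cout stream:

    #include <library/cpp/tokenizer/split.h>

    #include <util/charset/wide.h>
    #include <util/stream/output.h>

    int main() {
        const TUtf16String text = UTF8ToWide("Hello, world! And a second sentence.");

        // Token-level split: only NLP types set in HandledMask are kept.
        // Assumes THandledMask is copy-assignable; NOT_PUNCT keeps words,
        // integers, floats and marks, per the constants defined in split.cpp.
        TTokenizerSplitParams params;
        params.HandledMask = TTokenizerSplitParams::NOT_PUNCT;
        for (const TUtf16String& token : SplitIntoTokens(text, params)) {
            Cout << WideToUTF8(token) << Endl;
        }

        // Sentence-level split: text accumulates until NLP_SENTBREAK or
        // NLP_PARABREAK, then the buffered sentence is flushed.
        for (const TUtf16String& sentence : SplitIntoSentences(text, params)) {
            Cout << WideToUTF8(sentence) << Endl;
        }
        return 0;
    }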