aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/split.h
diff options
context:
space:
mode:
author qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
committer qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
commit 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/split.h
parent 332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/split.h')
-rw-r--r-- library/cpp/tokenizer/split.h 35
1 files changed, 35 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/split.h b/library/cpp/tokenizer/split.h
new file mode 100644
index 00000000000..11f57f7f0ee
--- /dev/null
+++ b/library/cpp/tokenizer/split.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <library/cpp/enumbitset/enumbitset.h>
+#include <library/cpp/langmask/langmask.h>
+#include <library/cpp/token/nlptypes.h>
+
+#include <util/generic/bitmap.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+
+struct TTokenizerSplitParams { // Options bundle accepted by SplitIntoTokens() and SplitIntoSentences().
+public:
+ typedef TEnumBitSet<NLP_TYPE, NLP_END, NLP_MISCTEXT + 1> THandledMask; // Presumably a bit set over NLP_TYPE values; confirm the bounds against enumbitset.h.
+ static const THandledMask WORDS; // Declared only; the definition is not in this header. Used as the default for HandledMask.
+ static const THandledMask NOT_PUNCT; // Declared only; the definition is not in this header.
+
+public:
+ TTokenizerSplitParams(){}; // Leaves every field at its in-class default.
+
+ TTokenizerSplitParams(const THandledMask& mask) // Not explicit, so a THandledMask converts implicitly to params.
+ : HandledMask(mask){}; // Overrides HandledMask only; the remaining fields keep their in-class defaults.
+
+public:
+ /// Token types to handle (defaults to WORDS); not used in SplitIntoSentences
+ THandledMask HandledMask = WORDS;
+
+ /// Tokenizer params below; see tokenizer.h for a detailed explanation of each
+ bool BackwardCompatibility = true;
+ bool SpacePreserve = false;
+ TLangMask TokenizerLangMask; // No in-class initializer: default-constructed.
+ bool UrlDecode = true;
+};
+
+TVector<TUtf16String> SplitIntoTokens(const TUtf16String& text, const TTokenizerSplitParams& params = TTokenizerSplitParams()); // Declaration only; params defaults to a default-constructed TTokenizerSplitParams.
+TVector<TUtf16String> SplitIntoSentences(const TUtf16String& text, const TTokenizerSplitParams& params = TTokenizerSplitParams()); // Declaration only; per the field comment, params.HandledMask is not used here.