aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/tokenizer.cpp
diff options
context:
space:
mode:
author qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
committer qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
commit 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/tokenizer.cpp
parent 332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/tokenizer.cpp')
-rw-r--r-- library/cpp/tokenizer/tokenizer.cpp 78
1 files changed, 78 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/tokenizer.cpp b/library/cpp/tokenizer/tokenizer.cpp
new file mode 100644
index 00000000000..948271a2857
--- /dev/null
+++ b/library/cpp/tokenizer/tokenizer.cpp
@@ -0,0 +1,78 @@
+#ifndef CATBOOST_OPENSOURCE
+#include <library/cpp/charset/wide.h>
+#endif
+
+#include <util/charset/wide.h>
+#include <util/memory/tempbuf.h>
+
+#include "sentbreakfilter.h"
+#include "nlpparser.h"
+#include "tokenizer.h"
+
+#include <util/stream/file.h>
+
+// Runs the NLP parser selected by opts.Version over `size` wchar16 units starting at `str`.
+void TNlpTokenizer::Tokenize(const wchar16* str, size_t size, const TTokenizerOptions& opts) {
+    // The semicolon flag is raised only when the language mask equals TLangMask(LANG_GRE).
+    const bool greekSemicolon = (opts.LangMask == TLangMask(LANG_GRE));
+    TSentBreakFilter breakFilter(opts.LangMask);
+    THolder<TNlpParser> nlpParser;
+    switch (opts.Version) {
+        case 2:
+            nlpParser = MakeHolder<TVersionedNlpParser<2>>(
+                TokenHandler, breakFilter, Buffer, opts.SpacePreserve, BackwardCompatible, greekSemicolon, opts.UrlDecode);
+            break;
+        case 3: // the only variant that also receives opts.KeepAffixes
+            nlpParser = MakeHolder<TVersionedNlpParser<3>>(
+                TokenHandler, breakFilter, Buffer, opts.SpacePreserve, BackwardCompatible, greekSemicolon, opts.UrlDecode, opts.KeepAffixes);
+            break;
+        default: // every other version value falls back to the default parser
+            nlpParser = MakeHolder<TDefaultNlpParser>(
+                TokenHandler, breakFilter, Buffer, opts.SpacePreserve, BackwardCompatible, greekSemicolon, opts.UrlDecode);
+            break;
+    }
+    try {
+        nlpParser->Execute(str, size, &TextStart);
+    } catch (const ITokenHandler::TAllDoneException&) {
+        // Deliberately swallowed: this exception ends tokenization without being reported as an error.
+    }
+}
+
+#ifndef CATBOOST_OPENSOURCE
+// Widens `len` bytes of `text` using the csYandex charset and tokenizes the result.
+void TNlpTokenizer::Tokenize(const char* text, size_t len, bool spacePreserve, TLangMask langMask) {
+    // The temporary buffer is sized by `len`; the same count is reused below as the wide length.
+    TCharTemp wideBuf(len);
+    wchar16* const wide = wideBuf.Data();
+    CharToWide(text, len, wide, csYandex);
+    // The third option is always passed as true by this overload.
+    const TTokenizerOptions options{spacePreserve, langMask, /*decodeUrl=*/true};
+    Tokenize(wide, len, options);
+}
+#endif
+
+// Convenience overload: wraps the two flags into TTokenizerOptions and delegates.
+void TNlpTokenizer::Tokenize(const wchar16* str, size_t size, bool spacePreserve, TLangMask langMask) {
+    // The third option is always passed as true by this overload.
+    const TTokenizerOptions options{spacePreserve, langMask, /*decodeUrl=*/true};
+    // Hand off to the overload that accepts TTokenizerOptions.
+    Tokenize(str, size, options);
+}
+
+// Returns true only when `s` holds exactly one code unit and that unit,
+// taken as a wchar32, is a special tokenizer symbol.
+bool IsSpecialTokenizerSymbol(const TWtringBuf s) {
+    // Only base-plane code points can be special tokenizer symbols, so the
+    // single wchar16 can simply be cast to wchar32. A real Unicode conversion
+    // would be needed to process surrogate pairs; other lengths are rejected.
+    const bool isSingleUnit = (s.size() == 1);
+    return isSingleUnit && IsSpecialTokenizerSymbol(static_cast<wchar32>(s[0]));
+}
+
+// True when every unit of `s` passes the per-character IsAsciiEmojiPart (ASCII only, so surrogates are moot).
+bool IsAsciiEmojiPart(const TWtringBuf s) {
+    for (auto symbol : s) {
+        if (!IsAsciiEmojiPart(symbol)) { return false; }
+    }
+    return true; // also reached when `s` is empty
+}