diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/tokenizer.cpp | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/tokenizer.cpp')
-rw-r--r-- | library/cpp/tokenizer/tokenizer.cpp | 78 |
1 files changed, 78 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/tokenizer.cpp b/library/cpp/tokenizer/tokenizer.cpp new file mode 100644 index 00000000000..948271a2857 --- /dev/null +++ b/library/cpp/tokenizer/tokenizer.cpp @@ -0,0 +1,78 @@ +#ifndef CATBOOST_OPENSOURCE +#include <library/cpp/charset/wide.h> +#endif + +#include <util/charset/wide.h> +#include <util/memory/tempbuf.h> + +#include "sentbreakfilter.h" +#include "nlpparser.h" +#include "tokenizer.h" + +#include <util/stream/file.h> + +void TNlpTokenizer::Tokenize(const wchar16* str, + size_t size, + const TTokenizerOptions& opts) { + bool semicolonBreaksSentence = opts.LangMask == TLangMask(LANG_GRE); + TSentBreakFilter sentBreakFilter(opts.LangMask); + THolder<TNlpParser> parser; + switch (opts.Version) { + case 2: + parser = MakeHolder<TVersionedNlpParser<2>>(TokenHandler, sentBreakFilter, Buffer, opts.SpacePreserve, + BackwardCompatible, semicolonBreaksSentence, opts.UrlDecode); + break; + case 3: + parser = MakeHolder<TVersionedNlpParser<3>>(TokenHandler, sentBreakFilter, Buffer, opts.SpacePreserve, + BackwardCompatible, semicolonBreaksSentence, opts.UrlDecode, opts.KeepAffixes); + break; + default: + parser = MakeHolder<TDefaultNlpParser>(TokenHandler, sentBreakFilter, Buffer, opts.SpacePreserve, + BackwardCompatible, semicolonBreaksSentence, opts.UrlDecode); + break; + } + try { + parser->Execute(str, size, &TextStart); + } catch (const ITokenHandler::TAllDoneException&) { + // do nothing + } +} + +#ifndef CATBOOST_OPENSOURCE +void TNlpTokenizer::Tokenize(const char* text, + size_t len, + bool spacePreserve, + TLangMask langMask) { + TCharTemp buf(len); + wchar16* const data = buf.Data(); + CharToWide(text, len, data, csYandex); + TTokenizerOptions opts {spacePreserve, langMask, /*decodeUrl=*/true}; + Tokenize(data, len, opts); +} +#endif + +void TNlpTokenizer::Tokenize(const wchar16* str, + size_t size, + bool spacePreserve, + TLangMask langMask) { + TTokenizerOptions opts {spacePreserve, langMask, /*decodeUrl=*/true}; + Tokenize(str, size, opts); +} + +bool IsSpecialTokenizerSymbol(const TWtringBuf s) { + if (s.size() != 1) { + return false; + } + // Only base-plane codepoints can be special tokenizer symbols, + // and they can be just casted to wchar32. + // Unicode conversion will be needed to process surrogate pairs. + return IsSpecialTokenizerSymbol(static_cast<wchar32>(s[0])); +} + +bool IsAsciiEmojiPart(const TWtringBuf s) { + // no worries for surrogates here because of Ascii + for (auto c : s) + if (!IsAsciiEmojiPart(c)) + return false; + return true; +} |