diff options
author | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
commit | b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch) | |
tree | 2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/tokenizer/tokenizer.cpp | |
parent | 559174a9144de40d6bb3997ea4073c82289b4974 (diff) | |
download | ydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz |
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/tokenizer/tokenizer.cpp')
-rw-r--r-- | library/cpp/tokenizer/tokenizer.cpp | 78 |
1 files changed, 0 insertions, 78 deletions
diff --git a/library/cpp/tokenizer/tokenizer.cpp b/library/cpp/tokenizer/tokenizer.cpp deleted file mode 100644 index 948271a2857..00000000000 --- a/library/cpp/tokenizer/tokenizer.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef CATBOOST_OPENSOURCE -#include <library/cpp/charset/wide.h> -#endif - -#include <util/charset/wide.h> -#include <util/memory/tempbuf.h> - -#include "sentbreakfilter.h" -#include "nlpparser.h" -#include "tokenizer.h" - -#include <util/stream/file.h> - -void TNlpTokenizer::Tokenize(const wchar16* str, - size_t size, - const TTokenizerOptions& opts) { - bool semicolonBreaksSentence = opts.LangMask == TLangMask(LANG_GRE); - TSentBreakFilter sentBreakFilter(opts.LangMask); - THolder<TNlpParser> parser; - switch (opts.Version) { - case 2: - parser = MakeHolder<TVersionedNlpParser<2>>(TokenHandler, sentBreakFilter, Buffer, opts.SpacePreserve, - BackwardCompatible, semicolonBreaksSentence, opts.UrlDecode); - break; - case 3: - parser = MakeHolder<TVersionedNlpParser<3>>(TokenHandler, sentBreakFilter, Buffer, opts.SpacePreserve, - BackwardCompatible, semicolonBreaksSentence, opts.UrlDecode, opts.KeepAffixes); - break; - default: - parser = MakeHolder<TDefaultNlpParser>(TokenHandler, sentBreakFilter, Buffer, opts.SpacePreserve, - BackwardCompatible, semicolonBreaksSentence, opts.UrlDecode); - break; - } - try { - parser->Execute(str, size, &TextStart); - } catch (const ITokenHandler::TAllDoneException&) { - // do nothing - } -} - -#ifndef CATBOOST_OPENSOURCE -void TNlpTokenizer::Tokenize(const char* text, - size_t len, - bool spacePreserve, - TLangMask langMask) { - TCharTemp buf(len); - wchar16* const data = buf.Data(); - CharToWide(text, len, data, csYandex); - TTokenizerOptions opts {spacePreserve, langMask, /*decodeUrl=*/true}; - Tokenize(data, len, opts); -} -#endif - -void TNlpTokenizer::Tokenize(const wchar16* str, - size_t size, - bool spacePreserve, - TLangMask langMask) { - TTokenizerOptions opts {spacePreserve, langMask, /*decodeUrl=*/true}; - Tokenize(str, size, opts); -} - -bool IsSpecialTokenizerSymbol(const TWtringBuf s) { - if (s.size() != 1) { - return false; - } - // Only base-plane codepoints can be special tokenizer symbols, - // and they can be just casted to wchar32. - // Unicode conversion will be needed to process surrogate pairs. - return IsSpecialTokenizerSymbol(static_cast<wchar32>(s[0])); -} - -bool IsAsciiEmojiPart(const TWtringBuf s) { - // no worries for surrogates here because of Ascii - for (auto c : s) - if (!IsAsciiEmojiPart(c)) - return false; - return true; -} |