aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/tokenizer.cpp
diff options
context:
space:
mode:
author: qrort <qrort@yandex-team.com> 2022-12-02 11:31:25 +0300
committer: qrort <qrort@yandex-team.com> 2022-12-02 11:31:25 +0300
commit: b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch)
tree: 2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/tokenizer/tokenizer.cpp
parent: 559174a9144de40d6bb3997ea4073c82289b4974 (diff)
download: ydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/tokenizer/tokenizer.cpp')
-rw-r--r-- library/cpp/tokenizer/tokenizer.cpp 78
1 files changed, 0 insertions, 78 deletions
diff --git a/library/cpp/tokenizer/tokenizer.cpp b/library/cpp/tokenizer/tokenizer.cpp
deleted file mode 100644
index 948271a2857..00000000000
--- a/library/cpp/tokenizer/tokenizer.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef CATBOOST_OPENSOURCE
-#include <library/cpp/charset/wide.h>
-#endif
-
-#include <util/charset/wide.h>
-#include <util/memory/tempbuf.h>
-
-#include "sentbreakfilter.h"
-#include "nlpparser.h"
-#include "tokenizer.h"
-
-#include <util/stream/file.h>
-
-// Runs an NLP parser over `size` wide characters starting at `str`.
-// The parser type is chosen from opts.Version: 2 and 3 select the matching
-// TVersionedNlpParser, any other value selects TDefaultNlpParser.
-void TNlpTokenizer::Tokenize(const wchar16* str,
-                             size_t size,
-                             const TTokenizerOptions& opts) {
-    // A semicolon is treated as a sentence break only when the language mask
-    // is exactly TLangMask(LANG_GRE).
-    bool semicolonIsSentBreak = opts.LangMask == TLangMask(LANG_GRE);
-    TSentBreakFilter breakFilter(opts.LangMask);
-
-    THolder<TNlpParser> nlpParser;
-    if (opts.Version == 2) {
-        nlpParser = MakeHolder<TVersionedNlpParser<2>>(
-            TokenHandler, breakFilter, Buffer, opts.SpacePreserve,
-            BackwardCompatible, semicolonIsSentBreak, opts.UrlDecode);
-    } else if (opts.Version == 3) {
-        // Only the version 3 parser additionally receives opts.KeepAffixes.
-        nlpParser = MakeHolder<TVersionedNlpParser<3>>(
-            TokenHandler, breakFilter, Buffer, opts.SpacePreserve,
-            BackwardCompatible, semicolonIsSentBreak, opts.UrlDecode, opts.KeepAffixes);
-    } else {
-        nlpParser = MakeHolder<TDefaultNlpParser>(
-            TokenHandler, breakFilter, Buffer, opts.SpacePreserve,
-            BackwardCompatible, semicolonIsSentBreak, opts.UrlDecode);
-    }
-
-    try {
-        nlpParser->Execute(str, size, &TextStart);
-    } catch (const ITokenHandler::TAllDoneException&) {
-        // Intentionally swallowed; presumably the handler's way of stopping
-        // the parse early (confirm against ITokenHandler).
-    }
-}
-
-#ifndef CATBOOST_OPENSOURCE
-// Narrow-text overload: converts `len` chars of `text` to wide characters
-// with CharToWide using csYandex, then tokenizes the result with URL decoding
-// switched on. The wide buffer holds `len` elements and `len` is passed on
-// unchanged as the wide length.
-void TNlpTokenizer::Tokenize(const char* text,
-                             size_t len,
-                             bool spacePreserve,
-                             TLangMask langMask) {
-    TCharTemp wideText(len);
-    wchar16* const wideBegin = wideText.Data();
-    CharToWide(text, len, wideBegin, csYandex);
-
-    TTokenizerOptions options {spacePreserve, langMask, /*decodeUrl=*/true};
-    Tokenize(wideBegin, len, options);
-}
-#endif
-
-// Convenience overload: packs the flags into TTokenizerOptions (with URL
-// decoding switched on) and forwards to the options-based Tokenize.
-void TNlpTokenizer::Tokenize(const wchar16* str,
-                             size_t size,
-                             bool spacePreserve,
-                             TLangMask langMask) {
-    Tokenize(str, size, TTokenizerOptions {spacePreserve, langMask, /*decodeUrl=*/true});
-}
-
-// Returns true when `s` consists of exactly one code unit and that unit,
-// widened to wchar32, is a special tokenizer symbol.
-bool IsSpecialTokenizerSymbol(const TWtringBuf s) {
-    // Special tokenizer symbols all live in the base plane, so a plain cast of
-    // the single code unit to wchar32 is enough. Handling surrogate pairs
-    // would require a real Unicode conversion.
-    return s.size() == 1 && IsSpecialTokenizerSymbol(static_cast<wchar32>(s[0]));
-}
-
-// Returns true when every code unit of `s` passes the single-character
-// IsAsciiEmojiPart check; an empty buffer therefore yields true.
-bool IsAsciiEmojiPart(const TWtringBuf s) {
-    // Surrogates need no special handling here: only ASCII can match.
-    for (size_t pos = 0; pos < s.size(); ++pos) {
-        auto unit = s[pos];
-        if (!IsAsciiEmojiPart(unit)) {
-            return false;
-        }
-    }
-    return true;
-}