aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/split.h
diff options
context:
space:
mode:
author qrort <qrort@yandex-team.com> 2022-12-02 11:31:25 +0300
committer qrort <qrort@yandex-team.com> 2022-12-02 11:31:25 +0300
commit b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch)
tree 2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/tokenizer/split.h
parent 559174a9144de40d6bb3997ea4073c82289b4974 (diff)
download ydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/tokenizer/split.h')
-rw-r--r-- library/cpp/tokenizer/split.h 35
1 files changed, 0 insertions, 35 deletions
diff --git a/library/cpp/tokenizer/split.h b/library/cpp/tokenizer/split.h
deleted file mode 100644
index 11f57f7f0ee..00000000000
--- a/library/cpp/tokenizer/split.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-
-#include <library/cpp/enumbitset/enumbitset.h>
-#include <library/cpp/langmask/langmask.h>
-#include <library/cpp/token/nlptypes.h>
-
-#include <util/generic/bitmap.h>
-#include <util/generic/string.h>
-#include <util/generic/vector.h>
-
-struct TTokenizerSplitParams { // Options bundle accepted by SplitIntoTokens() and SplitIntoSentences().
-public:
- typedef TEnumBitSet<NLP_TYPE, NLP_END, NLP_MISCTEXT + 1> THandledMask; // Bit set keyed by NLP_TYPE values (NLP_TYPE comes from nlptypes.h; exact range semantics of the bounds: see TEnumBitSet).
- static const THandledMask WORDS; // Predefined mask, defined out of line; presumably selects word-like token types -- confirm in the .cpp.
- static const THandledMask NOT_PUNCT; // Predefined mask, defined out of line; presumably everything except punctuation -- confirm in the .cpp.
-
-public:
- TTokenizerSplitParams(){}; // Leaves every field at its in-class default below. (The ';' after the body is a redundant empty declaration.)
-
- TTokenizerSplitParams(const THandledMask& mask) // Not 'explicit': a THandledMask converts implicitly to TTokenizerSplitParams.
- : HandledMask(mask){}; // Overrides only HandledMask; the remaining fields keep their in-class defaults.
-
-public:
- /// Token types to handle (defaults to WORDS); not used in SplitIntoSentences
- THandledMask HandledMask = WORDS; // NOTE(review): copies the static WORDS at construction time -- confirm no instance is built before WORDS is initialized.
-
- /// Tokenizer params, see tokenizer.h for detailed explanation
- bool BackwardCompatibility = true; // Default: enabled. Meaning is defined by the tokenizer (tokenizer.h), not visible here.
- bool SpacePreserve = false; // Default: disabled. Meaning is defined by the tokenizer (tokenizer.h), not visible here.
- TLangMask TokenizerLangMask; // Default-constructed TLangMask; presumably an empty language set -- confirm in langmask.h.
- bool UrlDecode = true; // Default: enabled. Meaning is defined by the tokenizer (tokenizer.h), not visible here.
-};
-
-TVector<TUtf16String> SplitIntoTokens(const TUtf16String& text, const TTokenizerSplitParams& params = TTokenizerSplitParams()); // Declaration only. Returns a vector of UTF-16 strings derived from text; presumably one entry per token whose type is in params.HandledMask -- implementation not visible here. Omitting params uses a default-constructed TTokenizerSplitParams.
-TVector<TUtf16String> SplitIntoSentences(const TUtf16String& text, const TTokenizerSplitParams& params = TTokenizerSplitParams()); // Declaration only. Returns a vector of UTF-16 strings derived from text; presumably one entry per sentence -- implementation not visible here. Per the HandledMask field comment, params.HandledMask is not used by this function.