diff options
author | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
commit | b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch) | |
tree | 2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/tokenizer/split.h | |
parent | 559174a9144de40d6bb3997ea4073c82289b4974 (diff) | |
download | ydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz |
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/tokenizer/split.h')
-rw-r--r-- | library/cpp/tokenizer/split.h | 35 |
1 files changed, 0 insertions, 35 deletions
```diff
diff --git a/library/cpp/tokenizer/split.h b/library/cpp/tokenizer/split.h
deleted file mode 100644
index 11f57f7f0ee..00000000000
--- a/library/cpp/tokenizer/split.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-
-#include <library/cpp/enumbitset/enumbitset.h>
-#include <library/cpp/langmask/langmask.h>
-#include <library/cpp/token/nlptypes.h>
-
-#include <util/generic/bitmap.h>
-#include <util/generic/string.h>
-#include <util/generic/vector.h>
-
-struct TTokenizerSplitParams {
-public:
-    typedef TEnumBitSet<NLP_TYPE, NLP_END, NLP_MISCTEXT + 1> THandledMask;
-    static const THandledMask WORDS;
-    static const THandledMask NOT_PUNCT;
-
-public:
-    TTokenizerSplitParams(){};
-
-    TTokenizerSplitParams(const THandledMask& mask)
-        : HandledMask(mask){};
-
-public:
-    /// Token types to handle, not used in SplitIntoSentences
-    THandledMask HandledMask = WORDS;
-
-    /// Tokenizer params, see tokenizer.h for detailed explanation
-    bool BackwardCompatibility = true;
-    bool SpacePreserve = false;
-    TLangMask TokenizerLangMask;
-    bool UrlDecode = true;
-};
-
-TVector<TUtf16String> SplitIntoTokens(const TUtf16String& text, const TTokenizerSplitParams& params = TTokenizerSplitParams());
-TVector<TUtf16String> SplitIntoSentences(const TUtf16String& text, const TTokenizerSplitParams& params = TTokenizerSplitParams());
```