diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/split.h | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/split.h')
-rw-r--r-- | library/cpp/tokenizer/split.h | 35 |
1 files changed, 35 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/split.h b/library/cpp/tokenizer/split.h new file mode 100644 index 00000000000..11f57f7f0ee --- /dev/null +++ b/library/cpp/tokenizer/split.h @@ -0,0 +1,35 @@ +#pragma once + +#include <library/cpp/enumbitset/enumbitset.h> +#include <library/cpp/langmask/langmask.h> +#include <library/cpp/token/nlptypes.h> + +#include <util/generic/bitmap.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> + +struct TTokenizerSplitParams { +public: + typedef TEnumBitSet<NLP_TYPE, NLP_END, NLP_MISCTEXT + 1> THandledMask; + static const THandledMask WORDS; + static const THandledMask NOT_PUNCT; + +public: + TTokenizerSplitParams(){}; + + TTokenizerSplitParams(const THandledMask& mask) + : HandledMask(mask){}; + +public: + /// Token types to handle, not used in SplitIntoSentences + THandledMask HandledMask = WORDS; + + /// Tokenizer params, see tokenizer.h for detailed explanation + bool BackwardCompatibility = true; + bool SpacePreserve = false; + TLangMask TokenizerLangMask; + bool UrlDecode = true; +}; + +TVector<TUtf16String> SplitIntoTokens(const TUtf16String& text, const TTokenizerSplitParams& params = TTokenizerSplitParams()); +TVector<TUtf16String> SplitIntoSentences(const TUtf16String& text, const TTokenizerSplitParams& params = TTokenizerSplitParams()); |