diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/multitokenutil.h | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/multitokenutil.h')
-rw-r--r-- | library/cpp/tokenizer/multitokenutil.h | 22 |
1 files changed, 22 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/multitokenutil.h b/library/cpp/tokenizer/multitokenutil.h
new file mode 100644
index 00000000000..19781f67858
--- /dev/null
+++ b/library/cpp/tokenizer/multitokenutil.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <library/cpp/token/nlptypes.h>
+#include <library/cpp/token/token_structure.h>
+
+void CorrectDelimiters(TCharSpan& prevtok, wchar16 suffixChar, TCharSpan& lasttok, wchar16 prefixChar);
+
+//! removes hyphenations and replaces unicode delimiters
+//! @return an NLP_TYPE value (the declared return type); NOTE(review): the new multitoken length is presumably reported through @c len - confirm against the implementation
+NLP_TYPE PrepareMultitoken(TTokenStructure& subtokens, wchar16* buffer, size_t buflen, const wchar16* entry, size_t& len);
+
+//! cuts off the subtokens according to the specified maximum length
+//! @return new length of the subtokens
+size_t AdjustSubtokens(TTokenStructure& subtokens, size_t maxLen);
+
+//! corrects positions of subtokens and cuts off their length according to the specified maximum
+//! @note the first @c n characters are accents
+//! @return new length of the subtokens
+size_t AdjustSubtokens(TTokenStructure& subtokens, size_t n, size_t maxLen);
+
+//! for debugging purposes only
+bool CheckMultitoken(const TWideToken& tok); |