aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/multitokenutil.h
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
committerqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
commit22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
treebffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/multitokenutil.h
parent332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
downloadydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/multitokenutil.h')
-rw-r--r--library/cpp/tokenizer/multitokenutil.h22
1 files changed, 22 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/multitokenutil.h b/library/cpp/tokenizer/multitokenutil.h
new file mode 100644
index 00000000000..19781f67858
--- /dev/null
+++ b/library/cpp/tokenizer/multitokenutil.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <library/cpp/token/nlptypes.h>
+#include <library/cpp/token/token_structure.h>
+
+void CorrectDelimiters(TCharSpan& prevtok, wchar16 suffixChar, TCharSpan& lasttok, wchar16 prefixChar); // NOTE(review): undocumented; both spans are taken by non-const reference, so presumably they are adjusted in place - confirm against the implementation
+
+//! removes hyphenations and replaces unicode delimiters
+//! @return an NLP_TYPE value; NOTE(review): the new length of the multitoken is presumably written back through @c len rather than returned - confirm
+NLP_TYPE PrepareMultitoken(TTokenStructure& subtokens, wchar16* buffer, size_t buflen, const wchar16* entry, size_t& len);
+
+//! cuts off the subtokens according to the specified maximum length (@c maxLen)
+//! @return new length of the subtokens
+size_t AdjustSubtokens(TTokenStructure& subtokens, size_t maxLen);
+
+//! corrects positions of subtokens and cuts off their length according to the specified maximum (@c maxLen)
+//! @note the first @c n characters are accents
+//! @return new length of the subtokens
+size_t AdjustSubtokens(TTokenStructure& subtokens, size_t n, size_t maxLen);
+
+//! for debugging purposes only; NOTE(review): the meaning of the bool result is not documented here - presumably true means the multitoken is consistent, confirm
+bool CheckMultitoken(const TWideToken& tok);