validate canons without yatest_common

author: qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
committer: qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
commit: 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree: bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/token/token_util.h
parent: 332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download: ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
1 files changed, 30 insertions, 0 deletions
diff --git a/library/cpp/token/token_util.h b/library/cpp/token/token_util.h
new file mode 100644
index 00000000000..7383bdc840e
--- /dev/null
+++ b/library/cpp/token/token_util.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "token_structure.h"
+
+#include <util/system/yassert.h>
+#include <util/generic/string.h>
+
+TUtf16String RemoveWideTokenPrefix(TWideToken& token);
+TUtf16String RemoveWideTokenSuffix(TWideToken& token);
+
+// Check if we can split wide-token after specified sub-token.
+// The function doesn't allow splitting on a dash or an apostrophe between normal words
+bool CheckWideTokenSplit(const TWideToken& token, size_t pos);
+// Check if we can split wide-token after specified sub-token by dot delimiter.
+// The function verifies the following condition:
+// <word> <dot> <word with uppercased first character or number>
+// <number> <dot> <word with uppercased first character>
+bool CheckWideTokenDotSplit(const TWideToken& token, size_t pos);
+
+// Check if we can split wide-token after specified sub-token.
+// The function uses rich-tree specific heuristics
+bool CheckWideTokenReqSplit(const TTokenStructure& subtokens, size_t pos);
+
+inline size_t GetSubTokenOffset(const TWideToken& tok, size_t subToken) { // Returns Pos minus PrefixLen of the sub-token at index subToken.
+    Y_ASSERT(subToken < tok.SubTokens.size()); // subToken is expected to be a valid index into tok.SubTokens
+    return tok.SubTokens[subToken].Pos - tok.SubTokens[subToken].PrefixLen; // presumably the offset where the sub-token's prefix begins -- TODO confirm against token_structure.h
+}
+
+// Create a new wide-token with the specified inclusive [start, end] sub-token range
+TWideToken ExtractWideTokenRange(const TWideToken& tok, size_t start, size_t end);
author	qrort <qrort@yandex-team.com>	2022-11-30 23:47:12 +0300
committer	qrort <qrort@yandex-team.com>	2022-11-30 23:47:12 +0300
commit	22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree	bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/token/token_util.h
parent	332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download	ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz