aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/token/token_util.h
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
committerqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
commit22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
treebffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/token/token_util.h
parent332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
downloadydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/token/token_util.h')
-rw-r--r--library/cpp/token/token_util.h30
1 files changed, 30 insertions, 0 deletions
diff --git a/library/cpp/token/token_util.h b/library/cpp/token/token_util.h
new file mode 100644
index 00000000000..7383bdc840e
--- /dev/null
+++ b/library/cpp/token/token_util.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "token_structure.h"
+
+#include <util/system/yassert.h>
+#include <util/generic/string.h>
+
+TUtf16String RemoveWideTokenPrefix(TWideToken& token); // token is taken by mutable reference; presumably the prefix is removed in place and returned - TODO confirm
+TUtf16String RemoveWideTokenSuffix(TWideToken& token); // token is taken by mutable reference; presumably the suffix is removed in place and returned - TODO confirm
+
+// Check if we can split wide-token after specified sub-token.
+// The function doesn't allow splitting on a dash or apostrophe between normal words
+bool CheckWideTokenSplit(const TWideToken& token, size_t pos);
+// Check if we can split wide-token after specified sub-token by dot delimiter.
+// The function verifies the following condition:
+// <word> <dot> <word with uppercased first character or number>
+// <number> <dot> <word with uppercased first character>
+bool CheckWideTokenDotSplit(const TWideToken& token, size_t pos);
+
+// Check if we can split wide-token after specified sub-token.
+// The function uses rich-tree specific heuristics
+bool CheckWideTokenReqSplit(const TTokenStructure& subtokens, size_t pos);
+
+// Returns Pos minus PrefixLen for the sub-token at index subToken,
+// presumably the offset at which that sub-token's prefix begins (verify against TCharSpan).
+// subToken must be a valid index into tok.SubTokens; this is checked with Y_ASSERT.
+inline size_t GetSubTokenOffset(const TWideToken& tok, size_t subToken) {
+ Y_ASSERT(subToken < tok.SubTokens.size());
+ const auto& sub = tok.SubTokens[subToken];
+ return sub.Pos - sub.PrefixLen;
+}
+
+// Create a new wide-token with the specified inclusive [start, end] sub-token range
+TWideToken ExtractWideTokenRange(const TWideToken& tok, size_t start, size_t end);