diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/token/token_util.h | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/token/token_util.h')
-rw-r--r-- | library/cpp/token/token_util.h | 30 |
1 files changed, 30 insertions, 0 deletions
diff --git a/library/cpp/token/token_util.h b/library/cpp/token/token_util.h new file mode 100644 index 00000000000..7383bdc840e --- /dev/null +++ b/library/cpp/token/token_util.h @@ -0,0 +1,30 @@ +#pragma once + +#include "token_structure.h" + +#include <util/system/yassert.h> +#include <util/generic/string.h> + +TUtf16String RemoveWideTokenPrefix(TWideToken& token); +TUtf16String RemoveWideTokenSuffix(TWideToken& token); + +// Check if we can split wide-token after specified sub-token. +// The function doesn't allow to split on dash and apostrophe between normal words +bool CheckWideTokenSplit(const TWideToken& token, size_t pos); +// Check if we can split wide-token after specified sub-token by dot delimiter. +// The function verifies the following condition: +// <word> <dot> <word with uppercased first character or number> +// <number> <dot> <word with uppercased first character> +bool CheckWideTokenDotSplit(const TWideToken& token, size_t pos); + +// Check if we can split wide-token after specified sub-token. +// The function uses rich-tree specific heuristics +bool CheckWideTokenReqSplit(const TTokenStructure& subtokens, size_t pos); + +inline size_t GetSubTokenOffset(const TWideToken& tok, size_t subToken) { + Y_ASSERT(subToken < tok.SubTokens.size()); + return tok.SubTokens[subToken].Pos - tok.SubTokens[subToken].PrefixLen; +} + +// Create a new wide-token with the specified inclusive [start, end] sub-token range +TWideToken ExtractWideTokenRange(const TWideToken& tok, size_t start, size_t end); |