diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/token/token_structure.h | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/token/token_structure.h')
-rw-r--r-- | library/cpp/token/token_structure.h | 306 |
1 files changed, 306 insertions, 0 deletions
diff --git a/library/cpp/token/token_structure.h b/library/cpp/token/token_structure.h new file mode 100644 index 00000000000..f78c536460f --- /dev/null +++ b/library/cpp/token/token_structure.h @@ -0,0 +1,306 @@ +#pragma once + +#include <util/str_stl.h> +#include <util/system/defaults.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/charset/wide.h> + +const size_t MAX_SUBTOKENS = 63; // 128; it must not be greater than number of words in a sentence + +//! @note tokenizer produces multitokens containins TOKEN_WORD and TOKEN_NUMBER subtokens +enum ETokenType { + TOKEN_WORD, + TOKEN_NUMBER, + TOKEN_MARK, //!< token contains letters and digits: a1b2 + TOKEN_FLOAT, //!< '1.23' - single token with no delimiter, length is equal to 4; introduced for backward compatibility + TOKEN_MIXED //!< actually the very last token can have this type, it must be tokenized again; it can contain delimiters, suffixes, different token types +}; + +enum EHyphenType { + HYPHEN_NONE, //!< means that there is no hyphenation + HYPHEN_ORDINARY, //!< an ordinary hyphenation, "-\n" + HYPHEN_SOFT, //!< U+00AD soft hyphen, "­" + HYPHEN_HARD //!< U+2011 non-breaking hyphen, it isn't processed yet, in case of "black-\nwhite" it will be "blackwhite" but it should be "black-white" +}; + +enum ETokenDelim { + TOKDELIM_NULL, //!< means that there is no token delimiter + TOKDELIM_APOSTROPHE, //!< ' 0x27 + TOKDELIM_MINUS, //!< - 0x2D + TOKDELIM_PLUS, //!< + 0x2B + TOKDELIM_UNDERSCORE, //!< _ 0x5F + TOKDELIM_SLASH, //!< / 0x2F + TOKDELIM_AT_SIGN, //!< @ 0x40 + TOKDELIM_DOT, //!< . 0x2E + TOKDELIM_UNKNOWN, //!< used between concatenated multitokens (such delimiters can have length greater than 1 or equal to 0) +}; + +// Position of a single token in the word +struct TCharSpan { + TCharSpan() + : Pos(0) + , Len(0) + , PrefixLen(0) + , SuffixLen(0) + , Type(TOKEN_WORD) + , Hyphen(HYPHEN_NONE) + , TokenDelim(TOKDELIM_NULL) + { + } + + TCharSpan(size_t offset, size_t len, ETokenType type = TOKEN_WORD, ETokenDelim tokenDelim = TOKDELIM_NULL, EHyphenType hyphen = HYPHEN_NONE, + ui16 suffixLen = 0, ui16 prefixLen = 0) + : Pos(offset) + , Len(len) + , PrefixLen(prefixLen) + , SuffixLen(suffixLen) + , Type(type) + , Hyphen(hyphen) + , TokenDelim(tokenDelim) + { + } + + size_t Pos; //!< offset from the beginning of the word to the start of the token + size_t Len; //!< token length, it does not include suffix length + ui16 PrefixLen; //!< Len doesn't include PrefixLen + ui16 SuffixLen; //!< suffix length + ETokenType Type; //!< type of this token + EHyphenType Hyphen; //!< hyphenation after the token, it should be verified by the lemmer and concatenated if the word is correct + ETokenDelim TokenDelim; //!< if true then apostrophe or minus follows this token + + //! @note end position does not include suffix length + inline size_t EndPos() const { + return Pos + Len; + } + + bool operator==(const TCharSpan& span) const { + return ((span.Pos == Pos) && (span.Len == Len) && + (span.PrefixLen == PrefixLen) && (span.SuffixLen == SuffixLen) && + (span.Type == Type) && (span.Hyphen == Hyphen) && (span.TokenDelim == TokenDelim)); + } + + size_t Hash() const { + size_t result = Pos; + result = CombineHashes(result, Len); + result = CombineHashes(result, size_t(PrefixLen)); + result = CombineHashes(result, size_t(SuffixLen)); + result = CombineHashes(result, size_t(Type)); + result = CombineHashes(result, size_t(Hyphen)); + result = CombineHashes(result, THash<size_t>()(TokenDelim)); + return result; + } +}; + +template <class T> +struct THash; + +template <> +struct THash<TCharSpan> { + size_t operator()(const TCharSpan& s) const { + return s.Hash(); + } +}; +// Vector of immutable TCharSpan elements, with upper limit on size: MAX_SUBTOKENS +// Mimics std::vector<TCharSpan> as closely as possible + +class TTokenStructure { +public: + TTokenStructure() { + // Tokens.reserve(MAX_SUBTOKENS); don't reserve memory + } + void push_back(const TCharSpan& s) { + push_back(s.Pos, s.Len, s.Type, s.TokenDelim, s.Hyphen, s.SuffixLen, s.PrefixLen); + } + void push_back(size_t offset, size_t len, ETokenType type = TOKEN_WORD, ETokenDelim tokenDelim = TOKDELIM_NULL, EHyphenType hyphen = HYPHEN_NONE, + ui16 suffixLen = 0, ui16 prefixLen = 0) { + Y_ASSERT(type == TOKEN_WORD || type == TOKEN_NUMBER || type == TOKEN_MARK || type == TOKEN_FLOAT || type == TOKEN_MIXED); + if (Tokens.size() == MAX_SUBTOKENS) { + // no more room - append to the last subtoken + TCharSpan& span = Tokens.back(); + span.Len = len + offset - span.Pos; + // PrefixLen not changed + span.SuffixLen = suffixLen; + span.Type = TOKEN_MIXED; // token gets invalid: it can contain delimiters, suffixes and different token types + span.Hyphen = HYPHEN_NONE; + span.TokenDelim = TOKDELIM_NULL; + } else { + // insert a new one + Tokens.push_back( + TCharSpan(offset, len, type, tokenDelim, hyphen, suffixLen, prefixLen)); + } + } + void pop_back() { + Tokens.pop_back(); + } + const TCharSpan* begin() const { + return Tokens.begin(); + } + const TCharSpan* end() const { + return Tokens.end(); + } + TCharSpan* begin() { + return Tokens.begin(); + } + TCharSpan* end() { + return Tokens.end(); + } + const TCharSpan* cbegin() const { + return Tokens.cbegin(); + } + const TCharSpan* cend() const { + return Tokens.cend(); + } + const TCharSpan& operator[](size_t idx) const { + return Tokens[idx]; + } + TCharSpan& operator[](size_t idx) { + return Tokens[idx]; + } + const TCharSpan& back() const { + return Tokens.back(); + } + TCharSpan& back() { + return Tokens.back(); + } + size_t size() const { + return Tokens.size(); + } + void resize(size_t newsize) { + Tokens.resize(Min(newsize, MAX_SUBTOKENS)); + } + size_t capacity() const { + return MAX_SUBTOKENS; + } + bool empty() const { + return Tokens.empty(); + } + void clear() { + Tokens.clear(); + } + void swap(TTokenStructure& other) { + Tokens.swap(other.Tokens); + } + +private: + TVector<TCharSpan> Tokens; +}; + +//! @note actually it is multi-token containing several tokens +struct TWideToken { + const wchar16* Token; + size_t Leng; + TTokenStructure SubTokens; + + TWideToken() + : Token(nullptr) + , Leng(0) + { + } + + TWideToken(const wchar16* s, size_t len) + : Token(s) + , Leng(len) + { + SubTokens.push_back(0, len); + } + + TWideToken(const wchar16* token, size_t len, const TTokenStructure& subTokens) + : Token(token) + , Leng(len) + , SubTokens(subTokens) + { + } + + void Concat(const TWideToken& tok) { + Y_ASSERT(Token && Leng && !SubTokens.empty() && Token + Leng <= tok.Token); + const size_t offset = tok.Token - Token; + Leng = offset + tok.Leng; + SubTokens.back().TokenDelim = TOKDELIM_UNKNOWN; + for (const auto& s : tok.SubTokens) { + SubTokens.push_back(s.Pos + offset, s.Len, s.Type, s.TokenDelim, s.Hyphen, s.SuffixLen, s.PrefixLen); + } + } + + const TWtringBuf Text() const { + return TWtringBuf(Token, Leng); + } + + TWideToken Extract(size_t begin, size_t end) const { + TWideToken ret; + if (end > SubTokens.size()) + end = SubTokens.size(); + if (begin >= end) + return ret; + + const size_t shift = SubTokens[begin].Pos - SubTokens[begin].PrefixLen; + ret.Token = Token + shift; + for (size_t i = begin; i < end; ++i) { + ret.SubTokens.push_back(SubTokens[i]); + ret.SubTokens.back().Pos -= shift; + } + ret.SubTokens.back().Hyphen = HYPHEN_NONE; + ret.SubTokens.back().TokenDelim = TOKDELIM_NULL; + ret.Leng = ret.SubTokens.back().EndPos() + ret.SubTokens.back().SuffixLen; + Y_ASSERT(ret.Leng + shift <= Leng); + return ret; + } +}; + +inline bool operator==(const TWideToken& a, const TWideToken& b) { + if ((a.Leng != b.Leng) || memcmp(a.Token, b.Token, a.Leng * sizeof(wchar16))) + return false; + if (a.SubTokens.size() != b.SubTokens.size()) + return false; + for (size_t i = 0; i < a.SubTokens.size(); ++i) { + if (!(a.SubTokens[i] == b.SubTokens[i])) + return false; + } + return true; +} + +inline bool GetLeftTokenDelim(const TWideToken& tok, size_t index) { + const TTokenStructure& subtokens = tok.SubTokens; + Y_ASSERT(index < subtokens.size()); + if (index == 0) + return false; + const TCharSpan& subtok = subtokens[index]; + const TCharSpan& prev = subtokens[index - 1]; + return (prev.EndPos() + prev.SuffixLen < subtok.Pos - subtok.PrefixLen); +} + +inline wchar16 GetRightTokenDelim(const TWideToken& tok, size_t index) { + const TTokenStructure& subtokens = tok.SubTokens; + Y_ASSERT(index < subtokens.size()); + if (index + 1 >= subtokens.size()) + return 0; + const TCharSpan& subtok = subtokens[index]; + const size_t delimPos = subtok.EndPos() + subtok.SuffixLen; + const TCharSpan& next = subtokens[index + 1]; + return (delimPos == next.Pos - next.PrefixLen ? 0 : tok.Token[delimPos]); +} + +//! @note yc_minus (0x2D) can be 0xB7 that is an ASCII character as well +inline char GetTokenDelimChar(ETokenDelim delim) { + switch (delim) { + case TOKDELIM_APOSTROPHE: + return '\''; + case TOKDELIM_MINUS: + return '-'; + case TOKDELIM_PLUS: + return '+'; + case TOKDELIM_UNDERSCORE: + return '_'; + case TOKDELIM_SLASH: + return '/'; + case TOKDELIM_AT_SIGN: + return '@'; + case TOKDELIM_DOT: + return '.'; + case TOKDELIM_NULL: + case TOKDELIM_UNKNOWN: + default: + Y_ASSERT(!"invalid delimiter"); + return 0; + } +} |