diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/multitokenparser.h | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/multitokenparser.h')
-rw-r--r-- | library/cpp/tokenizer/multitokenparser.h | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/multitokenparser.h b/library/cpp/tokenizer/multitokenparser.h new file mode 100644 index 0000000000..0c5da68537 --- /dev/null +++ b/library/cpp/tokenizer/multitokenparser.h @@ -0,0 +1,203 @@ +#pragma once + +#include <library/cpp/token/nlptypes.h> +#include <library/cpp/token/token_structure.h> +#include "multitokenutil.h" + +// NOTE: this class inherited by request tokenizers only +class TMultitokenParser { + TWideToken Multitoken; //!< length of the whole token can include suffixes, if difference between pos and endpos of two tokens is greater than 1 then there is suffix + TCharSpan CurCharSpan; + wchar16 PrefixChar; + wchar16 SuffixChar; + +protected: + TMultitokenParser() + : PrefixChar(0) + , SuffixChar(0) + { + } + + //! used for deferring tokens in TReqTokenizer + void SetMultitoken(const TWideToken& tok) { + Multitoken = tok; + } + + const TWideToken& GetMultitoken() const { + return Multitoken; + } + + //! @param token start of the multitoken + //! @param len length includes suffix: 'ab-c+' length is equal to 5, in case of prefix operator: '-!abc' length is equal to 3 + //! and prefix: '#hashtag', '@user_name' + void SetMultitoken(const wchar16* token, size_t len) { + TTokenStructure& subtokens = Multitoken.SubTokens; + if (!subtokens.empty()) { + // positions of the first subtoken can be non-zero in the request parser + // but 'token' in this case must point to the first character of the first subtoken + const size_t pos = subtokens[0].Pos - subtokens[0].PrefixLen; + Y_ASSERT((subtokens.back().EndPos() - pos + subtokens.back().SuffixLen) == len); + if (pos) { + const size_t n = subtokens.size(); + for (size_t i = 0; i < n; ++i) + subtokens[i].Pos -= pos; + } + } + Multitoken.Token = token; + Multitoken.Leng = len; + + Y_ASSERT(CheckMultitoken(Multitoken)); + } + + const TTokenStructure& GetSubtokens() const { + return Multitoken.SubTokens; + } + + void BeginToken(const wchar16* tokstart, const wchar16* p) { + // it can be called twice in case "exa­́mple", see nlptok.rl, mixedtoken, tokfirst/toknext, numfirst/numnext + if (CurCharSpan.Len == 0) + CurCharSpan.Pos = p - tokstart; + } + + void BeginToken(const wchar16* tokstart, const wchar16* p, ETokenType type) { + BeginToken(tokstart, p); + CurCharSpan.Type = type; + } + + void UpdateToken() { + CurCharSpan.Len += 1; + } + + void AddToken() { + Y_ASSERT(CurCharSpan.Len); + Y_ASSERT(CurCharSpan.Type == TOKEN_WORD || CurCharSpan.Type == TOKEN_NUMBER); + + TTokenStructure& tokens = Multitoken.SubTokens; + + // @todo if number of tokens is greater than 64 then the last token can consist of numbers, letters and delimiters... + tokens.push_back(CurCharSpan); + + const size_t n = tokens.size(); + if (n > 1) + CorrectDelimiters(tokens[n - 2], SuffixChar, tokens[n - 1], PrefixChar); + + CurCharSpan.Pos = 0; + CurCharSpan.Len = 0; // it is checked in AddLastToken() + CurCharSpan.PrefixLen = 0; + PrefixChar = 0; + SuffixChar = 0; + } + + void AddIdeograph(size_t len) { + Y_ASSERT(!CurCharSpan.Len && (len == 1 || len == 2)); + TTokenStructure& tokens = Multitoken.SubTokens; + Y_ASSERT(tokens.empty()); + tokens.push_back(0, len, TOKEN_WORD); + } + + void AddLastToken(const wchar16* tokstart, const wchar16* tokend) { + // - CurCharSpan.Len assigned to 0 in AddToken() because in case of multitoken with '.' at the end, for + // example: " well-formed. " parser already called to %add_token because '.' can be delimiter of the next token + if (CurCharSpan.Len) { + const wchar16* const actualStart = tokstart + CurCharSpan.Pos; + // for ex. "5% " can have (actualStart == tokend) because '%' could be part of the next token with utf8 characters + if (actualStart < tokend) { + const size_t actualLen = tokend - actualStart; + if (CurCharSpan.Len != actualLen) // for example "WORD% NEXTWORD" - '%' could be part of UTF8 encoded character and already counted... + CurCharSpan.Len = actualLen; + AddToken(); + } else + CancelToken(); + } else + CancelToken(); + + TTokenStructure& tokens = Multitoken.SubTokens; + if (!tokens.empty()) + tokens.back().TokenDelim = TOKDELIM_NULL; // reset delimiter if any + } + + //! correct the last token if it contains words and numbers and changes length of multitoken + //! @param len length of multitoken (including all subtokens), for ex. (te - ts) + //! @return true if the last token is valid, false - last token is cut off and length is changed + //! @note in case of '+!abc-...-xyz' length includes '+!', this function doesn't take into account offset of the first subtoken + //! if number of subtokens equal to 63 all superfluous subtokens are put into the last subtoken TOKEN_MIXED + //! which is cut off in this function + bool CheckLastToken(size_t& len) { + TTokenStructure& tokens = Multitoken.SubTokens; + if (tokens.size() == MAX_SUBTOKENS && tokens.back().Type == TOKEN_MIXED) { + tokens.pop_back(); + // change delimiter (+) to suffix + TCharSpan& lasttok = tokens.back(); + len = lasttok.EndPos() + lasttok.SuffixLen; + // actually '+' should be added if subtoken has suffix '+' because '++' is valid suffix as well + if (lasttok.TokenDelim == TOKDELIM_PLUS && lasttok.SuffixLen == 0) { + lasttok.SuffixLen = 1; + len += 1; + } + lasttok.TokenDelim = TOKDELIM_NULL; + return false; + } + Y_ASSERT(tokens.empty() || tokens.back().TokenDelim == TOKDELIM_NULL); + return true; + } + + //! @return result of CheckLastToken() + //! @note positions of the first subtoken can be non-zero in case: +abc, -!xyz, + //! tokstart in this cases can point to + and - respectively, + //! SetMultitoken() resets position of the first subtoken to 0 + //! @param pos old position value of the first subtoken before resetting it to 0 + //! @param len new len of multitoken after cutting off the last subtoken if it is invalid + //! @note in case of '+!abc-efg' returned 'len' includes the prefix operators '+!' and is equal to 9 + bool SetRequestMultitoken(const wchar16* tokstart, const wchar16* tokend, size_t& len) { + AddLastToken(tokstart, tokend); + const TTokenStructure& subtokens = GetSubtokens(); + Y_ASSERT(!subtokens.empty()); + const TCharSpan& firsttok = subtokens[0]; + const TCharSpan& lasttok = subtokens.back(); + len = lasttok.EndPos() + lasttok.SuffixLen; // can't use (te - ts) because postfix can be there + const bool res = CheckLastToken(len); + const size_t firsttokStart = firsttok.Pos - firsttok.PrefixLen; + SetMultitoken(tokstart + firsttokStart, len - firsttokStart); + return res; + } + + void UpdatePrefix(wchar16 c) { + Y_ASSERT(c == '#' || c == '@' || c == '$'); + CurCharSpan.PrefixLen = 1; // length of prefix can't be more than 1 + PrefixChar = c; + } + + void UpdateSuffix(wchar16 c) { + Y_ASSERT(c == '#' || c == '+'); + TTokenStructure& tokens = Multitoken.SubTokens; + if (!tokens.empty()) { + tokens.back().SuffixLen += 1; + SuffixChar = c; + } else + Y_ASSERT(!"can't update suffix: no subtokens"); + } + + void CancelToken() { + // example: "abc 5% def", '%' can be the first symbol of utf8 encoded character so token is started by call to BeginToken() + // and then UpdateToken() is called as well but there is no call to AddToken() because '%' is interpreted as a misc character so + // CurCharSpan.Len must be reset + CurCharSpan.Len = 0; + CurCharSpan.PrefixLen = 0; + PrefixChar = 0; + SuffixChar = 0; + } + + void ClearSubtokens() { + Multitoken.SubTokens.clear(); + } + + //! @param delim type of delimiter + //! @param c delimiter unicode character + void SetTokenDelim(ETokenDelim delim, wchar16 /*c*/) { + Y_ASSERT(!Multitoken.SubTokens.empty()); + Multitoken.SubTokens.back().TokenDelim = delim; + // @todo remove this condition because unicode delimiters are removed before lemmatization + // if (c >= 0x7F) // if it is non-ASCII character + // SimpleMultitoken = false; + } +}; |