| author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
|---|---|---|
| committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
| commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
| tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/multitokenutil.cpp | |
| parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
| download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz | |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/multitokenutil.cpp')
-rw-r--r-- | library/cpp/tokenizer/multitokenutil.cpp | 278 |
1 file changed, 278 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/multitokenutil.cpp b/library/cpp/tokenizer/multitokenutil.cpp
new file mode 100644
index 00000000000..6d3a537bdde
--- /dev/null
+++ b/library/cpp/tokenizer/multitokenutil.cpp
@@ -0,0 +1,278 @@
+#include <util/system/maxlen.h>
+#include "multitokenutil.h"
+
+namespace {
+    inline void GlueTokens(TCharSpan* desttok, const TCharSpan* srctok) {
+        Y_ASSERT(srctok->PrefixLen == 0);
+        desttok->Len += srctok->Len;
+        desttok->SuffixLen = srctok->SuffixLen;
+        desttok->Hyphen = srctok->Hyphen; // the next token can have hyphenation
+        desttok->TokenDelim = srctok->TokenDelim;
+    }
+
+    inline void CopyToken(TCharSpan* desttok, const TCharSpan* srctok, size_t destpos) {
+        Y_ASSERT(srctok->PrefixLen == 0);
+        desttok->Pos = destpos;
+        desttok->Len = srctok->Len;
+        desttok->SuffixLen = srctok->SuffixLen;
+        desttok->Type = srctok->Type;
+        desttok->Hyphen = srctok->Hyphen; // the next token can have hyphenation
+        desttok->TokenDelim = srctok->TokenDelim;
+    }
+
+    //! processes hyphenation between subtokens if any
+    inline void ProcessHyphenation(wchar16* buffer, const wchar16* entry, TCharSpan*& desttok, const TCharSpan* srctok, size_t& destpos, size_t& srcpos) {
+        // Check if the space contains hyphenation, and copy the separator
+        if (desttok->Hyphen == HYPHEN_ORDINARY) {
+            if (desttok->Type == TOKEN_WORD && srctok->Type == TOKEN_WORD) {
+                // glue two adjacent tokens if both are TOKEN_WORD
+                srcpos = srctok->Pos;
+                GlueTokens(desttok, srctok);
+            } else {
+                // copy delimiter only - remove all following whitespace characters, for example: "sepa- \n rator" (see nlptok.rl, hyphenation/nlfasblank)
+                wchar16* const dest = buffer + destpos;
+                const wchar16* const src = entry + srcpos; // srcpos - the beginning of source delimiter
+                *dest = *src;
+                desttok->TokenDelim = TOKDELIM_MINUS;
+                ++destpos;
+                srcpos = srctok->Pos; // keep it before CopyToken()
+                CopyToken(++desttok, srctok, destpos);
+            }
+        } else if (desttok->Hyphen == HYPHEN_SOFT) {
+            if (desttok->Type == srctok->Type) {
+                // glue two adjacent tokens if both have the same type
+                srcpos = srctok->Pos;
+                GlueTokens(desttok, srctok);
+            } else {
+                // delimiter isn't copied
+                srcpos = srctok->Pos; // keep it before CopyToken()
+                CopyToken(++desttok, srctok, destpos);
+            }
+        } else {
+            Y_ASSERT(desttok->Hyphen == HYPHEN_NONE); // HYPHEN_HARD isn't processed yet
+            Y_ASSERT(srctok->Pos >= srcpos);
+            if (srctok->Pos > srcpos) {
+                // copy the delimiter and create the next token
+                const size_t seplen = srctok->Pos - srcpos; // srcpos - the beginning of source delimiter
+
+                // copy suffix with delimiter
+                std::char_traits<wchar16>::copy(buffer + destpos, entry + srcpos, seplen);
+                destpos += seplen;
+
+                srcpos = srctok->Pos; // keep it before CopyToken()
+            }
+            CopyToken(++desttok, srctok, destpos);
+        }
+    }
+
+    //! @param len length of text to be copied (it can include suffix length)
+    inline void CopyTokenText(wchar16* buffer, const wchar16* entry, size_t len, size_t& destpos, size_t& srcpos) {
+        std::char_traits<wchar16>::copy(buffer + destpos, entry + srcpos, len); // srcpos - the beginning of source token
+        srcpos += len;
+        destpos += len;
+    }
+
+    inline bool CutSubtoken(TCharSpan& s, size_t maxLen) {
+        if (s.EndPos() > maxLen) {
+            s.Len = maxLen - s.Pos;
+            s.SuffixLen = 0;
+            return true;
+        } else if (s.EndPos() + s.SuffixLen > maxLen) {
+            s.SuffixLen = maxLen - s.EndPos();
+            return true;
+        }
+        return false;
+    }
+}
+
+void CorrectDelimiters(TCharSpan& prevtok, wchar16 suffixChar, TCharSpan& lasttok, wchar16 prefixChar) {
+    // correct suffix length of the previous token
+    switch (lasttok.Pos - prevtok.EndPos()) {
+        case 0:
+            Y_ASSERT(prevtok.SuffixLen == 0 && prevtok.TokenDelim == TOKDELIM_NULL && lasttok.PrefixLen == 0);
+            break;
+        case 1: // only delimiter allowed
+            if (prevtok.SuffixLen == 1 && prevtok.TokenDelim == TOKDELIM_PLUS) // 'a+b'
+                prevtok.SuffixLen = 0;
+            Y_ASSERT(
+                (prevtok.SuffixLen == 0 && prevtok.TokenDelim != TOKDELIM_NULL && lasttok.PrefixLen == 0) ||
+                (prevtok.SuffixLen == 0 && prevtok.TokenDelim == TOKDELIM_NULL && lasttok.PrefixLen == 0 && prevtok.Hyphen == HYPHEN_SOFT));
+            break;
+        case 2: // only suffix + delimiter OR delimiter + prefix
+            if (prevtok.SuffixLen == 1 && lasttok.PrefixLen == 1) { // 'a+@b' - delim/prefix, 'a#@b' - suffix/delim
+                if (suffixChar == '+') {
+                    prevtok.SuffixLen = 0;
+                    prevtok.TokenDelim = TOKDELIM_PLUS;
+                } else if (suffixChar == '#') {
+                    Y_ASSERT(prefixChar == '@'); // only '@' can be delimiter and prefix
+                    prevtok.TokenDelim = TOKDELIM_AT_SIGN;
+                    lasttok.PrefixLen = 0;
+                }
+            } else if (prevtok.SuffixLen == 2) {
+                prevtok.SuffixLen = 1;
+                prevtok.TokenDelim = TOKDELIM_PLUS; // since SuffixLen == 2 can be '++' only
+            }
+            Y_ASSERT(
+                (prevtok.SuffixLen == 1 && prevtok.TokenDelim != TOKDELIM_NULL && lasttok.PrefixLen == 0) ||
+                (prevtok.SuffixLen == 0 && prevtok.TokenDelim != TOKDELIM_NULL && lasttok.PrefixLen == 1) ||
+                (prevtok.SuffixLen == 0 && prevtok.TokenDelim == TOKDELIM_NULL && lasttok.PrefixLen == 0 &&
+                 (prevtok.Hyphen == HYPHEN_ORDINARY || prevtok.Hyphen == HYPHEN_SOFT)));
+            break;
+        case 3:
+            if (prevtok.SuffixLen == 2 && lasttok.PrefixLen == 1) { // a++/b
+                if (prevtok.TokenDelim == TOKDELIM_PLUS) // a++/b
+                    prevtok.SuffixLen = 1;
+                else if (prevtok.TokenDelim == TOKDELIM_AT_SIGN) { // 'a++@b'
+                    prevtok.SuffixLen = 1;
+                    prevtok.TokenDelim = TOKDELIM_PLUS;
+                }
+            }
+            Y_ASSERT(
+                (prevtok.SuffixLen == 1 && prevtok.TokenDelim != TOKDELIM_NULL && lasttok.PrefixLen == 1) ||
+                (prevtok.SuffixLen == 2 && prevtok.TokenDelim != TOKDELIM_NULL && lasttok.PrefixLen == 0) ||
+                (prevtok.SuffixLen == 0 && prevtok.TokenDelim == TOKDELIM_NULL && lasttok.PrefixLen == 0 &&
+                 (prevtok.Hyphen == HYPHEN_ORDINARY || prevtok.Hyphen == HYPHEN_SOFT)));
+            break;
+        case 4:
+            Y_ASSERT(
+                (prevtok.SuffixLen == 2 && prevtok.TokenDelim != TOKDELIM_NULL && lasttok.PrefixLen == 1) ||
+                (prevtok.SuffixLen == 0 && prevtok.TokenDelim == TOKDELIM_NULL && lasttok.PrefixLen == 0 &&
+                 (prevtok.Hyphen == HYPHEN_ORDINARY || prevtok.Hyphen == HYPHEN_SOFT)));
+            break;
+        default:
+            Y_ASSERT(
+                prevtok.SuffixLen == 0 && prevtok.TokenDelim == TOKDELIM_NULL && lasttok.PrefixLen == 0 &&
+                (prevtok.Hyphen == HYPHEN_ORDINARY || prevtok.Hyphen == HYPHEN_SOFT));
+            break;
+    }
+}
+
+NLP_TYPE PrepareMultitoken(TTokenStructure& subtokens, wchar16* buffer, size_t buflen, const wchar16* entry, size_t& len) {
+    size_t srcpos = subtokens[0].EndPos(); // the beginning of source delimiters and tokens
+    if (srcpos > buflen - 1) {
+        srcpos = buflen - 1;
+        subtokens[0].Len = srcpos - subtokens[0].Pos;
+    }
+    size_t destpos = srcpos;
+    std::char_traits<wchar16>::copy(buffer, entry, srcpos);
+
+    TCharSpan* const firsttok = &subtokens[0];
+    TCharSpan* const lasttok = firsttok + subtokens.size();
+    TCharSpan* desttok = firsttok;
+
+    // lengths of tokens are not changed, tokens can be moved only to remove delimiters
+
+    for (const TCharSpan* srctok = firsttok + 1; srctok != lasttok; ++srctok) {
+        // Check available space; truncate input if not sufficient
+        if (srctok->EndPos() >= buflen) {
+            break;
+        }
+
+        if (srctok->EndPos() + srctok->SuffixLen >= buflen) {
+            ProcessHyphenation(buffer, entry, desttok, srctok, destpos, srcpos);
+
+            if (desttok->EndPos() + desttok->SuffixLen >= buflen) {
+                Y_ASSERT(desttok->EndPos() < buflen);
+                desttok->SuffixLen = 0; // cut off the suffix
+
+                CopyTokenText(buffer, entry, srctok->Len, destpos, srcpos); // copy with no suffix
+            } else
+                CopyTokenText(buffer, entry, (srctok->Len + srctok->SuffixLen), destpos, srcpos); // copy the token with suffix
+
+            break;
+        }
+
+        ProcessHyphenation(buffer, entry, desttok, srctok, destpos, srcpos);
+        CopyTokenText(buffer, entry, (srctok->Len + srctok->SuffixLen), destpos, srcpos); // copy the token with suffix
+    }
+
+    Y_ASSERT(srcpos >= destpos);
+
+    // Multitoken->Leng
+    len = destpos;
+    subtokens.resize(desttok - firsttok + 1);
+    subtokens.back().TokenDelim = TOKDELIM_NULL;
+    return DetectNLPType(subtokens); // after PrepareMultitoken nlpType can be changed
+}
+
+size_t AdjustSubtokens(TTokenStructure& subtokens, size_t maxLen) {
+    Y_ASSERT(!subtokens.empty());
+
+    TCharSpan* const first = &subtokens[0];
+    TCharSpan* p = first + subtokens.size() - 1;
+    while (p != first && p->Pos >= maxLen) {
+        --p;
+    }
+
+    CutSubtoken(*p, maxLen);
+    subtokens.resize(p - first + 1);
+    p->TokenDelim = TOKDELIM_NULL;
+    return p->EndPos() + p->SuffixLen;
+}
+
+size_t AdjustSubtokens(TTokenStructure& subtokens, size_t n, size_t maxLen) {
+    Y_ASSERT(n > 0);
+    Y_ASSERT(!subtokens.empty());
+
+    // ````````````````ab``-cd```-ef```
+    // ---------0---------- --1-- --2--
+    //                 ^n
+
+    TCharSpan& first = subtokens[0];
+    Y_ASSERT(first.Len > n);
+    first.Pos = 0;
+    first.Len -= n;
+    if (CutSubtoken(first, maxLen)) {
+        subtokens.resize(1);
+    } else {
+        for (size_t i = 1; i < subtokens.size(); ++i) {
+            TCharSpan& span = subtokens[i];
+
+            Y_ASSERT(span.Pos > n);
+            const size_t newPos = span.Pos - n;
+
+            if (newPos >= maxLen) { // if no symbols left for current token - finish
+                subtokens.resize(i);
+                break;
+            }
+
+            span.Pos = newPos;
+
+            if (CutSubtoken(span, maxLen)) {
+                subtokens.resize(i + 1);
+                break;
+            }
+        }
+    }
+
+    TCharSpan& last = subtokens.back();
+    last.TokenDelim = TOKDELIM_NULL;
+    return last.EndPos() + last.SuffixLen;
+}
+
+bool CheckMultitoken(const TWideToken& tok) {
+    const TTokenStructure& subtokens = tok.SubTokens;
+    for (size_t i = 0; i < subtokens.size(); ++i) {
+        const TCharSpan& s = subtokens[i];
+        if (!s.Len)
+            Y_ASSERT(false);
+        if (i > 0) {
+            const TCharSpan& prev = subtokens[i - 1];
+            if (prev.EndPos() + prev.SuffixLen > s.Pos - s.PrefixLen)
+                Y_ASSERT(false);
+            else if (prev.EndPos() + prev.SuffixLen == s.Pos - s.PrefixLen && prev.TokenDelim != TOKDELIM_NULL)
+                Y_ASSERT(false);
+        }
+        if (i + 1 < subtokens.size()) {
+            const TCharSpan& next = subtokens[i + 1];
+            if (s.EndPos() + s.SuffixLen > next.Pos - next.PrefixLen)
+                Y_ASSERT(false);
+            else if (s.EndPos() + s.SuffixLen == next.Pos - next.PrefixLen && s.TokenDelim != TOKDELIM_NULL)
+                Y_ASSERT(false);
+        } else {
+            if (s.EndPos() + s.SuffixLen != tok.Leng || s.TokenDelim != TOKDELIM_NULL)
+                Y_ASSERT(false);
+        }
+    }
+    return (subtokens.empty() || tok.Leng == (subtokens.back().EndPos() + subtokens.back().SuffixLen));
+}
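
The new functions are easier to follow with a concrete call in mind, so here is a minimal, hypothetical usage sketch; it is not part of the commit. Only `AdjustSubtokens()` and `CheckMultitoken()` come from this diff: the include path, the way `TCharSpan`/`TTokenStructure`/`TWideToken` are constructed, their default field values, and `push_back()` are assumptions about the surrounding token library.

```cpp
// Hypothetical sketch: clamp a two-subtoken multitoken ("abcde-fgh") to 4 characters.
#include <library/cpp/tokenizer/multitokenutil.h> // assumed header location

#include <util/system/yassert.h>

void AdjustSubtokensSketch() {
    TTokenStructure subtokens; // assumed to behave like a vector of TCharSpan

    TCharSpan first;                   // subtoken "abcde" at [0; 5)
    first.Pos = 0;
    first.Len = 5;
    first.TokenDelim = TOKDELIM_MINUS; // followed by '-'
    subtokens.push_back(first);

    TCharSpan second;                  // subtoken "fgh" at [6; 9)
    second.Pos = 6;
    second.Len = 3;
    subtokens.push_back(second);

    // Clamp to 4 characters: the second subtoken starts past the limit and is dropped,
    // the first one is cut down to 4 characters and its trailing delimiter is cleared.
    const size_t newLen = AdjustSubtokens(subtokens, 4);
    Y_ASSERT(newLen == 4);
    Y_ASSERT(subtokens.size() == 1);
    Y_ASSERT(subtokens[0].Len == 4);
    Y_ASSERT(subtokens[0].TokenDelim == TOKDELIM_NULL);

    // The adjusted structure should satisfy the invariants verified by CheckMultitoken().
    TWideToken tok;            // assumes public Leng/SubTokens members, as read by CheckMultitoken()
    tok.SubTokens = subtokens;
    tok.Leng = newLen;
    Y_ASSERT(CheckMultitoken(tok));
}
```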