aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/nlpparser.cpp
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-12-02 11:31:25 +0300
committerqrort <qrort@yandex-team.com>2022-12-02 11:31:25 +0300
commitb1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch)
tree2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/tokenizer/nlpparser.cpp
parent559174a9144de40d6bb3997ea4073c82289b4974 (diff)
downloadydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/tokenizer/nlpparser.cpp')
-rw-r--r--library/cpp/tokenizer/nlpparser.cpp553
1 files changed, 0 insertions, 553 deletions
diff --git a/library/cpp/tokenizer/nlpparser.cpp b/library/cpp/tokenizer/nlpparser.cpp
deleted file mode 100644
index dca8530c0cf..00000000000
--- a/library/cpp/tokenizer/nlpparser.cpp
+++ /dev/null
@@ -1,553 +0,0 @@
-#include "nlpparser.h"
-#include "sentbreakfilter.h"
-#include "special_tokens.h"
-
-#include <library/cpp/token/charfilter.h>
-#include <library/cpp/token/token_iterator.h>
-
-#include <util/charset/utf8.h>
-#include <util/charset/wide.h>
-#include <util/generic/utility.h>
-
-namespace {
-    const char PERCENT_CHAR = '%';
-
-    //! Skips the leading run of accent symbols in the range [p, p + n).
-    //! Returns a pointer to the first non-accent symbol. If every symbol is an accent
-    //! (or n == 0) the end of the range, p + n, is returned - NOT a null pointer -
-    //! so callers can always compute (result - p) to get the number of skipped symbols.
-    const wchar16* FindNonAccent(const wchar16* p, size_t n) {
-        const TAccents accents;
-        const wchar16* const e = p + n;
-        for (; p != e; ++p) {
-            if (!accents.Check(*p))
-                break;
-        }
-        return p;
-    }
-
-    //! Converts a single hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its value 0..15.
-    //! The caller is expected to pass a valid hex digit (checked in debug builds only).
-    inline char HexToChar(char h) {
-        // the <cctype> classifiers are defined only for values representable as
-        // unsigned char (or EOF), so a possibly negative plain char must be converted first
-        Y_ASSERT(isxdigit(static_cast<unsigned char>(h)));
-        return (
-            h >= 'a' ? h - 'a' + 10 : h >= 'A' ? h - 'A' + 10 : h >= '0' ? h - '0' : 0);
-    }
-}
-
-#ifdef ROBOT_OLDTOK
-// NOTE: this legacy signature does not compile as written - the initializer list below still reads `urlDecode` and `buffer`, which only the #else signature declares
-TNlpParser::TNlpParser(ITokenHandler& handler, TSentBreakFilter& sentBreakFilter, bool spacePreserve, bool, bool)
-#else
-TNlpParser::TNlpParser(ITokenHandler& handler, TSentBreakFilter& sentBreakFilter, TTempArray<wchar16>& buffer, // buffer is kept by reference; Execute() reassigns it when URL decoding needs a writable copy of the text
- bool spacePreserve, bool backwardCompatible, bool semicolonBreaksSentence,
- bool urlDecode)
-#endif
- : TokenHandler(handler) // every produced token is reported to this handler
- , SentenceBreak(nullptr) // no sentence break position recorded yet
- , SentBreakFilter(sentBreakFilter) // sees every token before the handler does
- , OrigText(nullptr) // set by Execute() to the text that is actually parsed
- , Text(nullptr)
- , EndOfText(nullptr)
- , ExtraLenIndex(0) // read position inside ExtraLen, advanced by GetExtraLen()
- , SpacePreserve(spacePreserve) // when false, ConvertTextToCharClasses() maps every whitespace class to CC_SPACE
-#ifdef ROBOT_OLDTOK
- , BackwardCompatible(true) // legacy build: both flags are fixed and the unnamed bool parameters are ignored
- , SemicolonBreaksSentence(false)
-#else
- , BackwardCompatible(backwardCompatible) // selects PassBackwardCompatibleToken() when multitokens are emitted
- , SemicolonBreaksSentence(semicolonBreaksSentence) // ';' is classified as CC_TERM_PUNCT instead of CC_MISC_TEXT
-#endif
- , UrlDecode(urlDecode) // enables %XX decoding in Execute()
- , Buffer(buffer)
-{
-}
-
-//! Closes the token collected in Base with the range [ts, te) and emits every collected
-//! token as a multitoken entry; text lying between two consecutive tokens is emitted
-//! as NLP_MISCTEXT. The collected tokens are reset afterwards.
-void TNlpParser::ProcessMultitoken(const wchar16* ts, const wchar16* te) {
-    Base.AddLastToken(ts, te);
-
-    const size_t total = Base.GetTokenCount();
-    const wchar16* cursor = ts; // where the next piece of output starts
-    size_t lastEnd = 0;         // end offset (relative to ts) of the previous token
-
-    for (size_t idx = 0; idx != total; ++idx) {
-        TParserToken& current = Base.GetToken(idx);
-        if (idx != 0) {
-            // text between the previous token and this one
-            const size_t gap = current.GetStart() - lastEnd;
-            if (gap != 0) {
-                MakeEntry(cursor, gap, NLP_MISCTEXT);
-                cursor += gap;
-            }
-        }
-        lastEnd = current.GetEnd();
-        MakeMultitokenEntry(current, cursor);
-        cursor = ts + lastEnd;
-    }
-
-    Base.ResetTokens();
-}
-
-//! Builds a TWideToken from the parser token located at `entry` and passes it to the
-//! sentence break filter and the token handler (or to the backward compatible path).
-//! Hyphenated tokens are rebuilt in a local buffer by PrepareMultitoken();
-//! other tokens longer than TOKEN_MAX_LEN are cut by CutTooLongMultitoken().
-template<> void TVersionedNlpParser<2>::MakeMultitokenEntry(TParserToken& token, const wchar16* entry) {
-    size_t origLen = token.GetLength();
-
-    token.CorrectPositions();
-    Y_ASSERT(token.GetLength() == origLen); // the correction must not change the length
-    TTokenStructure parts;
-    token.SwapSubtokens(parts);
-
-    const wchar16* emitText = entry;
-    size_t emitLen = origLen;
-    NLP_TYPE nlpType = token.GetNlpType();
-    wchar16 rebuilt[TOKEN_MAX_BUF];
-    if (!token.HasHyphen()) {
-        // only NLP_WORD, NLP_INTEGER, NLP_FLOAT and NLP_MARK can reach the cutting code
-        Y_ASSERT(nlpType == NLP_WORD || nlpType == NLP_INTEGER || nlpType == NLP_FLOAT || nlpType == NLP_MARK);
-        if (emitLen > TOKEN_MAX_LEN) {
-            // entry still points into the original text here
-            CutTooLongMultitoken(parts, entry, emitLen, origLen, nlpType);
-            emitText = entry; // entry moves forward if leading accents were stripped
-        }
-    } else {
-        nlpType = PrepareMultitoken(parts, rebuilt, TOKEN_MAX_BUF, entry, emitLen);
-        Y_ASSERT(emitLen <= TOKEN_MAX_LEN); // a prepared multitoken never exceeds TOKEN_MAX_LEN
-        emitText = rebuilt;
-    }
-
-    TWideToken wide; // filled in by hand: the TWideToken(entry, leng) constructor must not be used here
-    wide.Token = emitText;
-    wide.Leng = emitLen;
-    wide.SubTokens.swap(parts);
-    Y_ASSERT(CheckMultitoken(wide));
-
-    const size_t totalLen = origLen + GetExtraLen(entry, origLen);
-
-    Y_ASSERT(wide.SubTokens.size());
-    if (!BackwardCompatible) {
-        SentBreakFilter.OnToken(wide, nlpType);
-        TokenHandler.OnToken(wide, totalLen, nlpType);
-        return;
-    }
-    PassBackwardCompatibleToken(wide, nlpType, totalLen);
-}
-
-//! Builds a TWideToken from the parser token located at `entry` and passes it on.
-//! Before that a one-character prefix of the first subtoken is either emitted as a
-//! separate NLP_WORD entry, dropped, or kept, depending on KeepAffixes and on whether
-//! the previous token already consumed that character as its suffix.
-void TVersionedNlpParser<3>::MakeMultitokenEntry(TParserToken& token, const wchar16* entry) {
-    size_t origLen = token.GetLength();
-
-    token.CorrectPositions();
-    Y_ASSERT(token.GetLength() == origLen); // the correction must not change the length
-    TTokenStructure parts;
-    token.SwapSubtokens(parts);
-
-    KeepedPotentialPrefix = nullptr;
-    const bool hasPrefix = !parts.empty() && parts[0].PrefixLen > 0;
-    if (hasPrefix) {
-        Y_ASSERT(parts[0].PrefixLen == 1);
-        // for input like x#y the '#' has already been tokenized as the suffix of x
-        if (!KeepAffixes && LastTokenSuffixLength == 0) {
-            MakeEntry(entry, 1, NLP_WORD);
-        }
-        if (!KeepAffixes || LastTokenSuffixLength != 0) {
-            // detach the prefix character and shift every subtoken one position to the left
-            parts[0].PrefixLen = 0;
-            ++entry;
-            --origLen;
-            for (size_t k = 0; k < parts.size(); ++k) {
-                --parts[k].Pos;
-            }
-        }
-        LastTokenSuffixLength = 0;
-    }
-
-    const wchar16* emitText = entry;
-    size_t emitLen = origLen;
-    NLP_TYPE nlpType = token.GetNlpType();
-    wchar16 rebuilt[TOKEN_MAX_BUF];
-    if (!token.HasHyphen()) {
-        // only NLP_WORD, NLP_INTEGER, NLP_FLOAT and NLP_MARK can reach the cutting code
-        Y_ASSERT(nlpType == NLP_WORD || nlpType == NLP_INTEGER || nlpType == NLP_FLOAT || nlpType == NLP_MARK);
-        if (emitLen > TOKEN_MAX_LEN) {
-            // entry still points into the original text here
-            CutTooLongMultitoken(parts, entry, emitLen, origLen, nlpType);
-            emitText = entry; // entry moves forward if leading accents were stripped
-        }
-    } else {
-        nlpType = PrepareMultitoken(parts, rebuilt, TOKEN_MAX_BUF, entry, emitLen);
-        Y_ASSERT(emitLen <= TOKEN_MAX_LEN); // a prepared multitoken never exceeds TOKEN_MAX_LEN
-        emitText = rebuilt;
-    }
-
-    TWideToken wide; // filled in by hand: the TWideToken(entry, leng) constructor must not be used here
-    wide.Token = emitText;
-    wide.Leng = emitLen;
-    wide.SubTokens.swap(parts);
-    Y_ASSERT(CheckMultitoken(wide));
-
-    const size_t totalLen = origLen + GetExtraLen(entry, origLen);
-
-    Y_ASSERT(wide.SubTokens.size());
-    if (!BackwardCompatible) {
-        SentBreakFilter.OnToken(wide, nlpType);
-        TokenHandler.OnToken(wide, totalLen, nlpType);
-        return;
-    }
-    PassBackwardCompatibleToken(wide, nlpType, totalLen);
-}
-
-//! Sums the extra lengths recorded in ExtraLen for positions that fall into the
-//! half-open interval (offset of entry, offset of entry + entryLen] of the parsed text.
-//! Consumed records are skipped for good: ExtraLenIndex only moves forward.
-size_t TNlpParser::GetExtraLen(const wchar16* entry, size_t entryLen) {
-    const size_t first = entry - OrigText;
-    const size_t last = first + entryLen;
-
-    ui32 sum = 0;
-    for (; ExtraLenIndex < ExtraLen.size(); ++ExtraLenIndex) {
-        const auto& record = ExtraLen[ExtraLenIndex];
-        // stop at the first record that does not belong to this entry
-        if (record.first <= first || record.first > last) {
-            break;
-        }
-        sum += record.second;
-    }
-    return sum;
-}
-
-//! Shortens a token that is longer than TOKEN_MAX_LEN.
-//! For NLP_WORD / NLP_INTEGER / NLP_MARK: a leading run of accents is emitted as a
-//! separate NLP_MISCTEXT token (entry and origleng are advanced past it), the subtokens
-//! are adjusted to the limit, and an NLP_MARK whose remaining subtokens all have the
-//! same type is re-typed to NLP_WORD or NLP_INTEGER.
-//! For any other type the length is simply clamped to TOKEN_MAX_LEN.
-void TNlpParser::CutTooLongMultitoken(TTokenStructure& subtokens, const wchar16*& entry, size_t& leng, size_t& origleng, NLP_TYPE& type) {
-    Y_ASSERT(leng > TOKEN_MAX_LEN);
-
-    if (type != NLP_WORD && type != NLP_INTEGER && type != NLP_MARK) {
-        // the case when the point of an NLP_FLOAT lies beyond TOKEN_MAX_LEN (so the token
-        // effectively becomes an integer) is not processed specially
-        Y_ASSERT(subtokens.empty());
-        leng = TOKEN_MAX_LEN;
-        return;
-    }
-
-    // the token may start with more accent symbols than TOKEN_MAX_LEN allows
-    // TODO: remove accents before tokenization
-    const ptrdiff_t accentCount = FindNonAccent(entry, leng) - entry;
-    Y_ASSERT(accentCount >= 0);
-
-    // NLP_WORD holds words only, NLP_INTEGER - integers only, NLP_MARK - words and integers
-    Y_ASSERT(!subtokens.empty());
-
-    if (accentCount > 0) {
-        const TWideToken accentsOnly(entry, accentCount); // the leading part made of accents only
-        TokenHandler.OnToken(accentsOnly, accentCount, NLP_MISCTEXT);
-        origleng -= accentCount;
-        entry += accentCount;
-        leng = AdjustSubtokens(subtokens, accentCount, TOKEN_MAX_LEN);
-    } else {
-        leng = AdjustSubtokens(subtokens, TOKEN_MAX_LEN);
-    }
-
-    // correct the NLP type: a cut NLP_MARK may have become a pure word or a pure integer
-    if (type == NLP_MARK) {
-        Y_ASSERT(!subtokens.empty());
-        const ETokenType firstType = subtokens[0].Type;
-        Y_ASSERT(firstType == TOKEN_WORD || firstType == TOKEN_NUMBER);
-        bool uniform = true;
-        for (size_t i = 1; uniform && i < subtokens.size(); ++i) {
-            uniform = (subtokens[i].Type == firstType);
-        }
-        if (uniform && firstType != TOKEN_MARK) {
-            type = (firstType == TOKEN_WORD ? NLP_WORD : NLP_INTEGER);
-        }
-    }
-}
-
-//! Emits a multitoken in the backward compatible form: prefixes and delimiters become
-//! separate NLP_MISCTEXT tokens. The difference between totalLen and the multitoken
-//! length is attributed to the last emitted (sub)token.
-void TNlpParser::PassBackwardCompatibleToken(const TWideToken& multitoken, NLP_TYPE type, size_t totalLen) {
-    if (multitoken.SubTokens.size() != 1) {
-        TWideToken piece;
-        TTokenIterator iter(multitoken);
-        iter.GetPrefix(piece); // prefix of the first token
-        if (piece.Leng) {
-            SentBreakFilter.OnToken(piece, NLP_MISCTEXT);
-            TokenHandler.OnToken(piece, piece.Leng, NLP_MISCTEXT);
-        }
-        while (iter.Next()) {
-            iter.GetMultitoken(piece);
-            SentBreakFilter.OnToken(piece, iter.GetNlpType());
-            if (iter.Finished()) {
-                // the last part also carries the extra length of the whole multitoken
-                TokenHandler.OnToken(piece, piece.Leng + (totalLen - multitoken.Leng), iter.GetNlpType());
-            } else {
-                TokenHandler.OnToken(piece, piece.Leng, iter.GetNlpType());
-            }
-            iter.GetDelimiter(piece);
-            if (piece.Leng) {
-                SentBreakFilter.OnToken(piece, NLP_MISCTEXT);
-                TokenHandler.OnToken(piece, piece.Leng, NLP_MISCTEXT);
-            }
-        }
-        return;
-    }
-
-    // exactly one subtoken
-    const TCharSpan& only = multitoken.SubTokens[0];
-    TWideToken piece;
-    if (only.PrefixLen) {
-        // the prefix goes out first as a separate piece of misc text
-        piece.Token = multitoken.Token;
-        piece.Leng = only.PrefixLen;
-        SentBreakFilter.OnToken(piece, NLP_MISCTEXT);
-        TokenHandler.OnToken(piece, piece.Leng, NLP_MISCTEXT);
-    }
-
-    const ui16 prefixLen = only.PrefixLen;
-    piece.Token = multitoken.Token + prefixLen;
-    piece.Leng = multitoken.Leng - prefixLen;
-    piece.SubTokens.push_back(only);
-    piece.SubTokens[0].PrefixLen = 0;
-    piece.SubTokens[0].Pos -= prefixLen;
-
-    SentBreakFilter.OnToken(piece, type);
-    TokenHandler.OnToken(piece, piece.Leng + (totalLen - multitoken.Leng), type);
-    // the suffix after a standalone number is kept, for example: 18+ -> [18+]
-    // if a number with a suffix is part of a multitoken the suffix is removed: 16+/18+ -> [16]+/[18]+
-    // see also tokenizer_ut.cpp
-}
-
-//! Reports the text [entry, entry + entryLen) as a single token of the given type.
-//! The length passed to the token handler additionally includes the extra length
-//! returned by GetExtraLen() for this range.
-void TNlpParser::MakeEntry(const wchar16* entry, size_t entryLen, NLP_TYPE type) {
-    const size_t totalLen = entryLen + GetExtraLen(entry, entryLen);
-
-    TWideToken plain; // filled in by hand: the TWideToken(entry, leng) constructor must not be used here
-    plain.Leng = entryLen;
-    plain.Token = entry;
-
-    SentBreakFilter.OnToken(plain, type);
-    TokenHandler.OnToken(plain, totalLen, type);
-}
-
-//! Version 3 entry emitter. If a potential prefix character was kept earlier, it is
-//! emitted first as a one-character token and the entry is shortened by one character.
-//! NLP_WORD entries are built with the TWideToken(entry, len) constructor and reported
-//! with entryLen as is; all other types are delegated to TNlpParser::MakeEntry().
-void TVersionedNlpParser<3>::MakeEntry(const wchar16* entry, size_t entryLen, NLP_TYPE type) {
-    if (KeepedPotentialPrefix != nullptr) {
-        TWideToken prefixToken(KeepedPotentialPrefix, 1);
-        SentBreakFilter.OnToken(prefixToken, type);
-        // NOTE(review): the full entryLen (not 1) is reported for the one-character prefix - confirm this is intended
-        TokenHandler.OnToken(prefixToken, entryLen, type);
-        KeepedPotentialPrefix = nullptr;
-        ++entry;
-        --entryLen;
-        if (entryLen == 0) {
-            return;
-        }
-    }
-
-    if (type != NLP_WORD) {
-        TNlpParser::MakeEntry(entry, entryLen, type);
-        return;
-    }
-
-    TWideToken word(entry, entryLen);
-    SentBreakFilter.OnToken(word, type);
-    TokenHandler.OnToken(word, entryLen, type);
-}
-
-namespace {
-    //! Tells whether the char class is one of the whitespace classes:
-    //! line feed, tab, carriage return or plain whitespace.
-    bool IsWhitespaceClass(const unsigned char c) {
-        if (c == TNlpParser::CC_LINE_FEED) {
-            return true;
-        }
-        if (c == TNlpParser::CC_TAB) {
-            return true;
-        }
-        if (c == TNlpParser::CC_CARRIAGE_RETURN) {
-            return true;
-        }
-        return c == TNlpParser::CC_WHITESPACE;
-    }
-
-    //! Predicate for std::find_if: true for a character whose class is not a whitespace class.
-    bool IsNotSpace(wchar16 c) {
-        const unsigned char charClass = TNlpParser::GetCharClass(c);
-        return !IsWhitespaceClass(charClass);
-    }
-}
-
-int TVersionedNlpParser<3>::MakeMisctextEntry(const unsigned char* entry, size_t len, size_t availableAfter) { // splits a misc-text run into NLP_MISCTEXT pieces and special tokens emitted one character at a time as NLP_WORD
- const wchar16* entry16 = GetOrigText(entry); // presumably the position in the parsed text matching `entry` - confirm against GetOrigText
- size_t skipFirst = LastTokenSuffixLength; // this many leading characters are dropped without being emitted
- LastTokenSuffixLength = 0;
- // a trailing '#', '@' or '$' may turn out to be the prefix of the next token, so it is held back (only when the run is longer than one character)
- bool leftLast = (len > 1) && (entry16[len - 1] == '#' || entry16[len - 1] == '@' || entry16[len - 1] == '$');
- while (len > 0) {
- const wchar16* misctextEnd = std::find_if(entry16, entry16 + len, IsNotSpace); // first character whose class is not a whitespace class
- size_t interestingLength = 0; // length reported by GetSpecialTokenLength at misctextEnd, 0 if none was found
- while (misctextEnd < entry16 + len) { // advance over non-space characters until a special token starts
- interestingLength = GetSpecialTokenLength(misctextEnd, len - (misctextEnd - entry16) + availableAfter); // the lookup may look past this run, into the availableAfter characters
- if (interestingLength != 0) {
- break;
- }
- misctextEnd = std::find_if(misctextEnd + 1, entry16 + len, IsNotSpace);
- }
- if (misctextEnd > entry16) { // there is plain misc text in front of the special token (or up to the end of the run)
- while (skipFirst && misctextEnd > entry16) { // drop the characters counted by skipFirst
- ++entry16;
- --len;
- --skipFirst;
- }
- if (leftLast && misctextEnd == len + entry16) { // the misc text reaches the end of the run: emit everything except the held-back last character
- if (misctextEnd - entry16 > 1) {
- MakeEntry(entry16, misctextEnd - entry16 - 1, NLP_MISCTEXT);
- }
- return -1; // -1: the last character of the run was not emitted
- }
- if (misctextEnd > entry16) {
- MakeEntry(entry16, misctextEnd - entry16, NLP_MISCTEXT);
- }
- len -= misctextEnd - entry16;
- entry16 = misctextEnd;
- }
- Y_ASSERT(misctextEnd == entry16);
- if (interestingLength > 0) { // a special token starts at the current position
- while (skipFirst && interestingLength && len) { // drop the characters still counted by skipFirst
- ++entry16;
- --interestingLength;
- --len;
- --skipFirst;
- }
- if (KeepAffixes && leftLast && len == interestingLength) { // the special token ends the run with a potential prefix character
- for (size_t i = 0; i + 1 < interestingLength; ++i) { // every character except the last one goes out as its own NLP_WORD
- MakeEntry(entry16 + i, 1, NLP_WORD);
- }
- KeepedPotentialPrefix = entry16 + interestingLength - 1; // remembered for the next MakeEntry / MakeMultitokenEntry call
- return -1; // -1: the last character of the run was not emitted
- }
- for (size_t i = 0; i < interestingLength; ++i) { // each character of the special token is a separate NLP_WORD entry
- MakeEntry(entry16 + i, 1, NLP_WORD);
- }
- if (interestingLength > len) { // the special token extends past this run
- return interestingLength - len; // number of characters consumed beyond the run
- }
- len -= interestingLength;
- entry16 += interestingLength;
- }
- }
- return 0; // the whole run was emitted and nothing beyond it was consumed
-}
-
-//! Emits the part of [entry, entry + leng) that precedes the sentence break position
-//! as one token whose type is chosen by the sentence break filter.
-//! Returns the number of emitted characters; the recorded break position is cleared.
-size_t TNlpParser::MakeSentenceBreak(const wchar16* entry, size_t leng) {
-    if (SentenceBreak == nullptr) {
-        // no position was recorded: break right before the last symbol
-        // (original note: "last symbol is ytitle")
-        SentenceBreak = entry + leng - 1;
-    }
-    const size_t end = SentenceBreak - entry;
-    assert(0 < end && end <= leng);
-
-    const NLP_TYPE breakType = SentBreakFilter.OnSentBreak(entry, leng);
-    MakeEntry(entry, end, breakType);
-    SentenceBreak = nullptr;
-
-    // the caller adjusts its position by this value, which excludes the start of the next sentence
-    return end;
-}
-
-//! Walks over a run of UTF-16 surrogates. Each well-formed lead + tail pair is emitted
-//! as a two-unit ideograph token; every unpaired lead or tail is replaced with a
-//! BROKEN_RUNE emitted as NLP_MISCTEXT.
-void TNlpParser::ProcessSurrogatePairs(const wchar16* ts, const wchar16* te) {
-    const wchar16 brokenRune = BROKEN_RUNE;
-    const wchar16* pendingLead = nullptr; // lead surrogate still waiting for its tail
-
-    for (const wchar16* cur = ts; cur != te; ++cur) {
-        if (IsW16SurrogateLead(*cur)) {
-            // the previous lead never got a tail
-            if (pendingLead)
-                MakeEntry(&brokenRune, 1, NLP_MISCTEXT);
-            pendingLead = cur;
-            continue;
-        }
-        if (!IsW16SurrogateTail(*cur)) {
-            Y_ASSERT(!"invalid character");
-            continue;
-        }
-        if (pendingLead) {
-            Base.AddIdeograph(2);
-            Y_ASSERT(Base.GetTokenCount() == 1);
-            MakeMultitokenEntry(Base.GetToken(0), pendingLead);
-            Base.ResetTokens();
-        } else {
-            // a tail without a lead
-            MakeEntry(&brokenRune, 1, NLP_MISCTEXT);
-        }
-        pendingLead = nullptr;
-    }
-
-    // a lead at the very end of the run
-    if (pendingLead)
-        MakeEntry(&brokenRune, 1, NLP_MISCTEXT);
-}
-
-//! Emits every character of [ts, te) as a separate one-character ideograph token.
-void TNlpParser::ProcessIdeographs(const wchar16* ts, const wchar16* te) {
-    const wchar16* cur = ts;
-    while (cur != te) {
-        Base.AddIdeograph(1);
-        Y_ASSERT(Base.GetTokenCount() == 1);
-        MakeMultitokenEntry(Base.GetToken(0), cur);
-        Base.ResetTokens();
-        ++cur;
-    }
-}
-
-void TNlpParser::Execute(const wchar16* text, size_t len, const wchar16** textStart) { // parses [text, text + len); with UrlDecode set, runs of %XX escapes that form valid UTF-8 are decoded into Buffer first
- if (!len) // empty input: nothing is parsed and *textStart is left untouched
- return;
- const wchar16* p = text;
- const wchar16* e = p + len;
- wchar16* data = nullptr; // start of the decoded copy; stays null until the first %XX escape is met
- wchar16* dest = nullptr; // write position inside the decoded copy
- ExtraLen.clear(); // (position in the decoded text, source units saved by decoding) records, read back by GetExtraLen()
- ExtraLenIndex = 0;
-
- while (p != e) {
- if (UrlDecode && *p == PERCENT_CHAR && (p + 3) <= e && IsHexdigit(p[1]) && IsHexdigit(p[2])) { // a complete %XX escape starts here
- if (!dest) { // first escape: allocate the copy lazily and bring over everything scanned so far
- Buffer = TTempArray<wchar16>(len + 1); // len + 1 elements: the text plus the terminating 0 written after the loop
- data = Buffer.Data();
- dest = data;
- const size_t n = p - text;
- std::char_traits<wchar16>::copy(dest, text, n);
- dest += n;
- }
-
- const wchar16* start = p; // start of the escape run, kept so the raw text can be copied if the UTF8 is bad
- TTempBuf buf(e - p); // scratch for the decoded UTF8 bytes
- char* const utf8 = buf.Data();
- size_t i = 0; // number of bytes decoded from the run
- while (p != e && *p == PERCENT_CHAR && (p + 3) <= e && IsHexdigit(p[1]) && IsHexdigit(p[2])) { // gather one unbroken run of %XX escapes
- const char c = (HexToChar(char(p[1])) << 4) | HexToChar(char(p[2]));
- utf8[i++] = ((unsigned char)c < 0x20 ? ' ' : c); // control bytes (below 0x20) are replaced with ' '
- p += 3;
- }
-
- bool decoded = false;
- // a single escaped byte is never decoded: at least 2 UTF8 bytes are required
- if (i > 1) {
- decoded = true;
- Y_VERIFY(size_t(p - start) == 3 * i);
- size_t written = 0; // UTF-16 units produced so far for this run
- const size_t extraLenRollback = ExtraLen.size(); // records added for this run are dropped again if decoding fails
- for (size_t j = 0; j < i;) { // decode one UTF8 character per iteration
- size_t stepRead = 0;
- if (RECODE_OK != GetUTF8CharLen(stepRead, reinterpret_cast<const unsigned char*>(utf8) + j, reinterpret_cast<const unsigned char*>(utf8) + i)) {
- decoded = false;
- break;
- }
- Y_VERIFY(stepRead && j + stepRead <= i);
- size_t stepWritten = 0;
- if (!UTF8ToWide(utf8 + j, stepRead, dest + written, stepWritten)) {
- decoded = false;
- break;
- }
- written += stepWritten;
- ExtraLen.push_back(std::make_pair<ui32>(dest + written - data, 3 * stepRead - stepWritten)); // position just past the decoded character and how many source units it replaced beyond its own length
- j += stepRead;
- }
- if (decoded) {
- dest += written;
- } else {
- ExtraLen.resize(extraLenRollback);
- }
- }
- if (!decoded) {
- // the UTF8 is bad or the run is too short (for example: %action-%61%62%63)
- // copy the raw escape text as is (this overwrites anything partially decoded at dest):
- size_t n = p - start;
- std::char_traits<wchar16>::copy(dest, start, n);
- dest += n;
- }
- } else if (dest) // ordinary character while a decoded copy exists: copy it over
- *dest++ = *p++;
- else // ordinary character and no copy needed so far: just advance
- ++p;
- }
-
- if (dest) { // something was copied: parse the decoded buffer instead of the input
- if (textStart) {
- *textStart = data; // tell the caller which text the emitted tokens point into
- }
- *dest = 0; // terminate the decoded copy, just in case
- const size_t newLen = dest - data;
- TTempBuf convbuf(newLen + 1); // one char class per character plus the terminating 0
- unsigned char* conv = (unsigned char*)convbuf.Data();
- ConvertTextToCharClasses(data, newLen, conv);
- OrigText = data;
- ExecuteImpl(conv, newLen);
- } else { // no escapes were found: parse the input in place
- if (textStart) {
- *textStart = text;
- }
- TTempBuf convbuf(len + 1); // one char class per character plus the terminating 0
- unsigned char* conv = (unsigned char*)convbuf.Data();
- ConvertTextToCharClasses(text, len, conv);
- OrigText = text;
- ExecuteImpl(conv, len);
- }
-}
-
-//! Writes the char class of every character of [text, text + len) into `buffer`
-//! followed by a terminating 0, so `buffer` must have room for len + 1 bytes.
-void TNlpParser::ConvertTextToCharClasses(const wchar16* text, size_t len, unsigned char* buffer) {
-    for (size_t i = 0; i != len; ++i) {
-        const wchar16 ch = text[i];
-
-        // TODO: it would be better to copy the char classes table into a new one in the constructor
-        // and to change the required char classes there instead of checking conditions here
-        // (semicolon and whitespaces)
-        unsigned char charClass;
-        if (ch == ';') {
-            charClass = (unsigned char)(SemicolonBreaksSentence ? CC_TERM_PUNCT : CC_MISC_TEXT);
-        } else {
-            charClass = CharClasses[ch];
-        }
-
-        // without SpacePreserve every whitespace is replaced with a space because browsers
-        // normalize whitespaces: "a \t\n\r b" -> "a b" unless the <pre></pre> tag is used;
-        // this fixes incorrect hyphenations outside <pre>: "HTML-\nfile" is not "HTMLfile",
-        // a browser shows this text as "HTML- file"
-        if (!SpacePreserve && IsWhitespaceClass(charClass)) {
-            charClass = (unsigned char)CC_SPACE;
-        }
-
-        buffer[i] = charClass;
-    }
-    buffer[len] = 0;
-}