path: root/library/cpp/token/charfilter.h
author     qrort <qrort@yandex-team.com>  2022-11-30 23:47:12 +0300
committer  qrort <qrort@yandex-team.com>  2022-11-30 23:47:12 +0300
commit     22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree       bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/token/charfilter.h
parent     332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download   ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/token/charfilter.h')
-rw-r--r--  library/cpp/token/charfilter.h  224
1 file changed, 224 insertions, 0 deletions
diff --git a/library/cpp/token/charfilter.h b/library/cpp/token/charfilter.h
new file mode 100644
index 0000000000..6e95b05aea
--- /dev/null
+++ b/library/cpp/token/charfilter.h
@@ -0,0 +1,224 @@
+#pragma once
+
+#include <util/charset/wide.h>
+#include <util/generic/singleton.h>
+
+#include "token_structure.h"
+
+//! represents a set of accent characters
+//! @note this class is intended to be a singleton because it holds a fairly large array (65 KB)
+class TAccentTable: private TNonCopyable {
+public:
+ TAccentTable();
+
+ bool operator[](wchar16 c) const {
+ return Data[c];
+ }
+
+private:
+ enum { DATA_SIZE = 0xFFFF };
+ unsigned char Data[DATA_SIZE];
+};
+
+//! represents an accessor to the accent table; unlike the table itself, this class can be copied
+class TAccents {
+public:
+ TAccents()
+ : Table(*HugeSingleton<TAccentTable>())
+ {
+ }
+
+ //! returns @c true if the character is an accent symbol
+ bool Check(wchar16 c) const {
+ return Table[c];
+ }
+
+private:
+ const TAccentTable& Table;
+};
+
+//! removes characters from @c TWideToken using a filter
+//! @note the checker class must have a default constructor and a @c Check member function
+template <typename TChecker>
+class TCharFilter {
+ TWideToken Token;
+ TCharTemp Buffer;
+ TChecker Checker;
+
+private:
+ //! copies already verified data - characters and subtokens
+ wchar16* CopyData(const TWideToken& token, size_t n, size_t t, wchar16* const data) {
+ std::char_traits<wchar16>::copy(data, token.Token, n); // without current character
+
+ Token.SubTokens.clear();
+ for (size_t i = 0; i < t; ++i) {
+ Token.SubTokens.push_back(token.SubTokens[i]);
+ }
+
+ return data + n;
+ }
+
+public:
+ explicit TCharFilter(size_t bufSize)
+ : Buffer(bufSize)
+ {
+ }
+
+ //! removes accent characters from the token
+ //! @return the source token if it contains no accent characters,
+ //! otherwise the internal token with the accent characters removed
+ const TWideToken& Filter(const TWideToken& token) {
+ if (token.SubTokens.empty())
+ return FilterTokenNoSubtokens(token);
+ else
+ return FilterTokenWithSubtokens(token);
+ }
+
+ static bool HasChars(const TWideToken& token) {
+ TChecker checker;
+ const TTokenStructure& subtokens = token.SubTokens;
+ const wchar16* const s = token.Token; // source character sequence
+ size_t i = 0; // the index of the current source character
+ size_t t = 0; // the index of the next source subtoken
+
+ while (i < token.Leng) {
+ if (t < subtokens.size()) {
+ if (i >= subtokens[t].Pos && i < subtokens[t].EndPos()) {
+ // inside a token
+ if (checker.Check(s[i])) {
+ return true;
+ }
+ }
+
+ ++i;
+
+ if (i >= subtokens[t].EndPos()) {
+ ++t;
+ }
+ } else {
+ break;
+ }
+ }
+
+ return false;
+ }
+
+private:
+ const TWideToken& FilterTokenWithSubtokens(const TWideToken& token) {
+ Y_ASSERT(!token.SubTokens.empty());
+ Y_ASSERT(token.SubTokens.back().EndPos() <= token.Leng);
+
+ const TTokenStructure& subtokens = token.SubTokens;
+ const wchar16* const s = token.Token; // source character sequence
+ size_t i = 0; // the index of the current source character
+ size_t t = 0; // the index of the next source subtoken
+
+ while (i < token.Leng) {
+ if (t < subtokens.size()) {
+ if (i >= subtokens[t].Pos && i < subtokens[t].EndPos()) {
+ // inside a token
+ if (Checker.Check(s[i]))
+ return FilterTokenWithSubtokens(token, s, i, t, Buffer.Data());
+ }
+
+ ++i;
+
+ if (i >= subtokens[t].EndPos()) {
+ ++t;
+ }
+ } else {
+ break;
+ }
+ }
+
+ return token;
+ }
+
+ const TWideToken& FilterTokenWithSubtokens(
+ const TWideToken& token, const wchar16* s, size_t i, size_t t, wchar16* const buffer) {
+ Y_ASSERT(i < token.Leng && t < token.SubTokens.size() && s >= token.Token);
+
+ const TTokenStructure& subtokens = token.SubTokens;
+ wchar16* d = CopyData(token, i, t, buffer); // destination character
+ TCharSpan span = subtokens[t];
+
+ while (i < token.Leng) {
+ if (t < subtokens.size()) {
+ if (i >= subtokens[t].Pos && i < subtokens[t].EndPos()) {
+ // inside a token
+ if (Checker.Check(s[i])) {
+ Y_ASSERT(span.Len);
+ --span.Len;
+ } else {
+ *d++ = s[i];
+ }
+ } else {
+ // outside of tokens
+ *d++ = s[i];
+ }
+
+ ++i;
+
+ if (i >= subtokens[t].EndPos()) {
+ ++t;
+
+ if (span.Len)
+ Token.SubTokens.push_back(span);
+
+ if (t < subtokens.size()) {
+ const size_t diff = i - (d - buffer);
+ Y_ASSERT(subtokens[t].Pos >= diff);
+ span.Pos = subtokens[t].Pos - diff;
+ span.Len = subtokens[t].Len;
+ }
+ }
+ } else {
+ // copy the remainder of characters
+ const size_t n = token.Leng - i;
+ std::char_traits<wchar16>::copy(d, &s[i], n);
+ d += n;
+ break;
+ }
+ }
+
+ Token.Token = buffer;
+ Token.Leng = d - buffer;
+ Y_ASSERT(!Token.SubTokens.size() || (Token.SubTokens.size() && Token.Leng >= Token.SubTokens.back().EndPos()));
+ return Token;
+ }
+
+ const TWideToken& FilterTokenNoSubtokens(const TWideToken& token) {
+ Y_ASSERT(token.SubTokens.empty());
+ const wchar16* s = token.Token;
+ const wchar16* const e = s + token.Leng;
+
+ for (; s != e; ++s) {
+ if (Checker.Check(*s))
+ return FilterTokenNoSubtokens(token.Token, s, e, Buffer.Data());
+ }
+
+ return token;
+ }
+
+ const TWideToken& FilterTokenNoSubtokens(
+ const wchar16* const token, const wchar16* s, const wchar16* const e, wchar16* const buffer) {
+ const size_t n = s - token;
+ std::char_traits<wchar16>::copy(buffer, token, n);
+ wchar16* d = buffer + n;
+
+ for (; s != e; ++s) {
+ if (!Checker.Check(*s))
+ *d++ = *s;
+ }
+
+ Token.Token = buffer;
+ Token.Leng = d - buffer;
+ Y_ASSERT(Token.Leng);
+ return Token;
+ }
+};
+
+const wchar32* LemmerDecomposition(wchar32 ch, bool advancedGermanUmlauts = true, bool extTable = false);
+size_t NormalizeUnicode(const wchar16* word, size_t length, wchar16* converted, size_t bufLen, bool advancedGermanUmlauts = true, bool extTable = false);
+TUtf16String NormalizeUnicode(const TUtf16String& w, bool advancedGermanUmlauts = true, bool extTable = false);
+TUtf16String NormalizeUnicode(const TWtringBuf& wbuf, bool advancedGermanUmlauts = true, bool extTable = false);
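
A minimal usage sketch of the interfaces declared above (not part of the commit): it strips combining accents from a token with TCharFilter<TAccents> and normalizes the same word with NormalizeUnicode. The helpers UTF8ToWide, WideToUTF8 and Cout are assumed to come from Arcadia's util library (util/charset/wide.h, util/stream/output.h), the TWideToken fields Token/Leng follow their use in this header, and the expected output assumes the accent table marks Unicode combining characters such as U+0301.

#include <library/cpp/token/charfilter.h>

#include <util/charset/wide.h>
#include <util/stream/output.h>

int main() {
    // "résumé" spelled with combining acute accents (U+0301) after each 'e'
    const TUtf16String accented = UTF8ToWide("re\xCC\x81sume\xCC\x81");

    // Full normalization: accent removal plus decomposition rules.
    const TUtf16String normalized = NormalizeUnicode(accented);
    Cout << WideToUTF8(normalized) << Endl; // expected: "resume"

    // Accent filtering of a single token; no subtokens are set here.
    TWideToken token;
    token.Token = accented.data();
    token.Leng = accented.size();

    TCharFilter<TAccents> filter(token.Leng); // buffer sized for the source token
    const TWideToken& filtered = filter.Filter(token);
    Cout << WideToUTF8(filtered.Token, filtered.Leng) << Endl; // expected: "resume"

    return 0;
}

Note that TCharFilter::HasChars() walks only the characters covered by SubTokens, so a token with an empty SubTokens list always reports false even when Filter() would remove characters from it.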