about summary refs log tree commit diff stats
path: root/library/cpp/token/decomposition.cpp
diff options
context:
space:
mode:
author qrort <qrort@yandex-team.com> 2022-12-02 11:31:25 +0300
committer qrort <qrort@yandex-team.com> 2022-12-02 11:31:25 +0300
commit b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch)
tree 2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/token/decomposition.cpp
parent 559174a9144de40d6bb3997ea4073c82289b4974 (diff)
download ydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/token/decomposition.cpp')
-rw-r--r-- library/cpp/token/decomposition.cpp 169
1 files changed, 0 insertions, 169 deletions
diff --git a/library/cpp/token/decomposition.cpp b/library/cpp/token/decomposition.cpp
deleted file mode 100644
index 7ccb51fc6a4..00000000000
--- a/library/cpp/token/decomposition.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-#include "charfilter.h"
-#include <library/cpp/unicode/normalization/normalization.h>
-#include <util/charset/unidata.h>
-
-namespace NUnicode { // forward declaration only; nothing is defined in this namespace block
- namespace NPrivate {
- const TDecompositionTable& LemmerDecomposition(); // table handed to NUnicode::NPrivate::Decomposition() by LemmerDecompositionInt when extTable is true; its definition is not in this file
- }
-}
-
-// Looks up the replacement sequence used for `ch` during lemmer normalization.
-//
-// Returns a pointer to a zero-terminated wchar32 sequence with static storage,
-// or nullptr when no replacement applies. Lookup order:
-//   1. if advancedGermanUmlauts is set, ä/ö/ü map to "ae"/"oe"/"ue";
-//   2. if extTable is set, the answer comes solely from the LemmerDecomposition() table;
-//   3. otherwise a handful of hard-coded exceptions, then NUnicode::Decomposition<true>.
-static const wchar32* LemmerDecompositionInt(wchar32 ch, bool advancedGermanUmlauts, bool extTable) {
-    if (advancedGermanUmlauts) {
-        static const wchar32 umlautA[] = {'a', 'e', 0};
-        static const wchar32 umlautO[] = {'o', 'e', 0};
-        static const wchar32 umlautU[] = {'u', 'e', 0};
-
-        if (ch == 0x00E4) { // ä
-            return umlautA;
-        }
-        if (ch == 0x00F6) { // ö
-            return umlautO;
-        }
-        if (ch == 0x00FC) { // ü
-            return umlautU;
-        }
-    }
-
-    if (extTable) {
-        return NUnicode::NPrivate::Decomposition(NUnicode::NPrivate::LemmerDecomposition(), ch);
-    }
-
-    static const wchar32 dottedCapitalI[] = {'I', 0};
-    static const wchar32 dotlessSmallI[] = {'i', 0};
-    static const wchar32 sharpS[] = {'s', 's', 0};
-
-    switch (ch) {
-        case 0x0130: // I with dot
-            return dottedCapitalI;
-        case 0x0131: // dotless i
-            return dotlessSmallI;
-        case 0x00DF: // ß
-            return sharpS;
-        // These letters are deliberately left undecomposed.
-        // (0x040E Ў and 0x045E ў are intentionally NOT in this list.)
-        case 0x0419: // Й
-        case 0x0439: // й
-        case 0x0407: // Ї
-        case 0x0457: // ї
-            return nullptr;
-        default:
-            return NUnicode::Decomposition<true>(ch);
-    }
-}
-
-// Public wrapper around LemmerDecompositionInt.
-// A one-element sequence equal to `ch` itself is reported as nullptr (no decomposition);
-// any other result is passed through unchanged.
-const wchar32* LemmerDecomposition(wchar32 ch, bool advancedGermanUmlauts, bool extTable) {
-    const wchar32* const seq = LemmerDecompositionInt(ch, advancedGermanUmlauts, extTable);
-    const bool mapsToItself = seq != nullptr && seq[0] == ch && seq[1] == 0;
-    return mapsToItself ? nullptr : seq;
-}
-
-// Number of wchar16 units accounted for `c`: 1 up to 0xFFFF, 2 above it.
-static size_t CharSize(wchar32 c) {
-    return c <= 0xFFFF ? 1 : 2;
-}
-
-// Appends the lower-cased form of `c` at `r` if it fits into the remaining `bufLen` units.
-//   - characters for which IsCombining(c) holds are skipped without touching r or bufLen;
-//   - if the lower-cased character needs more units than are left, nothing is written
-//     and bufLen is forced to 0;
-//   - otherwise WriteSymbol stores the character through `r` and bufLen shrinks by the
-//     count WriteSymbol returns.
-static void CheckAddChar(wchar16*& r, size_t& bufLen, wchar32 c) {
-    if (!IsCombining(c)) {
-        c = ToLower(c);
-        if (CharSize(c) <= bufLen) {
-            const size_t written = WriteSymbol(c, r);
-            bufLen -= written;
-        } else {
-            bufLen = 0;
-        }
-    }
-}
-
-// True when `c` has a replacement sequence (umlaut expansion disabled) in the table
-// selected by extTable, and that sequence is something other than `c` alone.
-bool IsDecomp(ui16 c, bool extTable) {
-    const wchar32* const seq = LemmerDecompositionInt(c, false, extTable);
-    if (seq == nullptr) {
-        return false;
-    }
-    // A single-element sequence identical to c is not a real decomposition.
-    return !(seq[0] == c && seq[1] == 0);
-}
-
-// True when `c` decomposes under either table: the default one is tried first,
-// the extended one only if the default yields nothing.
-bool IsDecomp(ui16 c) {
-    if (IsDecomp(c, false)) {
-        return true;
-    }
-    return IsDecomp(c, true);
-}
-
-const ui32 UI16_COUNT = 0x10000;
-
-// Precomputed table with one entry per 16-bit code unit.
-// An entry is 0 when the unit is a surrogate lead or tail, has a decomposition
-// (IsDecomp), or is combining; every other entry holds ::ToLower of the unit.
-// NormalizeUnicodeInt copies non-zero entries straight to the output and sends
-// zero entries down its per-symbol path.
-class TLower {
-public:
-    static const TLower DefaultTLower;
-
-    ui16 Lower[UI16_COUNT];
-
-    // Fills the whole table; runs once, when DefaultTLower is constructed.
-    TLower() {
-        for (ui32 code = 0; code < UI16_COUNT; ++code) {
-            const bool needsFullProcessing = IsW16SurrogateLead(code) || IsW16SurrogateTail(code) || IsDecomp(code) || IsCombining(code);
-            if (needsFullProcessing) {
-                Lower[code] = 0;
-            } else {
-                Lower[code] = ::ToLower(code);
-            }
-        }
-    }
-
-    // Plain table lookup.
-    inline ui16 ToLower(ui16 c) const noexcept {
-        return Lower[c];
-    }
-};
-
-const TLower TLower::DefaultTLower;
-
-// Normalizes `length` UTF-16 units starting at `word` into the buffer at `res`.
-//
-// Each unit is first looked up in TLower::DefaultTLower; a non-zero entry is copied
-// as is. Otherwise a full symbol is read, replaced by its LemmerDecompositionInt
-// sequence if there is one, and emitted through CheckAddChar (which lower-cases it
-// and skips combining characters).
-//
-// Parameters:
-//   word, length           - input text.
-//   res                    - in/out write cursor; on return points one past the last unit written.
-//   bufLen                 - number of wchar16 units available at `res`.
-//   advancedGermanUmlauts,
-//   extTable               - forwarded to LemmerDecompositionInt.
-//
-// Returns true only if the whole input was consumed AND no output character had to
-// be dropped for lack of space. Previously a buffer that ran out while the last input
-// symbol was being emitted still produced `true`, because `word` had already reached
-// `end`; callers that retry with a bigger buffer on `false` then kept a truncated result.
-// The characters written to `res` are the same as before in every case.
-bool NormalizeUnicodeInt(const wchar16* word, size_t length, wchar16*& res, size_t bufLen, bool advancedGermanUmlauts, bool extTable) {
-    // Mirrors the checks in CheckAddChar: true when CheckAddChar would discard `c`
-    // because the remaining space is too small (combining characters are never emitted,
-    // so they can never overflow).
-    const auto wouldBeDropped = [&bufLen](wchar32 c) {
-        return !IsCombining(c) && CharSize(ToLower(c)) > bufLen;
-    };
-
-    const wchar16* end = word + length;
-    while (word != end && bufLen > 0) {
-        // Fast path: the unit has a precomputed lower-case form.
-        wchar16 lw = TLower::DefaultTLower.ToLower(*word);
-        if (lw != 0) {
-            *(res++) = lw;
-            word++;
-            bufLen--;
-            continue;
-        }
-        wchar32 ch = ReadSymbolAndAdvance(word, end);
-        const wchar32* decomp = LemmerDecompositionInt(ch, advancedGermanUmlauts, extTable);
-        if (decomp != nullptr) {
-            for (; *decomp != 0; ++decomp) {
-                if (wouldBeDropped(*decomp))
-                    return false; // out of space in the middle of a decomposition
-                CheckAddChar(res, bufLen, *decomp);
-            }
-        } else {
-            if (wouldBeDropped(ch))
-                return false; // e.g. a two-unit symbol with a single unit left
-            CheckAddChar(res, bufLen, ch);
-        }
-    }
-    return word >= end;
-}
-
-// Normalizes `word` into the caller-supplied buffer `converted` of `bufLen` units
-// and returns how many units were written. The success flag of NormalizeUnicodeInt
-// is ignored, so output that does not fit is cut off without notice.
-size_t NormalizeUnicode(const wchar16* word, size_t length, wchar16* converted, size_t bufLen, bool advancedGermanUmlauts, bool extTable) {
-    wchar16* cursor = converted;
-    NormalizeUnicodeInt(word, length, cursor, bufLen, advancedGermanUmlauts, extTable);
-    return static_cast<size_t>(cursor - converted);
-}
-
-const ui32 MAX_DECOMPOSED_LEN = 18;
-
-// Normalizes `wbuf` into `ret`, using a working buffer of
-// wbuf.size() * mult + MAX_DECOMPOSED_LEN units.
-//
-// Returns true and leaves the normalized text in `ret` on success.
-// Returns false if NormalizeUnicodeInt reports that the buffer was too small;
-// `ret` is then left empty (in debug builds a warning is printed to stderr).
-//
-// The string is sized to the full working buffer BEFORE it is written to and
-// shrunk afterwards. The earlier code only called reserve() and then wrote through
-// begin(), i.e. past size(), relying on the final ReserveAndResize() to leave those
-// characters untouched.
-bool NormalizeUnicode(const TWtringBuf& wbuf, bool advancedGermanUmlauts, bool extTable, TUtf16String& ret, ui32 mult) {
-    const size_t buflen = wbuf.size() * mult + MAX_DECOMPOSED_LEN; // for 1 symbol with longest sequence
-    ret.ReserveAndResize(buflen);
-    wchar16* const converted = ret.begin();
-    wchar16* p = converted;
-    const bool ok = NormalizeUnicodeInt(wbuf.data(), wbuf.size(), p, buflen, advancedGermanUmlauts, extTable);
-    if (!ok) {
-#ifndef NDEBUG
-        fprintf(stderr, "[WARNING]\tOut of buffer %zu %u\n", wbuf.size(), (unsigned int)mult);
-#endif
-        // Do not hand a partially filled working buffer back to the caller.
-        ret.clear();
-        return false;
-    }
-    // Keep only the units actually produced.
-    ret.ReserveAndResize(p - converted);
-    return true;
-}
-
-// Returns the normalized form of `wbuf`.
-// A buffer of twice the input size is tried first, which is enough in most cases;
-// if that fails the call is repeated with MAX_DECOMPOSED_LEN units per input unit,
-// since one source character can produce no more than 18.
-// The result of the second attempt is not checked.
-TUtf16String NormalizeUnicode(const TWtringBuf& wbuf, bool advancedGermanUmlauts, bool extTable) {
-    TUtf16String ret;
-    const bool fitsInSmallBuffer = NormalizeUnicode(wbuf, advancedGermanUmlauts, extTable, ret, 2);
-    if (!fitsInSmallBuffer) {
-        NormalizeUnicode(wbuf, advancedGermanUmlauts, extTable, ret, MAX_DECOMPOSED_LEN);
-    }
-    return ret;
-}
-
-// Convenience overload: views `word` as a TWtringBuf and forwards to the overload above.
-TUtf16String NormalizeUnicode(const TUtf16String& word, bool advancedGermanUmlauts, bool extTable) {
-    const TWtringBuf view(word);
-    return NormalizeUnicode(view, advancedGermanUmlauts, extTable);
-}