aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/langs
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-12-02 11:31:25 +0300
committerqrort <qrort@yandex-team.com>2022-12-02 11:31:25 +0300
commitb1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch)
tree2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/langs
parent559174a9144de40d6bb3997ea4073c82289b4974 (diff)
downloadydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/langs')
-rw-r--r--library/cpp/langs/README.md8
-rw-r--r--library/cpp/langs/generated/uniscripts.cpp458
-rw-r--r--library/cpp/langs/langs.cpp330
-rw-r--r--library/cpp/langs/langs.h229
-rw-r--r--library/cpp/langs/scripts.cpp158
-rw-r--r--library/cpp/langs/scripts.h56
6 files changed, 0 insertions, 1239 deletions
diff --git a/library/cpp/langs/README.md b/library/cpp/langs/README.md
deleted file mode 100644
index 537ae31e1bb..00000000000
--- a/library/cpp/langs/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-Здесь описаны константы для [языков](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h) и [письменностей](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/scripts.h) (скриптов в терминах Unicode).
-
-В терминах этих констант языков работают [документная](https://a.yandex-team.ru/arc/trunk/arcadia/kernel/recshell/recshell.h) и [запросная](https://a.yandex-team.ru/arc/trunk/arcadia/dict/recognize/queryrec) распознавалки языка.
-
-Имеется [набор функций](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h?rev=r6909333#L142-214) для преобразования констант в двухбуквенный или трехбуквенный код и обратного получения константы по строке с учетом синонимов. Есть [функции](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h?rev=r6909333#L216-217) для определения письменности по языку и по символу.
-
-В списке констант представлены не все языки и письменности, а лишь те, которые представляли интерес для поиска Яндекса и машинного перевода.
-Имеется несколько псевдоязыков типа `LANG_UZB_CYR` или `LANG_KAZ_LAT`.
diff --git a/library/cpp/langs/generated/uniscripts.cpp b/library/cpp/langs/generated/uniscripts.cpp
deleted file mode 100644
index 59cc6a70c25..00000000000
--- a/library/cpp/langs/generated/uniscripts.cpp
+++ /dev/null
@@ -1,458 +0,0 @@
-// Generated from http://www.unicode.org/Public/UNIDATA/Scripts.txt
-// The best way to alter this file is to modify uniscripts.py
-#include <library/cpp/langs/langs.h>
-#include <util/system/yassert.h>
-
-#include <cstring>
-
-namespace NCharsetInternal {
- struct TScriptRange { // One run of consecutive code points that all belong to the same script.
- EScript Script; // Script written into the lookup table for every code point of the run.
- wchar32 Start; // First code point of the run.
- wchar32 End; // Last code point of the run, inclusive (single-code-point runs have Start == End).
- };
-
- const TScriptRange ScriptRanges[] = {
- { SCRIPT_ETHIOPIC, 0x1200, 0x1248 },
- { SCRIPT_ETHIOPIC, 0x124A, 0x124D },
- { SCRIPT_ETHIOPIC, 0x1250, 0x1256 },
- { SCRIPT_ETHIOPIC, 0x1258, 0x1258 },
- { SCRIPT_ETHIOPIC, 0x125A, 0x125D },
- { SCRIPT_ETHIOPIC, 0x1260, 0x1288 },
- { SCRIPT_ETHIOPIC, 0x128A, 0x128D },
- { SCRIPT_ETHIOPIC, 0x1290, 0x12B0 },
- { SCRIPT_ETHIOPIC, 0x12B2, 0x12B5 },
- { SCRIPT_ETHIOPIC, 0x12B8, 0x12BE },
- { SCRIPT_ETHIOPIC, 0x12C0, 0x12C0 },
- { SCRIPT_ETHIOPIC, 0x12C2, 0x12C5 },
- { SCRIPT_ETHIOPIC, 0x12C8, 0x12D6 },
- { SCRIPT_ETHIOPIC, 0x12D8, 0x1310 },
- { SCRIPT_ETHIOPIC, 0x1312, 0x1315 },
- { SCRIPT_ETHIOPIC, 0x1318, 0x135A },
- { SCRIPT_ETHIOPIC, 0x135D, 0x137C },
- { SCRIPT_ETHIOPIC, 0x1380, 0x1399 },
- { SCRIPT_ETHIOPIC, 0x2D80, 0x2D96 },
- { SCRIPT_ETHIOPIC, 0x2DA0, 0x2DA6 },
- { SCRIPT_ETHIOPIC, 0x2DA8, 0x2DAE },
- { SCRIPT_ETHIOPIC, 0x2DB0, 0x2DB6 },
- { SCRIPT_ETHIOPIC, 0x2DB8, 0x2DBE },
- { SCRIPT_ETHIOPIC, 0x2DC0, 0x2DC6 },
- { SCRIPT_ETHIOPIC, 0x2DC8, 0x2DCE },
- { SCRIPT_ETHIOPIC, 0x2DD0, 0x2DD6 },
- { SCRIPT_ETHIOPIC, 0x2DD8, 0x2DDE },
- { SCRIPT_ETHIOPIC, 0xAB01, 0xAB06 },
- { SCRIPT_ETHIOPIC, 0xAB09, 0xAB0E },
- { SCRIPT_ETHIOPIC, 0xAB11, 0xAB16 },
- { SCRIPT_ETHIOPIC, 0xAB20, 0xAB26 },
- { SCRIPT_ETHIOPIC, 0xAB28, 0xAB2E },
- { SCRIPT_ARABIC, 0x600, 0x604 },
- { SCRIPT_ARABIC, 0x606, 0x60B },
- { SCRIPT_ARABIC, 0x60D, 0x61A },
- { SCRIPT_ARABIC, 0x61E, 0x61E },
- { SCRIPT_ARABIC, 0x620, 0x63F },
- { SCRIPT_ARABIC, 0x641, 0x64A },
- { SCRIPT_ARABIC, 0x656, 0x66F },
- { SCRIPT_ARABIC, 0x671, 0x6DC },
- { SCRIPT_ARABIC, 0x6DE, 0x6FF },
- { SCRIPT_ARABIC, 0x750, 0x77F },
- { SCRIPT_ARABIC, 0x8A0, 0x8B4 },
- { SCRIPT_ARABIC, 0x8B6, 0x8BD },
- { SCRIPT_ARABIC, 0x8D4, 0x8E1 },
- { SCRIPT_ARABIC, 0x8E3, 0x8FF },
- { SCRIPT_ARABIC, 0xFB50, 0xFBC1 },
- { SCRIPT_ARABIC, 0xFBD3, 0xFD3D },
- { SCRIPT_ARABIC, 0xFD50, 0xFD8F },
- { SCRIPT_ARABIC, 0xFD92, 0xFDC7 },
- { SCRIPT_ARABIC, 0xFDF0, 0xFDFD },
- { SCRIPT_ARABIC, 0xFE70, 0xFE74 },
- { SCRIPT_ARABIC, 0xFE76, 0xFEFC },
- { SCRIPT_MONGOLIAN, 0x1800, 0x1801 },
- { SCRIPT_MONGOLIAN, 0x1804, 0x1804 },
- { SCRIPT_MONGOLIAN, 0x1806, 0x180E },
- { SCRIPT_MONGOLIAN, 0x1810, 0x1819 },
- { SCRIPT_MONGOLIAN, 0x1820, 0x1877 },
- { SCRIPT_MONGOLIAN, 0x1880, 0x18AA },
- { SCRIPT_TAMIL, 0xB82, 0xB83 },
- { SCRIPT_TAMIL, 0xB85, 0xB8A },
- { SCRIPT_TAMIL, 0xB8E, 0xB90 },
- { SCRIPT_TAMIL, 0xB92, 0xB95 },
- { SCRIPT_TAMIL, 0xB99, 0xB9A },
- { SCRIPT_TAMIL, 0xB9C, 0xB9C },
- { SCRIPT_TAMIL, 0xB9E, 0xB9F },
- { SCRIPT_TAMIL, 0xBA3, 0xBA4 },
- { SCRIPT_TAMIL, 0xBA8, 0xBAA },
- { SCRIPT_TAMIL, 0xBAE, 0xBB9 },
- { SCRIPT_TAMIL, 0xBBE, 0xBC2 },
- { SCRIPT_TAMIL, 0xBC6, 0xBC8 },
- { SCRIPT_TAMIL, 0xBCA, 0xBCD },
- { SCRIPT_TAMIL, 0xBD0, 0xBD0 },
- { SCRIPT_TAMIL, 0xBD7, 0xBD7 },
- { SCRIPT_TAMIL, 0xBE6, 0xBFA },
- { SCRIPT_GUJARATI, 0xA81, 0xA83 },
- { SCRIPT_GUJARATI, 0xA85, 0xA8D },
- { SCRIPT_GUJARATI, 0xA8F, 0xA91 },
- { SCRIPT_GUJARATI, 0xA93, 0xAA8 },
- { SCRIPT_GUJARATI, 0xAAA, 0xAB0 },
- { SCRIPT_GUJARATI, 0xAB2, 0xAB3 },
- { SCRIPT_GUJARATI, 0xAB5, 0xAB9 },
- { SCRIPT_GUJARATI, 0xABC, 0xAC5 },
- { SCRIPT_GUJARATI, 0xAC7, 0xAC9 },
- { SCRIPT_GUJARATI, 0xACB, 0xACD },
- { SCRIPT_GUJARATI, 0xAD0, 0xAD0 },
- { SCRIPT_GUJARATI, 0xAE0, 0xAE3 },
- { SCRIPT_GUJARATI, 0xAE6, 0xAF1 },
- { SCRIPT_GUJARATI, 0xAF9, 0xAF9 },
- { SCRIPT_MALAYALAM, 0xD01, 0xD03 },
- { SCRIPT_MALAYALAM, 0xD05, 0xD0C },
- { SCRIPT_MALAYALAM, 0xD0E, 0xD10 },
- { SCRIPT_MALAYALAM, 0xD12, 0xD3A },
- { SCRIPT_MALAYALAM, 0xD3D, 0xD44 },
- { SCRIPT_MALAYALAM, 0xD46, 0xD48 },
- { SCRIPT_MALAYALAM, 0xD4A, 0xD4F },
- { SCRIPT_MALAYALAM, 0xD54, 0xD63 },
- { SCRIPT_MALAYALAM, 0xD66, 0xD7F },
- { SCRIPT_ARMENIAN, 0x531, 0x556 },
- { SCRIPT_ARMENIAN, 0x559, 0x55F },
- { SCRIPT_ARMENIAN, 0x561, 0x587 },
- { SCRIPT_ARMENIAN, 0x58A, 0x58A },
- { SCRIPT_ARMENIAN, 0x58D, 0x58F },
- { SCRIPT_ARMENIAN, 0xFB13, 0xFB17 },
- { SCRIPT_HANGUL, 0x1100, 0x11FF },
- { SCRIPT_HANGUL, 0x302E, 0x302F },
- { SCRIPT_HANGUL, 0x3131, 0x318E },
- { SCRIPT_HANGUL, 0x3200, 0x321E },
- { SCRIPT_HANGUL, 0x3260, 0x327E },
- { SCRIPT_HANGUL, 0xA960, 0xA97C },
- { SCRIPT_HANGUL, 0xAC00, 0xD7A3 },
- { SCRIPT_HANGUL, 0xD7B0, 0xD7C6 },
- { SCRIPT_HANGUL, 0xD7CB, 0xD7FB },
- { SCRIPT_HANGUL, 0xFFA0, 0xFFBE },
- { SCRIPT_HANGUL, 0xFFC2, 0xFFC7 },
- { SCRIPT_HANGUL, 0xFFCA, 0xFFCF },
- { SCRIPT_HANGUL, 0xFFD2, 0xFFD7 },
- { SCRIPT_HANGUL, 0xFFDA, 0xFFDC },
- { SCRIPT_GURMUKHI, 0xA01, 0xA03 },
- { SCRIPT_GURMUKHI, 0xA05, 0xA0A },
- { SCRIPT_GURMUKHI, 0xA0F, 0xA10 },
- { SCRIPT_GURMUKHI, 0xA13, 0xA28 },
- { SCRIPT_GURMUKHI, 0xA2A, 0xA30 },
- { SCRIPT_GURMUKHI, 0xA32, 0xA33 },
- { SCRIPT_GURMUKHI, 0xA35, 0xA36 },
- { SCRIPT_GURMUKHI, 0xA38, 0xA39 },
- { SCRIPT_GURMUKHI, 0xA3C, 0xA3C },
- { SCRIPT_GURMUKHI, 0xA3E, 0xA42 },
- { SCRIPT_GURMUKHI, 0xA47, 0xA48 },
- { SCRIPT_GURMUKHI, 0xA4B, 0xA4D },
- { SCRIPT_GURMUKHI, 0xA51, 0xA51 },
- { SCRIPT_GURMUKHI, 0xA59, 0xA5C },
- { SCRIPT_GURMUKHI, 0xA5E, 0xA5E },
- { SCRIPT_GURMUKHI, 0xA66, 0xA75 },
- { SCRIPT_CYRILLIC, 0x400, 0x484 },
- { SCRIPT_CYRILLIC, 0x487, 0x52F },
- { SCRIPT_CYRILLIC, 0x1C80, 0x1C88 },
- { SCRIPT_CYRILLIC, 0x1D2B, 0x1D2B },
- { SCRIPT_CYRILLIC, 0x1D78, 0x1D78 },
- { SCRIPT_CYRILLIC, 0x2DE0, 0x2DFF },
- { SCRIPT_CYRILLIC, 0xA640, 0xA69F },
- { SCRIPT_CYRILLIC, 0xFE2E, 0xFE2F },
- { SCRIPT_DEVANAGARI, 0x900, 0x950 },
- { SCRIPT_DEVANAGARI, 0x953, 0x963 },
- { SCRIPT_DEVANAGARI, 0x966, 0x97F },
- { SCRIPT_DEVANAGARI, 0xA8E0, 0xA8FD },
- { SCRIPT_HEBREW, 0x591, 0x5C7 },
- { SCRIPT_HEBREW, 0x5D0, 0x5EA },
- { SCRIPT_HEBREW, 0x5F0, 0x5F4 },
- { SCRIPT_HEBREW, 0xFB1D, 0xFB36 },
- { SCRIPT_HEBREW, 0xFB38, 0xFB3C },
- { SCRIPT_HEBREW, 0xFB3E, 0xFB3E },
- { SCRIPT_HEBREW, 0xFB40, 0xFB41 },
- { SCRIPT_HEBREW, 0xFB43, 0xFB44 },
- { SCRIPT_HEBREW, 0xFB46, 0xFB4F },
- { SCRIPT_THAI, 0xE01, 0xE3A },
- { SCRIPT_THAI, 0xE40, 0xE5B },
- { SCRIPT_SYRIAC, 0x700, 0x70D },
- { SCRIPT_SYRIAC, 0x70F, 0x74A },
- { SCRIPT_SYRIAC, 0x74D, 0x74F },
- { SCRIPT_KANNADA, 0xC80, 0xC83 },
- { SCRIPT_KANNADA, 0xC85, 0xC8C },
- { SCRIPT_KANNADA, 0xC8E, 0xC90 },
- { SCRIPT_KANNADA, 0xC92, 0xCA8 },
- { SCRIPT_KANNADA, 0xCAA, 0xCB3 },
- { SCRIPT_KANNADA, 0xCB5, 0xCB9 },
- { SCRIPT_KANNADA, 0xCBC, 0xCC4 },
- { SCRIPT_KANNADA, 0xCC6, 0xCC8 },
- { SCRIPT_KANNADA, 0xCCA, 0xCCD },
- { SCRIPT_KANNADA, 0xCD5, 0xCD6 },
- { SCRIPT_KANNADA, 0xCDE, 0xCDE },
- { SCRIPT_KANNADA, 0xCE0, 0xCE3 },
- { SCRIPT_KANNADA, 0xCE6, 0xCEF },
- { SCRIPT_KANNADA, 0xCF1, 0xCF2 },
- { SCRIPT_LAO, 0xE81, 0xE82 },
- { SCRIPT_LAO, 0xE84, 0xE84 },
- { SCRIPT_LAO, 0xE87, 0xE88 },
- { SCRIPT_LAO, 0xE8A, 0xE8A },
- { SCRIPT_LAO, 0xE8D, 0xE8D },
- { SCRIPT_LAO, 0xE94, 0xE97 },
- { SCRIPT_LAO, 0xE99, 0xE9F },
- { SCRIPT_LAO, 0xEA1, 0xEA3 },
- { SCRIPT_LAO, 0xEA5, 0xEA5 },
- { SCRIPT_LAO, 0xEA7, 0xEA7 },
- { SCRIPT_LAO, 0xEAA, 0xEAB },
- { SCRIPT_LAO, 0xEAD, 0xEB9 },
- { SCRIPT_LAO, 0xEBB, 0xEBD },
- { SCRIPT_LAO, 0xEC0, 0xEC4 },
- { SCRIPT_LAO, 0xEC6, 0xEC6 },
- { SCRIPT_LAO, 0xEC8, 0xECD },
- { SCRIPT_LAO, 0xED0, 0xED9 },
- { SCRIPT_LAO, 0xEDC, 0xEDF },
- { SCRIPT_TELUGU, 0xC00, 0xC03 },
- { SCRIPT_TELUGU, 0xC05, 0xC0C },
- { SCRIPT_TELUGU, 0xC0E, 0xC10 },
- { SCRIPT_TELUGU, 0xC12, 0xC28 },
- { SCRIPT_TELUGU, 0xC2A, 0xC39 },
- { SCRIPT_TELUGU, 0xC3D, 0xC44 },
- { SCRIPT_TELUGU, 0xC46, 0xC48 },
- { SCRIPT_TELUGU, 0xC4A, 0xC4D },
- { SCRIPT_TELUGU, 0xC55, 0xC56 },
- { SCRIPT_TELUGU, 0xC58, 0xC5A },
- { SCRIPT_TELUGU, 0xC60, 0xC63 },
- { SCRIPT_TELUGU, 0xC66, 0xC6F },
- { SCRIPT_TELUGU, 0xC78, 0xC7F },
- { SCRIPT_KHMER, 0x1780, 0x17DD },
- { SCRIPT_KHMER, 0x17E0, 0x17E9 },
- { SCRIPT_KHMER, 0x17F0, 0x17F9 },
- { SCRIPT_KHMER, 0x19E0, 0x19FF },
- { SCRIPT_LATIN, 0x41, 0x5A },
- { SCRIPT_LATIN, 0x61, 0x7A },
- { SCRIPT_LATIN, 0xAA, 0xAA },
- { SCRIPT_LATIN, 0xBA, 0xBA },
- { SCRIPT_LATIN, 0xC0, 0xD6 },
- { SCRIPT_LATIN, 0xD8, 0xF6 },
- { SCRIPT_LATIN, 0xF8, 0x2B8 },
- { SCRIPT_LATIN, 0x2E0, 0x2E4 },
- { SCRIPT_LATIN, 0x1D00, 0x1D25 },
- { SCRIPT_LATIN, 0x1D2C, 0x1D5C },
- { SCRIPT_LATIN, 0x1D62, 0x1D65 },
- { SCRIPT_LATIN, 0x1D6B, 0x1D77 },
- { SCRIPT_LATIN, 0x1D79, 0x1DBE },
- { SCRIPT_LATIN, 0x1E00, 0x1EFF },
- { SCRIPT_LATIN, 0x2071, 0x2071 },
- { SCRIPT_LATIN, 0x207F, 0x207F },
- { SCRIPT_LATIN, 0x2090, 0x209C },
- { SCRIPT_LATIN, 0x212A, 0x212B },
- { SCRIPT_LATIN, 0x2132, 0x2132 },
- { SCRIPT_LATIN, 0x214E, 0x214E },
- { SCRIPT_LATIN, 0x2160, 0x2188 },
- { SCRIPT_LATIN, 0x2C60, 0x2C7F },
- { SCRIPT_LATIN, 0xA722, 0xA787 },
- { SCRIPT_LATIN, 0xA78B, 0xA7AE },
- { SCRIPT_LATIN, 0xA7B0, 0xA7B7 },
- { SCRIPT_LATIN, 0xA7F7, 0xA7FF },
- { SCRIPT_LATIN, 0xAB30, 0xAB5A },
- { SCRIPT_LATIN, 0xAB5C, 0xAB64 },
- { SCRIPT_LATIN, 0xFB00, 0xFB06 },
- { SCRIPT_LATIN, 0xFF21, 0xFF3A },
- { SCRIPT_LATIN, 0xFF41, 0xFF5A },
- { SCRIPT_TIBETAN, 0xF00, 0xF47 },
- { SCRIPT_TIBETAN, 0xF49, 0xF6C },
- { SCRIPT_TIBETAN, 0xF71, 0xF97 },
- { SCRIPT_TIBETAN, 0xF99, 0xFBC },
- { SCRIPT_TIBETAN, 0xFBE, 0xFCC },
- { SCRIPT_TIBETAN, 0xFCE, 0xFD4 },
- { SCRIPT_TIBETAN, 0xFD9, 0xFDA },
- { SCRIPT_MYANMAR, 0x1000, 0x109F },
- { SCRIPT_MYANMAR, 0xA9E0, 0xA9FE },
- { SCRIPT_MYANMAR, 0xAA60, 0xAA7F },
- { SCRIPT_OTHER, 0x2EA, 0x2EB },
- { SCRIPT_OTHER, 0x7C0, 0x7FA },
- { SCRIPT_OTHER, 0x800, 0x82D },
- { SCRIPT_OTHER, 0x830, 0x83E },
- { SCRIPT_OTHER, 0x840, 0x85B },
- { SCRIPT_OTHER, 0x85E, 0x85E },
- { SCRIPT_OTHER, 0x13A0, 0x13F5 },
- { SCRIPT_OTHER, 0x13F8, 0x13FD },
- { SCRIPT_OTHER, 0x1400, 0x169C },
- { SCRIPT_OTHER, 0x1700, 0x170C },
- { SCRIPT_OTHER, 0x170E, 0x1714 },
- { SCRIPT_OTHER, 0x1720, 0x1734 },
- { SCRIPT_OTHER, 0x1740, 0x1753 },
- { SCRIPT_OTHER, 0x1760, 0x176C },
- { SCRIPT_OTHER, 0x176E, 0x1770 },
- { SCRIPT_OTHER, 0x1772, 0x1773 },
- { SCRIPT_OTHER, 0x18B0, 0x18F5 },
- { SCRIPT_OTHER, 0x1900, 0x191E },
- { SCRIPT_OTHER, 0x1920, 0x192B },
- { SCRIPT_OTHER, 0x1930, 0x193B },
- { SCRIPT_OTHER, 0x1940, 0x1940 },
- { SCRIPT_OTHER, 0x1944, 0x196D },
- { SCRIPT_OTHER, 0x1970, 0x1974 },
- { SCRIPT_OTHER, 0x1980, 0x19AB },
- { SCRIPT_OTHER, 0x19B0, 0x19C9 },
- { SCRIPT_OTHER, 0x19D0, 0x19DA },
- { SCRIPT_OTHER, 0x19DE, 0x19DF },
- { SCRIPT_OTHER, 0x1A00, 0x1A1B },
- { SCRIPT_OTHER, 0x1A1E, 0x1A5E },
- { SCRIPT_OTHER, 0x1A60, 0x1A7C },
- { SCRIPT_OTHER, 0x1A7F, 0x1A89 },
- { SCRIPT_OTHER, 0x1A90, 0x1A99 },
- { SCRIPT_OTHER, 0x1AA0, 0x1AAD },
- { SCRIPT_OTHER, 0x1B00, 0x1B4B },
- { SCRIPT_OTHER, 0x1B50, 0x1B7C },
- { SCRIPT_OTHER, 0x1B80, 0x1BF3 },
- { SCRIPT_OTHER, 0x1BFC, 0x1C37 },
- { SCRIPT_OTHER, 0x1C3B, 0x1C49 },
- { SCRIPT_OTHER, 0x1C4D, 0x1C7F },
- { SCRIPT_OTHER, 0x1CC0, 0x1CC7 },
- { SCRIPT_OTHER, 0x2800, 0x28FF },
- { SCRIPT_OTHER, 0x2C00, 0x2C2E },
- { SCRIPT_OTHER, 0x2C30, 0x2C5E },
- { SCRIPT_OTHER, 0x2D30, 0x2D67 },
- { SCRIPT_OTHER, 0x2D6F, 0x2D70 },
- { SCRIPT_OTHER, 0x2D7F, 0x2D7F },
- { SCRIPT_OTHER, 0x3105, 0x312D },
- { SCRIPT_OTHER, 0x31A0, 0x31BA },
- { SCRIPT_OTHER, 0xA000, 0xA48C },
- { SCRIPT_OTHER, 0xA490, 0xA4C6 },
- { SCRIPT_OTHER, 0xA4D0, 0xA62B },
- { SCRIPT_OTHER, 0xA6A0, 0xA6F7 },
- { SCRIPT_OTHER, 0xA800, 0xA82B },
- { SCRIPT_OTHER, 0xA840, 0xA877 },
- { SCRIPT_OTHER, 0xA880, 0xA8C5 },
- { SCRIPT_OTHER, 0xA8CE, 0xA8D9 },
- { SCRIPT_OTHER, 0xA900, 0xA92D },
- { SCRIPT_OTHER, 0xA92F, 0xA953 },
- { SCRIPT_OTHER, 0xA95F, 0xA95F },
- { SCRIPT_OTHER, 0xA980, 0xA9CD },
- { SCRIPT_OTHER, 0xA9D0, 0xA9D9 },
- { SCRIPT_OTHER, 0xA9DE, 0xA9DF },
- { SCRIPT_OTHER, 0xAA00, 0xAA36 },
- { SCRIPT_OTHER, 0xAA40, 0xAA4D },
- { SCRIPT_OTHER, 0xAA50, 0xAA59 },
- { SCRIPT_OTHER, 0xAA5C, 0xAA5F },
- { SCRIPT_OTHER, 0xAA80, 0xAAC2 },
- { SCRIPT_OTHER, 0xAADB, 0xAAF6 },
- { SCRIPT_OTHER, 0xAB70, 0xABED },
- { SCRIPT_OTHER, 0xABF0, 0xABF9 },
- { SCRIPT_HAN, 0x2E80, 0x2E99 },
- { SCRIPT_HAN, 0x2E9B, 0x2EF3 },
- { SCRIPT_HAN, 0x2F00, 0x2FD5 },
- { SCRIPT_HAN, 0x3005, 0x3005 },
- { SCRIPT_HAN, 0x3007, 0x3007 },
- { SCRIPT_HAN, 0x3021, 0x3029 },
- { SCRIPT_HAN, 0x3038, 0x303B },
- { SCRIPT_HAN, 0x3400, 0x4DB5 },
- { SCRIPT_HAN, 0x4E00, 0x9FD5 },
- { SCRIPT_HAN, 0xF900, 0xFA6D },
- { SCRIPT_HAN, 0xFA70, 0xFAD9 },
- { SCRIPT_THAANA, 0x780, 0x7B1 },
- { SCRIPT_HIRAGANA, 0x3041, 0x3096 },
- { SCRIPT_HIRAGANA, 0x309D, 0x309F },
- { SCRIPT_KATAKANA, 0x30A1, 0x30FA },
- { SCRIPT_KATAKANA, 0x30FD, 0x30FF },
- { SCRIPT_KATAKANA, 0x31F0, 0x31FF },
- { SCRIPT_KATAKANA, 0x32D0, 0x32FE },
- { SCRIPT_KATAKANA, 0x3300, 0x3357 },
- { SCRIPT_KATAKANA, 0xFF66, 0xFF6F },
- { SCRIPT_KATAKANA, 0xFF71, 0xFF9D },
- { SCRIPT_ORIYA, 0xB01, 0xB03 },
- { SCRIPT_ORIYA, 0xB05, 0xB0C },
- { SCRIPT_ORIYA, 0xB0F, 0xB10 },
- { SCRIPT_ORIYA, 0xB13, 0xB28 },
- { SCRIPT_ORIYA, 0xB2A, 0xB30 },
- { SCRIPT_ORIYA, 0xB32, 0xB33 },
- { SCRIPT_ORIYA, 0xB35, 0xB39 },
- { SCRIPT_ORIYA, 0xB3C, 0xB44 },
- { SCRIPT_ORIYA, 0xB47, 0xB48 },
- { SCRIPT_ORIYA, 0xB4B, 0xB4D },
- { SCRIPT_ORIYA, 0xB56, 0xB57 },
- { SCRIPT_ORIYA, 0xB5C, 0xB5D },
- { SCRIPT_ORIYA, 0xB5F, 0xB63 },
- { SCRIPT_ORIYA, 0xB66, 0xB77 },
- { SCRIPT_BENGALI, 0x980, 0x983 },
- { SCRIPT_BENGALI, 0x985, 0x98C },
- { SCRIPT_BENGALI, 0x98F, 0x990 },
- { SCRIPT_BENGALI, 0x993, 0x9A8 },
- { SCRIPT_BENGALI, 0x9AA, 0x9B0 },
- { SCRIPT_BENGALI, 0x9B2, 0x9B2 },
- { SCRIPT_BENGALI, 0x9B6, 0x9B9 },
- { SCRIPT_BENGALI, 0x9BC, 0x9C4 },
- { SCRIPT_BENGALI, 0x9C7, 0x9C8 },
- { SCRIPT_BENGALI, 0x9CB, 0x9CE },
- { SCRIPT_BENGALI, 0x9D7, 0x9D7 },
- { SCRIPT_BENGALI, 0x9DC, 0x9DD },
- { SCRIPT_BENGALI, 0x9DF, 0x9E3 },
- { SCRIPT_BENGALI, 0x9E6, 0x9FB },
- { SCRIPT_RUNIC, 0x16A0, 0x16EA },
- { SCRIPT_RUNIC, 0x16EE, 0x16F8 },
- { SCRIPT_SINHALA, 0xD82, 0xD83 },
- { SCRIPT_SINHALA, 0xD85, 0xD96 },
- { SCRIPT_SINHALA, 0xD9A, 0xDB1 },
- { SCRIPT_SINHALA, 0xDB3, 0xDBB },
- { SCRIPT_SINHALA, 0xDBD, 0xDBD },
- { SCRIPT_SINHALA, 0xDC0, 0xDC6 },
- { SCRIPT_SINHALA, 0xDCA, 0xDCA },
- { SCRIPT_SINHALA, 0xDCF, 0xDD4 },
- { SCRIPT_SINHALA, 0xDD6, 0xDD6 },
- { SCRIPT_SINHALA, 0xDD8, 0xDDF },
- { SCRIPT_SINHALA, 0xDE6, 0xDEF },
- { SCRIPT_SINHALA, 0xDF2, 0xDF4 },
- { SCRIPT_COPTIC, 0x3E2, 0x3EF },
- { SCRIPT_COPTIC, 0x2C80, 0x2CF3 },
- { SCRIPT_COPTIC, 0x2CF9, 0x2CFF },
- { SCRIPT_GEORGIAN, 0x10A0, 0x10C5 },
- { SCRIPT_GEORGIAN, 0x10C7, 0x10C7 },
- { SCRIPT_GEORGIAN, 0x10CD, 0x10CD },
- { SCRIPT_GEORGIAN, 0x10D0, 0x10FA },
- { SCRIPT_GEORGIAN, 0x10FC, 0x10FF },
- { SCRIPT_GEORGIAN, 0x2D00, 0x2D25 },
- { SCRIPT_GEORGIAN, 0x2D27, 0x2D27 },
- { SCRIPT_GEORGIAN, 0x2D2D, 0x2D2D },
- { SCRIPT_GREEK, 0x370, 0x373 },
- { SCRIPT_GREEK, 0x375, 0x377 },
- { SCRIPT_GREEK, 0x37A, 0x37D },
- { SCRIPT_GREEK, 0x37F, 0x37F },
- { SCRIPT_GREEK, 0x384, 0x384 },
- { SCRIPT_GREEK, 0x386, 0x386 },
- { SCRIPT_GREEK, 0x388, 0x38A },
- { SCRIPT_GREEK, 0x38C, 0x38C },
- { SCRIPT_GREEK, 0x38E, 0x3A1 },
- { SCRIPT_GREEK, 0x3A3, 0x3E1 },
- { SCRIPT_GREEK, 0x3F0, 0x3FF },
- { SCRIPT_GREEK, 0x1D26, 0x1D2A },
- { SCRIPT_GREEK, 0x1D5D, 0x1D61 },
- { SCRIPT_GREEK, 0x1D66, 0x1D6A },
- { SCRIPT_GREEK, 0x1DBF, 0x1DBF },
- { SCRIPT_GREEK, 0x1F00, 0x1F15 },
- { SCRIPT_GREEK, 0x1F18, 0x1F1D },
- { SCRIPT_GREEK, 0x1F20, 0x1F45 },
- { SCRIPT_GREEK, 0x1F48, 0x1F4D },
- { SCRIPT_GREEK, 0x1F50, 0x1F57 },
- { SCRIPT_GREEK, 0x1F59, 0x1F59 },
- { SCRIPT_GREEK, 0x1F5B, 0x1F5B },
- { SCRIPT_GREEK, 0x1F5D, 0x1F5D },
- { SCRIPT_GREEK, 0x1F5F, 0x1F7D },
- { SCRIPT_GREEK, 0x1F80, 0x1FB4 },
- { SCRIPT_GREEK, 0x1FB6, 0x1FC4 },
- { SCRIPT_GREEK, 0x1FC6, 0x1FD3 },
- { SCRIPT_GREEK, 0x1FD6, 0x1FDB },
- { SCRIPT_GREEK, 0x1FDD, 0x1FEF },
- { SCRIPT_GREEK, 0x1FF2, 0x1FF4 },
- { SCRIPT_GREEK, 0x1FF6, 0x1FFE },
- { SCRIPT_GREEK, 0x2126, 0x2126 },
- { SCRIPT_GREEK, 0xAB65, 0xAB65 },
- };
-
- // Fills data[0 .. len) with the script of every code point listed in ScriptRanges.
- //
- // data - output table indexed by code point; must have room for len entries.
- // len  - number of entries in data; ranges, or the parts of ranges, that lie at
- //        or above len are ignored.
- //
- // Code points that no range covers are left as 0.
- void InitScriptData(ui8 data[], size_t len) {
-     memset(data, 0, len * sizeof(ui8));
-     for (const auto& range : ScriptRanges) {
-         Y_ASSERT(range.Start <= range.End);
-         Y_ASSERT((unsigned)range.Script < 0x100);
-         // Use an exclusive upper bound: clamping an inclusive bound to len and
-         // looping with <= would write data[len], one element past the table.
-         size_t stop = static_cast<size_t>(range.End) + 1;
-         if (stop > len)
-             stop = len;
-         for (size_t j = range.Start; j < stop; ++j) {
-             data[j] = static_cast<ui8>(range.Script);
-         }
-     }
- }
-}
diff --git a/library/cpp/langs/langs.cpp b/library/cpp/langs/langs.cpp
deleted file mode 100644
index 2c508e16022..00000000000
--- a/library/cpp/langs/langs.cpp
+++ /dev/null
@@ -1,330 +0,0 @@
-#include "langs.h"
-
-#include <library/cpp/digest/lower_case/hash_ops.h>
-
-#include <util/generic/array_size.h>
-#include <util/generic/hash.h>
-#include <util/generic/singleton.h>
-#include <util/generic/strbuf.h>
-#include <util/generic/yexception.h>
-#include <util/system/defaults.h>
-
-#include <array>
-#include <cctype>
-
-/*
- * define language by ELanguage
- */
-
-namespace {
- struct TLanguageNameAndEnum { // One row of the language table: the constant, its script and its names.
- ELanguage Language; // Language constant; TLanguagesMap uses it as the index into its per-language arrays.
- EScript Script; // Script recorded for the language.
- const char* EnglishName; // Full English name, e.g. "Russian".
- const char* BiblioName; // Code such as "rus"; this is what NameByLanguage returns.
- const char* IsoName; // Short code such as "ru"; nullptr when the language has none.
- const char* Synonyms; // Extra lookup names separated by spaces, commas or semicolons; may be nullptr.
- };
-
- const TLanguageNameAndEnum LanguageNameAndEnum[] = {
- {LANG_UNK, SCRIPT_OTHER, "Unknown", "unk", "mis", nullptr},
- {LANG_RUS, SCRIPT_CYRILLIC, "Russian", "rus", "ru", "ru-RU"},
- {LANG_ENG, SCRIPT_LATIN, "English", "eng", "en", "en-US, en-GB, en-CA, en-NZ, en-AU"},
- {LANG_POL, SCRIPT_LATIN, "Polish", "pol", "pl", nullptr},
- {LANG_HUN, SCRIPT_LATIN, "Hungarian", "hun", "hu", nullptr},
- {LANG_UKR, SCRIPT_CYRILLIC, "Ukrainian", "ukr", "uk", "uk-UA"},
- {LANG_GER, SCRIPT_LATIN, "German", "ger", "de", "deu"},
- {LANG_FRE, SCRIPT_LATIN, "French", "fre", "fr", "fra, frn, fr-FR, fr-CA"},
- {LANG_TAT, SCRIPT_CYRILLIC, "Tatar", "tat", "tt", nullptr},
- {LANG_BEL, SCRIPT_CYRILLIC, "Belarusian", "bel", "be", "blr, Belorussian"},
- {LANG_KAZ, SCRIPT_CYRILLIC, "Kazakh", "kaz", "kk", "kk-Cyrl"},
- {LANG_ALB, SCRIPT_LATIN, "Albanian", "alb", "sq", nullptr},
- {LANG_SPA, SCRIPT_LATIN, "Spanish", "spa", "es", nullptr},
- {LANG_ITA, SCRIPT_LATIN, "Italian", "ita", "it", nullptr},
- {LANG_ARM, SCRIPT_ARMENIAN, "Armenian", "arm", "hy", "hye"},
- {LANG_DAN, SCRIPT_LATIN, "Danish", "dan", "da", nullptr},
- {LANG_POR, SCRIPT_LATIN, "Portuguese", "por", "pt", nullptr},
- {LANG_ICE, SCRIPT_LATIN, "Icelandic", "ice", "is", "isl"},
- {LANG_SLO, SCRIPT_LATIN, "Slovak", "slo", "sk", "slk"},
- {LANG_SLV, SCRIPT_LATIN, "Slovene", "slv", "sl", "Slovenian"},
- {LANG_DUT, SCRIPT_LATIN, "Dutch", "dut", "nl", "nld"},
- {LANG_BUL, SCRIPT_CYRILLIC, "Bulgarian", "bul", "bg", nullptr},
- {LANG_CAT, SCRIPT_LATIN, "Catalan", "cat", "ca", nullptr},
- {LANG_HRV, SCRIPT_LATIN, "Croatian", "hrv", "hr", "scr"},
- {LANG_CZE, SCRIPT_LATIN, "Czech", "cze", "cs", "ces"},
- {LANG_GRE, SCRIPT_GREEK, "Greek", "gre", "el", "ell"},
- {LANG_HEB, SCRIPT_HEBREW, "Hebrew", "heb", "he", "iw"}, // 'iw' is old ISO-639 code
- {LANG_NOR, SCRIPT_LATIN, "Norwegian", "nor", "no", nullptr},
- {LANG_MAC, SCRIPT_CYRILLIC, "Macedonian", "mac", "mk", nullptr},
- {LANG_SWE, SCRIPT_LATIN, "Swedish", "swe", "sv", nullptr},
- {LANG_KOR, SCRIPT_HANGUL, "Korean", "kor", "ko", nullptr},
- {LANG_LAT, SCRIPT_LATIN, "Latin", "lat", "la", nullptr},
- {LANG_BASIC_RUS, SCRIPT_CYRILLIC, "Basic Russian", "basic-rus", "bas-ru", nullptr},
- {LANG_BOS, SCRIPT_LATIN, "Bosnian", "bos", "bs", nullptr},
- {LANG_MLT, SCRIPT_LATIN, "Maltese", "mlt", "mt", nullptr},
-
- {LANG_EMPTY, SCRIPT_OTHER, "Empty", "empty", nullptr, nullptr},
- {LANG_UNK_LAT, SCRIPT_LATIN, "Unknown Latin", "unklat", nullptr, nullptr},
- {LANG_UNK_CYR, SCRIPT_CYRILLIC, "Unknown Cyrillic", "unkcyr", nullptr, nullptr},
- {LANG_UNK_ALPHA, SCRIPT_OTHER, "Unknown Alpha", "unkalpha", nullptr, nullptr},
-
- {LANG_FIN, SCRIPT_LATIN, "Finnish", "fin", "fi", nullptr},
- {LANG_EST, SCRIPT_LATIN, "Estonian", "est", "et", nullptr},
- {LANG_LAV, SCRIPT_LATIN, "Latvian", "lav", "lv", nullptr},
- {LANG_LIT, SCRIPT_LATIN, "Lithuanian", "lit", "lt", nullptr},
- {LANG_BAK, SCRIPT_CYRILLIC, "Bashkir", "bak", "ba", nullptr},
- {LANG_TUR, SCRIPT_LATIN, "Turkish", "tur", "tr", nullptr},
- {LANG_RUM, SCRIPT_LATIN, "Romanian", "rum", "ro", "ron"},
- {LANG_MON, SCRIPT_CYRILLIC, "Mongolian", "mon", "mn", nullptr},
- {LANG_UZB, SCRIPT_LATIN, "Uzbek", "uzb", "uz", "uz-Latn"},
- {LANG_KIR, SCRIPT_CYRILLIC, "Kirghiz", "kir", "ky", "Kyrgyz"},
- {LANG_TGK, SCRIPT_CYRILLIC, "Tajik", "tgk", "tg", nullptr},
- {LANG_TUK, SCRIPT_LATIN, "Turkmen", "tuk", "tk", nullptr},
- {LANG_SRP, SCRIPT_CYRILLIC, "Serbian", "srp", "sr", nullptr},
- {LANG_AZE, SCRIPT_LATIN, "Azerbaijani", "aze", "az", "Azeri"},
- {LANG_BASIC_ENG, SCRIPT_LATIN, "Basic English", "basic-eng", "bas-en", nullptr},
- {LANG_GEO, SCRIPT_GEORGIAN, "Georgian", "geo", "ka", "kat"},
- {LANG_ARA, SCRIPT_ARABIC, "Arabic", "ara", "ar", nullptr},
- {LANG_PER, SCRIPT_ARABIC, "Persian", "per", "fa", "fas"},
- {LANG_CHU, SCRIPT_CYRILLIC, "Church Slavonic", "chu", "cu", nullptr},
- {LANG_CHI, SCRIPT_HAN, "Chinese", "chi", "zh", "zho"},
- {LANG_JPN, SCRIPT_HIRAGANA, "Japanese", "jpn", "ja", nullptr},
- {LANG_IND, SCRIPT_LATIN, "Indonesian", "ind", "id", "in"}, // 'in' is old ISO-639 code
- {LANG_MAY, SCRIPT_LATIN, "Malay", "may", "ms", "msa"},
- {LANG_THA, SCRIPT_THAI, "Thai", "tha", "th", nullptr},
- {LANG_VIE, SCRIPT_LATIN, "Vietnamese", "vie", "vi", nullptr},
- {LANG_GLE, SCRIPT_LATIN, "Irish", "gle", "ga", nullptr},
- {LANG_TGL, SCRIPT_LATIN, "Tagalog", "tgl", "tl", "fil"},
- {LANG_HIN, SCRIPT_DEVANAGARI, "Hindi", "hin", "hi", nullptr},
- {LANG_AFR, SCRIPT_LATIN, "Afrikaans", "afr", "af", nullptr},
- {LANG_URD, SCRIPT_ARABIC, "Urdu", "urd", "ur", nullptr},
- {LANG_MYA, SCRIPT_MYANMAR, "Burmese", "mya", "my", nullptr},
- {LANG_KHM, SCRIPT_KHMER, "Khmer", "khm", "km", nullptr},
- {LANG_LAO, SCRIPT_LAO, "Lao", "lao", "lo", "Laotian, Laothian"},
- {LANG_TAM, SCRIPT_TAMIL, "Tamil", "tam", "ta", nullptr},
- {LANG_BEN, SCRIPT_BENGALI, "Bengali", "ben", "bn", nullptr},
- {LANG_GUJ, SCRIPT_GUJARATI, "Gujarati", "guj", "gu", nullptr},
- {LANG_KAN, SCRIPT_KANNADA, "Kannada", "kan", "kn", nullptr},
- {LANG_PAN, SCRIPT_GURMUKHI, "Punjabi", "pan", "pa", nullptr},
- {LANG_SIN, SCRIPT_SINHALA, "Sinhalese", "sin", "si", nullptr},
- {LANG_SWA, SCRIPT_LATIN, "Swahili", "swa", "sw", nullptr},
- {LANG_BAQ, SCRIPT_LATIN, "Basque", "baq", "eu", "eus"},
- {LANG_WEL, SCRIPT_LATIN, "Welsh", "wel", "cy", "cym"},
- {LANG_GLG, SCRIPT_LATIN, "Galician", "glg", "gl", nullptr},
- {LANG_HAT, SCRIPT_LATIN, "Haitian Creole", "hat", "ht", "Haitian"},
- {LANG_MLG, SCRIPT_LATIN, "Malagasy", "mlg", "mg", nullptr},
- {LANG_CHV, SCRIPT_CYRILLIC, "Chuvash", "chv", "cv", nullptr},
- {LANG_UDM, SCRIPT_CYRILLIC, "Udmurt", "udm", "udm", nullptr},
- {LANG_KPV, SCRIPT_CYRILLIC, "Komi-Zyrian", "kpv", "kv", "Komi, kom"},
- {LANG_MHR, SCRIPT_CYRILLIC, "Meadow Mari", "mhr", "mhr", "EasternMari, Mari, chm"},
- {LANG_SJN, SCRIPT_LATIN, "Sindarin", "sjn", "sjn", nullptr},
- {LANG_MRJ, SCRIPT_CYRILLIC, "Hill Mari", "mrj", "mrj", "WesternMari"},
- {LANG_KOI, SCRIPT_CYRILLIC, "Komi-Permyak", "koi", "koi", nullptr},
- {LANG_LTZ, SCRIPT_LATIN, "Luxembourgish", "ltz", "lb", "Luxemburgish"},
- {LANG_GLA, SCRIPT_LATIN, "Scottish Gaelic", "gla", "gd", "Gaelic"},
- {LANG_CEB, SCRIPT_LATIN, "Cebuano", "ceb", "ceb", "Bisaya, Binisaya, Visayan"},
- {LANG_PUS, SCRIPT_ARABIC, "Pashto", "pus", "ps", nullptr},
- {LANG_KMR, SCRIPT_LATIN, "Kurmanji", "kmr", "ku", "Kurdish"},
- {LANG_AMH, SCRIPT_ETHIOPIC, "Amharic", "amh", "am", nullptr},
- {LANG_ZUL, SCRIPT_LATIN, "Zulu", "zul", "zu", nullptr},
- {LANG_IBO, SCRIPT_LATIN, "Igbo", "ibo", "ig", "Ibo"},
- {LANG_YOR, SCRIPT_LATIN, "Yoruba", "yor", "yo", nullptr},
- {LANG_COS, SCRIPT_LATIN, "Corsican", "cos", "co", nullptr},
- {LANG_XHO, SCRIPT_LATIN, "Xhosa", "xho", "xh", nullptr},
- {LANG_JAV, SCRIPT_LATIN, "Javanese", "jav", "jv", nullptr}, // Also SCRIPT_JAVANESE and SCRIPT_ARABIC
- {LANG_NEP, SCRIPT_DEVANAGARI, "Nepali", "nep", "ne", nullptr},
- {LANG_SND, SCRIPT_DEVANAGARI, "Sindhi", "snd", "sd", nullptr}, // Also SCRIPT_ARABIC and SCRIPT_GUJARATI
- {LANG_SOM, SCRIPT_LATIN, "Somali", "som", "so", nullptr},
- {LANG_EPO, SCRIPT_LATIN, "Esperanto", "epo", "eo", nullptr},
- {LANG_TEL, SCRIPT_TELUGU, "Telugu", "tel", "te", nullptr},
- {LANG_MAR, SCRIPT_DEVANAGARI, "Marathi", "mar", "mr", nullptr},
- {LANG_HAU, SCRIPT_LATIN, "Hausa", "hau", "ha", nullptr},
- {LANG_YID, SCRIPT_HEBREW, "Yiddish", "yid", "yi", nullptr},
- {LANG_MAL, SCRIPT_MALAYALAM, "Malayalam", "mal", "ml", nullptr},
- {LANG_MAO, SCRIPT_LATIN, "Maori", "mao", "mi", "mri"},
- {LANG_SUN, SCRIPT_LATIN, "Sundanese", "sun", "su", nullptr},
- {LANG_PAP, SCRIPT_LATIN, "Papiamento", "pap", "pap", nullptr},
- {LANG_UZB_CYR, SCRIPT_CYRILLIC, "Cyrillic Uzbek", "uzbcyr", "uz-Cyrl", nullptr}, // https://tools.ietf.org/html/rfc5646
- {LANG_TRANSCR_IPA, SCRIPT_LATIN, "International Phonetic Alphabet Transcription", "ipa", "tr-ipa", nullptr},
- {LANG_EMJ, SCRIPT_LATIN, "Emoji", "emj", "emj", nullptr},
- {LANG_UYG, SCRIPT_ARABIC, "Uyghur", "uig", "ug", nullptr},
- {LANG_BRE, SCRIPT_LATIN, "Breton", "bre", "br", nullptr},
- {LANG_SAH, SCRIPT_CYRILLIC, "Yakut", "sah", "sah", nullptr},
- {LANG_KAZ_LAT, SCRIPT_LATIN, "Latin Kazakh", "kazlat", "kk-Latn", nullptr},
- };
-
- static_assert(static_cast<size_t>(LANG_MAX) == Y_ARRAY_SIZE(LanguageNameAndEnum), "Size doesn't match");
-
- // Lookup tables built once from LanguageNameAndEnum: a name -> language hash
- // plus per-language arrays of names and scripts indexed by ELanguage.
- class TLanguagesMap {
- private:
-     // Stored in the name arrays for languages that lack a name of that kind.
-     static const char* const EMPTY_NAME;
-
-     // Every registered name (biblio, ISO, English, synonyms) -> language.
-     // NOTE(review): TCIOps presumably makes the lookup case-insensitive - confirm in hash_ops.h.
-     using TNamesHash = THashMap<TStringBuf, ELanguage, TCIOps, TCIOps>;
-     TNamesHash Hash;
-
-     using TNamesArray = std::array<const char*, static_cast<size_t>(LANG_MAX)>;
-     TNamesArray BiblioNames;
-     TNamesArray IsoNames;
-     TNamesArray FullNames;
-
-     using TScripts = std::array<EScript, static_cast<size_t>(LANG_MAX)>;
-     TScripts Scripts;
-
- private:
-     // True when language can be used as an index into the per-language arrays.
-     static bool IsValidIndex(ELanguage language) {
-         return language >= 0 && static_cast<size_t>(language) < static_cast<size_t>(LANG_MAX);
-     }
-
-     // Returns names[language], or nullptr for an out-of-range language.
-     static const char* NameOrNull(const TNamesArray& names, ELanguage language) {
-         return IsValidIndex(language) ? names[language] : nullptr;
-     }
-
-     // Registers name -> language. A name that is already registered is left
-     // untouched; debug builds assert that it maps to the same language.
-     void AddNameToHash(const TStringBuf& name, ELanguage language) {
-         const auto known = Hash.find(name);
-         if (known != Hash.end()) {
-             Y_ASSERT(known->second == language);
-             return;
-         }
-
-         Hash[name] = language;
-     }
-
-     // Stores name as the language's entry in names and registers it for lookup.
-     // Null and empty names are ignored, leaving EMPTY_NAME in the array.
-     void AddName(const char* name, ELanguage language, TNamesArray& names) {
-         if (name == nullptr || strlen(name) == 0)
-             return;
-
-         // Each language may get at most one name of each kind.
-         Y_ASSERT(names[language] == EMPTY_NAME);
-         names[language] = name;
-
-         AddNameToHash(name, language);
-     }
-
-     // Registers every token of syn (split on spaces, commas and semicolons)
-     // as an additional lookup name for language. A null syn is a no-op.
-     void AddSynonyms(const char* syn, ELanguage language) {
-         static const char* del = " ,;";
-         if (!syn)
-             return;
-         while (*syn) {
-             const size_t len = strcspn(syn, del);
-             // A list that starts with a delimiter yields an empty token; do not
-             // register it, otherwise the prefix fallback in LanguageByName could
-             // match inputs such as "-foo".
-             if (len != 0)
-                 AddNameToHash(TStringBuf(syn, len), language);
-             syn += len;
-             while (*syn && strchr(del, *syn))
-                 ++syn;
-         }
-     }
-
- public:
-     // Builds all tables from LanguageNameAndEnum.
-     TLanguagesMap() {
-         BiblioNames.fill(EMPTY_NAME);
-         IsoNames.fill(EMPTY_NAME);
-         FullNames.fill(EMPTY_NAME);
-         Scripts.fill(SCRIPT_OTHER);
-
-         for (const TLanguageNameAndEnum& val : LanguageNameAndEnum) {
-             const ELanguage language = val.Language;
-
-             AddName(val.BiblioName, language, BiblioNames);
-             AddName(val.IsoName, language, IsoNames);
-             AddName(val.EnglishName, language, FullNames);
-             AddSynonyms(val.Synonyms, language);
-
-             // Only the first script seen for a language is kept.
-             if (Scripts[language] == SCRIPT_OTHER) {
-                 Scripts[language] = val.Script;
-             }
-         }
-     }
-
- public:
-     // Returns the language registered under name, or def when name is empty
-     // or unknown. If the full name is unknown, the part before the first '_'
-     // or '-' is tried as well.
-     inline ELanguage LanguageByName(const TStringBuf& name, ELanguage def) const {
-         if (!name)
-             return def;
-
-         TNamesHash::const_iterator i = Hash.find(name);
-         if (i == Hash.end()) {
-             // Try to extract the primary language code from constructions like "en-cockney" or "zh_Hant"
-             const size_t dash_pos = name.find_first_of("_-");
-             if (dash_pos != TStringBuf::npos)
-                 i = Hash.find(name.substr(0, dash_pos));
-             if (i == Hash.end())
-                 return def;
-         }
-
-         return i->second;
-     }
-
-     // English name of language; nullptr when language is out of range.
-     inline const char* FullNameByLanguage(ELanguage language) const {
-         return NameOrNull(FullNames, language);
-     }
-     // Biblio name of language; nullptr when language is out of range.
-     inline const char* BiblioNameByLanguage(ELanguage language) const {
-         return NameOrNull(BiblioNames, language);
-     }
-     // ISO name of language; nullptr when language is out of range.
-     inline const char* IsoNameByLanguage(ELanguage language) const {
-         return NameOrNull(IsoNames, language);
-     }
-
-     // Script recorded for language. An out-of-range language (for example the
-     // LANG_MAX that LanguageByNameStrict returns on failure) yields
-     // SCRIPT_UNKNOWN instead of reading past the end of the array.
-     inline EScript Script(ELanguage language) const {
-         if (!IsValidIndex(language))
-             return SCRIPT_UNKNOWN;
-         return Scripts[language];
-     }
- };
-}
-
-const char* const TLanguagesMap::EMPTY_NAME = "";
-
-// Full English name of the language, looked up in the shared TLanguagesMap.
-// Yields nullptr when language is outside the table.
-const char* FullNameByLanguage(ELanguage language) {
-    const TLanguagesMap* languages = Singleton<TLanguagesMap>();
-    return languages->FullNameByLanguage(language);
-}
-// Biblio name of the language (the BiblioName column of the language table).
-// Yields nullptr when language is outside the table.
-const char* NameByLanguage(ELanguage language) {
-    const TLanguagesMap* languages = Singleton<TLanguagesMap>();
-    return languages->BiblioNameByLanguage(language);
-}
-// ISO name of the language (the IsoName column of the language table).
-// Yields nullptr when language is outside the table.
-const char* IsoNameByLanguage(ELanguage language) {
-    const TLanguagesMap* languages = Singleton<TLanguagesMap>();
-    return languages->IsoNameByLanguage(language);
-}
-
-// Resolves name to a language constant; LANG_MAX signals "not found".
-ELanguage LanguageByNameStrict(const TStringBuf& name) {
-    const TLanguagesMap* languages = Singleton<TLanguagesMap>();
-    return languages->LanguageByName(name, LANG_MAX);
-}
-
-// Resolves name to a language constant and throws yexception when the
-// strict lookup reports LANG_MAX, i.e. the name is not known.
-ELanguage LanguageByNameOrDie(const TStringBuf& name) {
-    const ELanguage found = LanguageByNameStrict(name);
-    if (found != LANG_MAX) {
-        return found;
-    }
-    ythrow yexception() << "LanguageByNameOrDie: invalid language '" << name << "'";
-}
-
-// Resolves name to a language constant; LANG_UNK is the fallback for
-// empty or unrecognised names.
-ELanguage LanguageByName(const TStringBuf& name) {
-    const TLanguagesMap* languages = Singleton<TLanguagesMap>();
-    return languages->LanguageByName(name, LANG_UNK);
-}
-
-// Script recorded for the language in the shared TLanguagesMap.
-// Pass a valid language constant: the value is used as a table index.
-EScript ScriptByLanguage(ELanguage language) {
-    const TLanguagesMap* languages = Singleton<TLanguagesMap>();
-    return languages->Script(language);
-}
-
-namespace {
- const size_t MAX_GLYPH = 0x10000;
- // Table mapping each code point below MAX_GLYPH to its script, filled once
- // by NCharsetInternal::InitScriptData.
- class TScriptGlyphIndex {
- public:
-     TScriptGlyphIndex() {
-         NCharsetInternal::InitScriptData(Data, MAX_GLYPH);
-     }
-
-     // Script stored for glyph; SCRIPT_UNKNOWN for code points the table does not cover.
-     EScript GetGlyphScript(wchar32 glyph) const {
-         const bool inTable = glyph < MAX_GLYPH;
-         return inTable ? static_cast<EScript>(Data[glyph]) : SCRIPT_UNKNOWN;
-     }
-
- private:
-     // One byte per code point, holding an EScript value.
-     ui8 Data[MAX_GLYPH];
- };
-}
-
-// Script of a single code point, taken from the shared TScriptGlyphIndex.
-EScript ScriptByGlyph(wchar32 glyph) {
-    const TScriptGlyphIndex* index = HugeSingleton<TScriptGlyphIndex>();
-    return index->GetGlyphScript(glyph);
-}
-
-// Stream output for ELanguage: prints whatever NameByLanguage returns for it.
-template <>
-void Out<ELanguage>(IOutputStream& o, ELanguage lang) {
-    const char* const name = NameByLanguage(lang);
-    o << name;
-}
diff --git a/library/cpp/langs/langs.h b/library/cpp/langs/langs.h
deleted file mode 100644
index 360ab6a8321..00000000000
--- a/library/cpp/langs/langs.h
+++ /dev/null
@@ -1,229 +0,0 @@
-#pragma once
-
-#include "scripts.h"
-
-#include <util/generic/strbuf.h>
-#include <util/system/defaults.h>
-
-#if defined(_win_)
-// LANG_LAO is #define in WinNT.h
-#undef LANG_LAO
-#endif
-
-// Language names are given according to ISO 639-2/B
-// Some languages are not present in ISO 639-2/B. Then ISO 639-3 is used.
-// http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
-enum ELanguage {
- LANG_UNK = 0, // Unknown
- LANG_RUS = 1, // Russian
- LANG_ENG = 2, // English
- LANG_POL = 3, // Polish
- LANG_HUN = 4, // Hungarian
- LANG_UKR = 5, // Ukrainian
- LANG_GER = 6, // German
- LANG_FRE = 7, // French
- LANG_TAT = 8, // Tatar
- LANG_BEL = 9, // Belarusian
- LANG_KAZ = 10, // Kazakh
- LANG_ALB = 11, // Albanian
- LANG_SPA = 12, // Spanish
- LANG_ITA = 13, // Italian
- LANG_ARM = 14, // Armenian
- LANG_DAN = 15, // Danish
- LANG_POR = 16, // Portuguese
- LANG_ICE = 17, // Icelandic
- LANG_SLO = 18, // Slovak
- LANG_SLV = 19, // Slovene
- LANG_DUT = 20, // Dutch (Netherlandish language)
- LANG_BUL = 21, // Bulgarian
- LANG_CAT = 22, // Catalan
- LANG_HRV = 23, // Croatian
- LANG_CZE = 24, // Czech
- LANG_GRE = 25, // Greek
- LANG_HEB = 26, // Hebrew
- LANG_NOR = 27, // Norwegian
- LANG_MAC = 28, // Macedonian
- LANG_SWE = 29, // Swedish
- LANG_KOR = 30, // Korean
- LANG_LAT = 31, // Latin
- LANG_BASIC_RUS = 32, // Simplified version of Russian (used at lemmer only)
- LANG_BOS = 33, // Bosnian
- LANG_MLT = 34, // Maltese
- LANG_EMPTY = 35, // Indicate that document is empty
- LANG_UNK_LAT = 36, // Any unrecognized latin language
- LANG_UNK_CYR = 37, // Any unrecognized cyrillic language
- LANG_UNK_ALPHA = 38, // Any unrecognized alphabetic language not fit into previous categories
- LANG_FIN = 39, // Finnish
- LANG_EST = 40, // Estonian
- LANG_LAV = 41, // Latvian
- LANG_LIT = 42, // Lithuanian
- LANG_BAK = 43, // Bashkir
- LANG_TUR = 44, // Turkish
- LANG_RUM = 45, // Romanian (also Moldavian)
- LANG_MON = 46, // Mongolian
- LANG_UZB = 47, // Uzbek
- LANG_KIR = 48, // Kirghiz
- LANG_TGK = 49, // Tajik
- LANG_TUK = 50, // Turkmen
- LANG_SRP = 51, // Serbian
- LANG_AZE = 52, // Azerbaijani
- LANG_BASIC_ENG = 53, // Simplified version of English (used at lemmer only)
- LANG_GEO = 54, // Georgian
- LANG_ARA = 55, // Arabic
- LANG_PER = 56, // Persian
- LANG_CHU = 57, // Church Slavonic
- LANG_CHI = 58, // Chinese
- LANG_JPN = 59, // Japanese
- LANG_IND = 60, // Indonesian
- LANG_MAY = 61, // Malay
- LANG_THA = 62, // Thai
- LANG_VIE = 63, // Vietnamese
- LANG_GLE = 64, // Irish (Gaelic)
- LANG_TGL = 65, // Tagalog (Filipino)
- LANG_HIN = 66, // Hindi
- LANG_AFR = 67, // Afrikaans
- LANG_URD = 68, // Urdu
- LANG_MYA = 69, // Burmese
- LANG_KHM = 70, // Khmer
- LANG_LAO = 71, // Lao
- LANG_TAM = 72, // Tamil
- LANG_BEN = 73, // Bengali
- LANG_GUJ = 74, // Gujarati
- LANG_KAN = 75, // Kannada
- LANG_PAN = 76, // Punjabi
- LANG_SIN = 77, // Sinhalese
- LANG_SWA = 78, // Swahili
- LANG_BAQ = 79, // Basque
- LANG_WEL = 80, // Welsh
- LANG_GLG = 81, // Galician
- LANG_HAT = 82, // Haitian Creole
- LANG_MLG = 83, // Malagasy
- LANG_CHV = 84, // Chuvash
- LANG_UDM = 85, // Udmurt
- LANG_KPV = 86, // Komi-Zyrian
- LANG_MHR = 87, // Meadow Mari (Eastern Mari)
- LANG_SJN = 88, // Sindarin
- LANG_MRJ = 89, // Hill Mari (Western Mari)
- LANG_KOI = 90, // Komi-Permyak
- LANG_LTZ = 91, // Luxembourgish
- LANG_GLA = 92, // Scottish Gaelic
- LANG_CEB = 93, // Cebuano
- LANG_PUS = 94, // Pashto
- LANG_KMR = 95, // Kurmanji
- LANG_AMH = 96, // Amharic
- LANG_ZUL = 97, // Zulu
- LANG_IBO = 98, // Igbo
- LANG_YOR = 99, // Yoruba
- LANG_COS = 100, // Corsican
- LANG_XHO = 101, // Xhosa
- LANG_JAV = 102, // Javanese
- LANG_NEP = 103, // Nepali
- LANG_SND = 104, // Sindhi
- LANG_SOM = 105, // Somali
- LANG_EPO = 106, // Esperanto
- LANG_TEL = 107, // Telugu
- LANG_MAR = 108, // Marathi
- LANG_HAU = 109, // Hausa
- LANG_YID = 110, // Yiddish
- LANG_MAL = 111, // Malayalam
- LANG_MAO = 112, // Maori
- LANG_SUN = 113, // Sundanese
- LANG_PAP = 114, // Papiamento
- LANG_UZB_CYR = 115, // Cyrillic Uzbek
- LANG_TRANSCR_IPA = 116, // International Phonetic Alphabet Transcription
- LANG_EMJ = 117, // Emoji
- LANG_UYG = 118, // Uyghur
- LANG_BRE = 119, // Breton
- LANG_SAH = 120, // Yakut
- LANG_KAZ_LAT = 121, // Latin Kazakh
- LANG_MAX // Not a language: one past the last value; LanguageByNameStrict returns it on failure
-};
-
-/**
- * Converts string to corresponding enum. Will try to extract the primary language code from
- * constructions like "en-cockney" or "zh_Hant". In case of failure will return `LANG_UNK`.
- *
- * @param name Language name
- * @return Language enum
- */
-ELanguage LanguageByName(const TStringBuf& name);
-
-/**
- * Same as `LanguageByName`, but in case of failure will return `LANG_MAX`.
- *
- * @see LanguageByName
- */
-ELanguage LanguageByNameStrict(const TStringBuf& name);
-
-/**
- * Converts language enum to corresponding ISO 639-2/B alpha-3 code. For languages missing from the
- * ISO standard the conversions are:
- * - LANG_UNK: "unk"
- * - LANG_BASIC_RUS: "basic-rus"
- * - LANG_EMPTY: "empty"
- * - LANG_UNK_LAT: "unklat"
- * - LANG_UNK_CYR: "unkcyr"
- * - LANG_UNK_ALPHA: "unkalpha"
- * - LANG_BASIC_ENG: "basic-eng"
- * - LANG_TRANSCR_IPA: "transcr-ipa"
- * If the language is missing in `ELanguage` or if it is `LANG_MAX`, the return value will be
- * `nullptr`.
- *
- * @param language Language enum
- * @return Language ISO 639-2/B alpha-3 code
- */
-const char* NameByLanguage(ELanguage language);
-
-/**
- * Converts language enum to corresponding ISO 639-1 alpha-2 code. For languages missing from the
- * ISO standard the conversions are:
- * - LANG_UNK: "mis"
- * - LANG_BASIC_RUS: "bas-ru"
- * - LANG_EMPTY: ""
- * - LANG_UNK_LAT: ""
- * - LANG_UNK_CYR: ""
- * - LANG_UNK_ALPHA: ""
- * - LANG_BASIC_ENG: "bas-en"
- * - LANG_TRANSCR_IPA: "tr-ipa"
- * If the language is missing in `ELanguage` or if it is `LANG_MAX`, the return value will be
- * `nullptr`.
- *
- * @param language Language enum
- * @return Language ISO 639-1 alpha-2 code
- */
-const char* IsoNameByLanguage(ELanguage language);
-
-/**
- * Converts language enum to corresponding human-readable language name. E.g. "Russian" for
- * `LANG_RUS` or "Basic Russian" for `LANG_BASIC_RUS`. If language is missing in `ELanguage` or if
- * it is a `LANG_MAX` then return value will be `nullptr`.
- *
- * @param language Language enum
- */
-const char* FullNameByLanguage(ELanguage language);
-
-/**
- * Same as `LanguageByNameStrict` but in case of failure will throw `yexception`.
- *
- * @see LanguageByNameStrict
- */
-ELanguage LanguageByNameOrDie(const TStringBuf& name);
-
-// True for the placeholder values (unknown / unrecognized-script / empty) that do not
-// name a concrete language.
-constexpr bool UnknownLanguage(const ELanguage language) noexcept {
-    switch (language) {
-        case LANG_UNK:
-        case LANG_UNK_LAT:
-        case LANG_UNK_CYR:
-        case LANG_UNK_ALPHA:
-        case LANG_EMPTY:
-            return true;
-        default:
-            return false;
-    }
-}
-
-EScript ScriptByLanguage(ELanguage language);
-EScript ScriptByGlyph(wchar32 glyph);
-
-namespace NCharsetInternal {
- void InitScriptData(ui8 data[], size_t len);
-}
-
-// True when ScriptByLanguage reports SCRIPT_LATIN for the language.
-inline bool LatinScript(ELanguage language) {
-    const EScript script = ScriptByLanguage(language);
-    return script == SCRIPT_LATIN;
-}
-
-// True when ScriptByLanguage reports SCRIPT_CYRILLIC for the language.
-inline bool CyrillicScript(ELanguage language) {
-    const EScript script = ScriptByLanguage(language);
-    return script == SCRIPT_CYRILLIC;
-}
diff --git a/library/cpp/langs/scripts.cpp b/library/cpp/langs/scripts.cpp
deleted file mode 100644
index 41cc91d3ce6..00000000000
--- a/library/cpp/langs/scripts.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-#include "scripts.h"
-
-#include <library/cpp/digest/lower_case/hash_ops.h>
-
-#include <util/generic/hash.h>
-#include <util/generic/singleton.h>
-#include <util/generic/strbuf.h>
-#include <util/generic/yexception.h>
-#include <util/system/defaults.h>
-
-#include <array>
-
-namespace {
- struct TScriptNameAndEnum { // One row of the script name table
- EScript Script; // Enum value the two names belong to
- const char* EnglishName; // English name, e.g. "Latin"
- const char* IsoName; // Four-letter code, e.g. "Latn"
- };
-
- const TScriptNameAndEnum ScriptNameAndEnum[] = {
- {SCRIPT_UNKNOWN, "Unknown", "Zzzz"},
- {SCRIPT_LATIN, "Latin", "Latn"},
- {SCRIPT_CYRILLIC, "Cyrillic", "Cyrl"},
-
- {SCRIPT_GREEK, "Greek", "Grek"},
- {SCRIPT_ARABIC, "Arabic", "Arab"},
- {SCRIPT_HEBREW, "Hebrew", "Hebr"},
- {SCRIPT_ARMENIAN, "Armenian", "Armn"},
- {SCRIPT_GEORGIAN, "Georgian", "Geor"},
-
- {SCRIPT_HAN, "Han", "Hans"}, // We use the more common Simplified variant (as opposed to Traditional 'Hant')
- {SCRIPT_KATAKANA, "Katakana", "Kana"},
- {SCRIPT_HIRAGANA, "Hiragana", "Hira"},
- {SCRIPT_HANGUL, "Hangul", "Hang"},
-
- {SCRIPT_DEVANAGARI, "Devanagari", "Deva"},
- {SCRIPT_BENGALI, "Bengali", "Beng"},
- {SCRIPT_GUJARATI, "Gujarati", "Gujr"},
- {SCRIPT_GURMUKHI, "Gurmukhi", "Guru"},
- {SCRIPT_KANNADA, "Kannada", "Knda"},
- {SCRIPT_MALAYALAM, "Malayalam", "Mlym"},
- {SCRIPT_ORIYA, "Oriya", "Orya"},
- {SCRIPT_TAMIL, "Tamil", "Taml"},
- {SCRIPT_TELUGU, "Telugu", "Telu"},
- {SCRIPT_THAANA, "Thaana", "Thaa"},
- {SCRIPT_SINHALA, "Sinhala", "Sinh"},
-
- {SCRIPT_MYANMAR, "Myanmar", "Mymr"},
- {SCRIPT_THAI, "Thai", "Thai"},
- {SCRIPT_LAO, "Lao", "Laoo"},
- {SCRIPT_KHMER, "Khmer", "Khmr"},
- {SCRIPT_TIBETAN, "Tibetan", "Tibt"},
- {SCRIPT_MONGOLIAN, "Mongolian", "Mong"},
-
- {SCRIPT_ETHIOPIC, "Ethiopic", "Ethi"},
- {SCRIPT_RUNIC, "Runic", "Runr"},
- {SCRIPT_COPTIC, "Coptic", "Copt"},
- {SCRIPT_SYRIAC, "Syriac", "Syrc"},
-
- {SCRIPT_OTHER, "Other", "Zyyy"},
- };
-
- static_assert(static_cast<size_t>(SCRIPT_MAX) == Y_ARRAY_SIZE(ScriptNameAndEnum), "Size doesn't match"); // Checks the row count only, not which scripts are listed
-
-    // Two-way mapping between EScript values and their names, filled once from
-    // the ScriptNameAndEnum table.
-    class TScriptsMap {
-    private:
-        // Marker stored in name slots that have not been filled.
-        static const char* const EMPTY_NAME;
-
-        // Name -> script index shared by ISO and English names; keys are hashed and
-        // compared through TCIOps (presumably case-insensitive - confirm in hash_ops.h).
-        using TNamesHash = THashMap<TStringBuf, EScript, TCIOps, TCIOps>;
-        TNamesHash Hash;
-
-        using TNamesArray = std::array<const char*, static_cast<size_t>(SCRIPT_MAX)>;
-        TNamesArray IsoNames;
-        TNamesArray FullNames;
-
-    private:
-        // Registers name -> script. A name seen before is left untouched and only
-        // checked (in assert-enabled builds) to map to the same script.
-        void AddNameToHash(const TStringBuf& name, EScript script) {
-            const auto known = Hash.find(name);
-            if (known == Hash.end()) {
-                Hash[name] = script;
-                return;
-            }
-
-            Y_ASSERT(known->second == script);
-        }
-
-        // Stores a non-empty name in `names` and makes it searchable through Hash.
-        void AddName(const char* name, EScript script, TNamesArray& names) {
-            const bool blank = name == nullptr || *name == '\0';
-            if (blank) {
-                return;
-            }
-
-            // Every slot is expected to be filled at most once.
-            Y_ASSERT(names[script] == EMPTY_NAME);
-            names[script] = name;
-
-            AddNameToHash(name, script);
-        }
-
-        // Bounds-checked read of a name table; nullptr for values outside it.
-        static const char* NameOrNull(const TNamesArray& names, EScript script) {
-            if (script >= 0 && static_cast<size_t>(script) < names.size()) {
-                return names[script];
-            }
-
-            return nullptr;
-        }
-
-    public:
-        TScriptsMap() {
-            IsoNames.fill(EMPTY_NAME);
-            FullNames.fill(EMPTY_NAME);
-
-            for (const TScriptNameAndEnum& entry : ScriptNameAndEnum) {
-                AddName(entry.IsoName, entry.Script, IsoNames);
-                AddName(entry.EnglishName, entry.Script, FullNames);
-            }
-        }
-
-    public:
-        // Looks `name` up in Hash; returns `def` for an empty or unregistered name.
-        inline EScript ScriptByName(const TStringBuf& name, EScript def) const {
-            if (name.empty()) {
-                return def;
-            }
-
-            const TNamesHash::const_iterator found = Hash.find(name);
-            return found == Hash.end() ? def : found->second;
-        }
-
-        // English name of `script`, or nullptr for an out-of-range value.
-        inline const char* FullNameByScript(EScript script) const {
-            return NameOrNull(FullNames, script);
-        }
-
-        // ISO code of `script`, or nullptr for an out-of-range value.
-        inline const char* IsoNameByScript(EScript script) const {
-            return NameOrNull(IsoNames, script);
-        }
-    };
-}
-
-const char* const TScriptsMap::EMPTY_NAME = ""; // Definition of the static member declared in TScriptsMap; marks unfilled name slots.
-
-// Public entry point: forwards to the TScriptsMap singleton.
-const char* FullNameByScript(EScript script) {
-    const TScriptsMap* const scripts = Singleton<TScriptsMap>();
-    return scripts->FullNameByScript(script);
-}
-
-// Public entry point: forwards to the TScriptsMap singleton.
-const char* IsoNameByScript(EScript script) {
-    const TScriptsMap* const scripts = Singleton<TScriptsMap>();
-    return scripts->IsoNameByScript(script);
-}
-
-// Name lookup that reports failure as SCRIPT_UNKNOWN.
-EScript ScriptByName(const TStringBuf& name) {
-    const TScriptsMap* const scripts = Singleton<TScriptsMap>();
-    return scripts->ScriptByName(name, SCRIPT_UNKNOWN);
-}
-
-// Like ScriptByName, but throws yexception when the lookup yields SCRIPT_UNKNOWN.
-// The registered names "Unknown" and "Zzzz" map to SCRIPT_UNKNOWN as well, so they
-// throw too.
-EScript ScriptByNameOrDie(const TStringBuf& name) {
-    const EScript script = ScriptByName(name);
-    if (script != SCRIPT_UNKNOWN) {
-        return script;
-    }
-
-    ythrow yexception() << "ScriptByNameOrDie: invalid script '" << name << "'";
-}
diff --git a/library/cpp/langs/scripts.h b/library/cpp/langs/scripts.h
deleted file mode 100644
index 4c47a33d2cb..00000000000
--- a/library/cpp/langs/scripts.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#pragma once
-
-#include <util/generic/strbuf.h>
-
-// Writing systems, a.k.a. scripts
-//
-enum EScript {
- SCRIPT_UNKNOWN = 0, // Also the failure result of ScriptByName
- SCRIPT_LATIN,
- SCRIPT_CYRILLIC,
-
- SCRIPT_GREEK,
- SCRIPT_ARABIC,
- SCRIPT_HEBREW,
- SCRIPT_ARMENIAN,
- SCRIPT_GEORGIAN,
-
- SCRIPT_HAN,
- SCRIPT_KATAKANA,
- SCRIPT_HIRAGANA,
- SCRIPT_HANGUL,
-
- SCRIPT_DEVANAGARI,
- SCRIPT_BENGALI,
- SCRIPT_GUJARATI,
- SCRIPT_GURMUKHI,
- SCRIPT_KANNADA,
- SCRIPT_MALAYALAM,
- SCRIPT_ORIYA,
- SCRIPT_TAMIL,
- SCRIPT_TELUGU,
- SCRIPT_THAANA,
- SCRIPT_SINHALA,
-
- SCRIPT_MYANMAR,
- SCRIPT_THAI,
- SCRIPT_LAO,
- SCRIPT_KHMER,
- SCRIPT_TIBETAN,
- SCRIPT_MONGOLIAN,
-
- SCRIPT_ETHIOPIC,
- SCRIPT_RUNIC,
- SCRIPT_COPTIC,
- SCRIPT_SYRIAC,
-
- SCRIPT_OTHER,
- SCRIPT_MAX // Not a script: count of the values above; sizes the name tables in scripts.cpp
-};
-
-// According to ISO 15924 codes. See https://en.wikipedia.org/wiki/ISO_15924
-//
-EScript ScriptByName(const TStringBuf& name);
-EScript ScriptByNameOrDie(const TStringBuf& name);
-const char* IsoNameByScript(EScript script);
-const char* FullNameByScript(EScript script);