aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/langs
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
committerqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
commit22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
treebffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/langs
parent332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
downloadydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/langs')
-rw-r--r--library/cpp/langs/README.md8
-rw-r--r--library/cpp/langs/generated/uniscripts.cpp458
-rw-r--r--library/cpp/langs/langs.cpp330
-rw-r--r--library/cpp/langs/langs.h229
-rw-r--r--library/cpp/langs/scripts.cpp158
-rw-r--r--library/cpp/langs/scripts.h56
6 files changed, 1239 insertions, 0 deletions
diff --git a/library/cpp/langs/README.md b/library/cpp/langs/README.md
new file mode 100644
index 0000000000..537ae31e1b
--- /dev/null
+++ b/library/cpp/langs/README.md
@@ -0,0 +1,8 @@
+Здесь описаны константы для [языков](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h) и [письменностей](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/scripts.h) (скриптов в терминах Unicode).
+
+В терминах этих констант языков работают [документная](https://a.yandex-team.ru/arc/trunk/arcadia/kernel/recshell/recshell.h) и [запросная](https://a.yandex-team.ru/arc/trunk/arcadia/dict/recognize/queryrec) распознавалки языка.
+
+Имеется [набор функций](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h?rev=r6909333#L142-214) для преобразования констант в двухбуквенный или трехбуквенный код и обратного получения константы по строке с учетом синонимов. Есть [функции](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h?rev=r6909333#L216-217) для определения письменности по языку и по символу.
+
+В списке констант представлены не все языки и письменности, а лишь те, которые представляли интерес для поиска Яндекса и машинного перевода.
+Имеется несколько псевдоязыков типа `LANG_UZB_CYR` или `LANG_KAZ_LAT`.
diff --git a/library/cpp/langs/generated/uniscripts.cpp b/library/cpp/langs/generated/uniscripts.cpp
new file mode 100644
index 0000000000..59cc6a70c2
--- /dev/null
+++ b/library/cpp/langs/generated/uniscripts.cpp
@@ -0,0 +1,458 @@
+// Generated from http://www.unicode.org/Public/UNIDATA/Scripts.txt
+// The best way to alter this file is to modify uniscripts.py
+#include <library/cpp/langs/langs.h>
+#include <util/system/yassert.h>
+
+#include <cstring>
+
+namespace NCharsetInternal {
+ struct TScriptRange { // contiguous run of code points that all belong to one script
+ EScript Script; // script written into the table for the whole run
+ wchar32 Start; // first code point of the run (inclusive)
+ wchar32 End; // last code point of the run (inclusive)
+ };
+
+ const TScriptRange ScriptRanges[] = { // inclusive code point ranges, grouped by script; consumed by InitScriptData
+ { SCRIPT_ETHIOPIC, 0x1200, 0x1248 },
+ { SCRIPT_ETHIOPIC, 0x124A, 0x124D },
+ { SCRIPT_ETHIOPIC, 0x1250, 0x1256 },
+ { SCRIPT_ETHIOPIC, 0x1258, 0x1258 },
+ { SCRIPT_ETHIOPIC, 0x125A, 0x125D },
+ { SCRIPT_ETHIOPIC, 0x1260, 0x1288 },
+ { SCRIPT_ETHIOPIC, 0x128A, 0x128D },
+ { SCRIPT_ETHIOPIC, 0x1290, 0x12B0 },
+ { SCRIPT_ETHIOPIC, 0x12B2, 0x12B5 },
+ { SCRIPT_ETHIOPIC, 0x12B8, 0x12BE },
+ { SCRIPT_ETHIOPIC, 0x12C0, 0x12C0 },
+ { SCRIPT_ETHIOPIC, 0x12C2, 0x12C5 },
+ { SCRIPT_ETHIOPIC, 0x12C8, 0x12D6 },
+ { SCRIPT_ETHIOPIC, 0x12D8, 0x1310 },
+ { SCRIPT_ETHIOPIC, 0x1312, 0x1315 },
+ { SCRIPT_ETHIOPIC, 0x1318, 0x135A },
+ { SCRIPT_ETHIOPIC, 0x135D, 0x137C },
+ { SCRIPT_ETHIOPIC, 0x1380, 0x1399 },
+ { SCRIPT_ETHIOPIC, 0x2D80, 0x2D96 },
+ { SCRIPT_ETHIOPIC, 0x2DA0, 0x2DA6 },
+ { SCRIPT_ETHIOPIC, 0x2DA8, 0x2DAE },
+ { SCRIPT_ETHIOPIC, 0x2DB0, 0x2DB6 },
+ { SCRIPT_ETHIOPIC, 0x2DB8, 0x2DBE },
+ { SCRIPT_ETHIOPIC, 0x2DC0, 0x2DC6 },
+ { SCRIPT_ETHIOPIC, 0x2DC8, 0x2DCE },
+ { SCRIPT_ETHIOPIC, 0x2DD0, 0x2DD6 },
+ { SCRIPT_ETHIOPIC, 0x2DD8, 0x2DDE },
+ { SCRIPT_ETHIOPIC, 0xAB01, 0xAB06 },
+ { SCRIPT_ETHIOPIC, 0xAB09, 0xAB0E },
+ { SCRIPT_ETHIOPIC, 0xAB11, 0xAB16 },
+ { SCRIPT_ETHIOPIC, 0xAB20, 0xAB26 },
+ { SCRIPT_ETHIOPIC, 0xAB28, 0xAB2E },
+ { SCRIPT_ARABIC, 0x600, 0x604 },
+ { SCRIPT_ARABIC, 0x606, 0x60B },
+ { SCRIPT_ARABIC, 0x60D, 0x61A },
+ { SCRIPT_ARABIC, 0x61E, 0x61E },
+ { SCRIPT_ARABIC, 0x620, 0x63F },
+ { SCRIPT_ARABIC, 0x641, 0x64A },
+ { SCRIPT_ARABIC, 0x656, 0x66F },
+ { SCRIPT_ARABIC, 0x671, 0x6DC },
+ { SCRIPT_ARABIC, 0x6DE, 0x6FF },
+ { SCRIPT_ARABIC, 0x750, 0x77F },
+ { SCRIPT_ARABIC, 0x8A0, 0x8B4 },
+ { SCRIPT_ARABIC, 0x8B6, 0x8BD },
+ { SCRIPT_ARABIC, 0x8D4, 0x8E1 },
+ { SCRIPT_ARABIC, 0x8E3, 0x8FF },
+ { SCRIPT_ARABIC, 0xFB50, 0xFBC1 },
+ { SCRIPT_ARABIC, 0xFBD3, 0xFD3D },
+ { SCRIPT_ARABIC, 0xFD50, 0xFD8F },
+ { SCRIPT_ARABIC, 0xFD92, 0xFDC7 },
+ { SCRIPT_ARABIC, 0xFDF0, 0xFDFD },
+ { SCRIPT_ARABIC, 0xFE70, 0xFE74 },
+ { SCRIPT_ARABIC, 0xFE76, 0xFEFC },
+ { SCRIPT_MONGOLIAN, 0x1800, 0x1801 },
+ { SCRIPT_MONGOLIAN, 0x1804, 0x1804 },
+ { SCRIPT_MONGOLIAN, 0x1806, 0x180E },
+ { SCRIPT_MONGOLIAN, 0x1810, 0x1819 },
+ { SCRIPT_MONGOLIAN, 0x1820, 0x1877 },
+ { SCRIPT_MONGOLIAN, 0x1880, 0x18AA },
+ { SCRIPT_TAMIL, 0xB82, 0xB83 },
+ { SCRIPT_TAMIL, 0xB85, 0xB8A },
+ { SCRIPT_TAMIL, 0xB8E, 0xB90 },
+ { SCRIPT_TAMIL, 0xB92, 0xB95 },
+ { SCRIPT_TAMIL, 0xB99, 0xB9A },
+ { SCRIPT_TAMIL, 0xB9C, 0xB9C },
+ { SCRIPT_TAMIL, 0xB9E, 0xB9F },
+ { SCRIPT_TAMIL, 0xBA3, 0xBA4 },
+ { SCRIPT_TAMIL, 0xBA8, 0xBAA },
+ { SCRIPT_TAMIL, 0xBAE, 0xBB9 },
+ { SCRIPT_TAMIL, 0xBBE, 0xBC2 },
+ { SCRIPT_TAMIL, 0xBC6, 0xBC8 },
+ { SCRIPT_TAMIL, 0xBCA, 0xBCD },
+ { SCRIPT_TAMIL, 0xBD0, 0xBD0 },
+ { SCRIPT_TAMIL, 0xBD7, 0xBD7 },
+ { SCRIPT_TAMIL, 0xBE6, 0xBFA },
+ { SCRIPT_GUJARATI, 0xA81, 0xA83 },
+ { SCRIPT_GUJARATI, 0xA85, 0xA8D },
+ { SCRIPT_GUJARATI, 0xA8F, 0xA91 },
+ { SCRIPT_GUJARATI, 0xA93, 0xAA8 },
+ { SCRIPT_GUJARATI, 0xAAA, 0xAB0 },
+ { SCRIPT_GUJARATI, 0xAB2, 0xAB3 },
+ { SCRIPT_GUJARATI, 0xAB5, 0xAB9 },
+ { SCRIPT_GUJARATI, 0xABC, 0xAC5 },
+ { SCRIPT_GUJARATI, 0xAC7, 0xAC9 },
+ { SCRIPT_GUJARATI, 0xACB, 0xACD },
+ { SCRIPT_GUJARATI, 0xAD0, 0xAD0 },
+ { SCRIPT_GUJARATI, 0xAE0, 0xAE3 },
+ { SCRIPT_GUJARATI, 0xAE6, 0xAF1 },
+ { SCRIPT_GUJARATI, 0xAF9, 0xAF9 },
+ { SCRIPT_MALAYALAM, 0xD01, 0xD03 },
+ { SCRIPT_MALAYALAM, 0xD05, 0xD0C },
+ { SCRIPT_MALAYALAM, 0xD0E, 0xD10 },
+ { SCRIPT_MALAYALAM, 0xD12, 0xD3A },
+ { SCRIPT_MALAYALAM, 0xD3D, 0xD44 },
+ { SCRIPT_MALAYALAM, 0xD46, 0xD48 },
+ { SCRIPT_MALAYALAM, 0xD4A, 0xD4F },
+ { SCRIPT_MALAYALAM, 0xD54, 0xD63 },
+ { SCRIPT_MALAYALAM, 0xD66, 0xD7F },
+ { SCRIPT_ARMENIAN, 0x531, 0x556 },
+ { SCRIPT_ARMENIAN, 0x559, 0x55F },
+ { SCRIPT_ARMENIAN, 0x561, 0x587 },
+ { SCRIPT_ARMENIAN, 0x58A, 0x58A },
+ { SCRIPT_ARMENIAN, 0x58D, 0x58F },
+ { SCRIPT_ARMENIAN, 0xFB13, 0xFB17 },
+ { SCRIPT_HANGUL, 0x1100, 0x11FF },
+ { SCRIPT_HANGUL, 0x302E, 0x302F },
+ { SCRIPT_HANGUL, 0x3131, 0x318E },
+ { SCRIPT_HANGUL, 0x3200, 0x321E },
+ { SCRIPT_HANGUL, 0x3260, 0x327E },
+ { SCRIPT_HANGUL, 0xA960, 0xA97C },
+ { SCRIPT_HANGUL, 0xAC00, 0xD7A3 },
+ { SCRIPT_HANGUL, 0xD7B0, 0xD7C6 },
+ { SCRIPT_HANGUL, 0xD7CB, 0xD7FB },
+ { SCRIPT_HANGUL, 0xFFA0, 0xFFBE },
+ { SCRIPT_HANGUL, 0xFFC2, 0xFFC7 },
+ { SCRIPT_HANGUL, 0xFFCA, 0xFFCF },
+ { SCRIPT_HANGUL, 0xFFD2, 0xFFD7 },
+ { SCRIPT_HANGUL, 0xFFDA, 0xFFDC },
+ { SCRIPT_GURMUKHI, 0xA01, 0xA03 },
+ { SCRIPT_GURMUKHI, 0xA05, 0xA0A },
+ { SCRIPT_GURMUKHI, 0xA0F, 0xA10 },
+ { SCRIPT_GURMUKHI, 0xA13, 0xA28 },
+ { SCRIPT_GURMUKHI, 0xA2A, 0xA30 },
+ { SCRIPT_GURMUKHI, 0xA32, 0xA33 },
+ { SCRIPT_GURMUKHI, 0xA35, 0xA36 },
+ { SCRIPT_GURMUKHI, 0xA38, 0xA39 },
+ { SCRIPT_GURMUKHI, 0xA3C, 0xA3C },
+ { SCRIPT_GURMUKHI, 0xA3E, 0xA42 },
+ { SCRIPT_GURMUKHI, 0xA47, 0xA48 },
+ { SCRIPT_GURMUKHI, 0xA4B, 0xA4D },
+ { SCRIPT_GURMUKHI, 0xA51, 0xA51 },
+ { SCRIPT_GURMUKHI, 0xA59, 0xA5C },
+ { SCRIPT_GURMUKHI, 0xA5E, 0xA5E },
+ { SCRIPT_GURMUKHI, 0xA66, 0xA75 },
+ { SCRIPT_CYRILLIC, 0x400, 0x484 },
+ { SCRIPT_CYRILLIC, 0x487, 0x52F },
+ { SCRIPT_CYRILLIC, 0x1C80, 0x1C88 },
+ { SCRIPT_CYRILLIC, 0x1D2B, 0x1D2B },
+ { SCRIPT_CYRILLIC, 0x1D78, 0x1D78 },
+ { SCRIPT_CYRILLIC, 0x2DE0, 0x2DFF },
+ { SCRIPT_CYRILLIC, 0xA640, 0xA69F },
+ { SCRIPT_CYRILLIC, 0xFE2E, 0xFE2F },
+ { SCRIPT_DEVANAGARI, 0x900, 0x950 },
+ { SCRIPT_DEVANAGARI, 0x953, 0x963 },
+ { SCRIPT_DEVANAGARI, 0x966, 0x97F },
+ { SCRIPT_DEVANAGARI, 0xA8E0, 0xA8FD },
+ { SCRIPT_HEBREW, 0x591, 0x5C7 },
+ { SCRIPT_HEBREW, 0x5D0, 0x5EA },
+ { SCRIPT_HEBREW, 0x5F0, 0x5F4 },
+ { SCRIPT_HEBREW, 0xFB1D, 0xFB36 },
+ { SCRIPT_HEBREW, 0xFB38, 0xFB3C },
+ { SCRIPT_HEBREW, 0xFB3E, 0xFB3E },
+ { SCRIPT_HEBREW, 0xFB40, 0xFB41 },
+ { SCRIPT_HEBREW, 0xFB43, 0xFB44 },
+ { SCRIPT_HEBREW, 0xFB46, 0xFB4F },
+ { SCRIPT_THAI, 0xE01, 0xE3A },
+ { SCRIPT_THAI, 0xE40, 0xE5B },
+ { SCRIPT_SYRIAC, 0x700, 0x70D },
+ { SCRIPT_SYRIAC, 0x70F, 0x74A },
+ { SCRIPT_SYRIAC, 0x74D, 0x74F },
+ { SCRIPT_KANNADA, 0xC80, 0xC83 },
+ { SCRIPT_KANNADA, 0xC85, 0xC8C },
+ { SCRIPT_KANNADA, 0xC8E, 0xC90 },
+ { SCRIPT_KANNADA, 0xC92, 0xCA8 },
+ { SCRIPT_KANNADA, 0xCAA, 0xCB3 },
+ { SCRIPT_KANNADA, 0xCB5, 0xCB9 },
+ { SCRIPT_KANNADA, 0xCBC, 0xCC4 },
+ { SCRIPT_KANNADA, 0xCC6, 0xCC8 },
+ { SCRIPT_KANNADA, 0xCCA, 0xCCD },
+ { SCRIPT_KANNADA, 0xCD5, 0xCD6 },
+ { SCRIPT_KANNADA, 0xCDE, 0xCDE },
+ { SCRIPT_KANNADA, 0xCE0, 0xCE3 },
+ { SCRIPT_KANNADA, 0xCE6, 0xCEF },
+ { SCRIPT_KANNADA, 0xCF1, 0xCF2 },
+ { SCRIPT_LAO, 0xE81, 0xE82 },
+ { SCRIPT_LAO, 0xE84, 0xE84 },
+ { SCRIPT_LAO, 0xE87, 0xE88 },
+ { SCRIPT_LAO, 0xE8A, 0xE8A },
+ { SCRIPT_LAO, 0xE8D, 0xE8D },
+ { SCRIPT_LAO, 0xE94, 0xE97 },
+ { SCRIPT_LAO, 0xE99, 0xE9F },
+ { SCRIPT_LAO, 0xEA1, 0xEA3 },
+ { SCRIPT_LAO, 0xEA5, 0xEA5 },
+ { SCRIPT_LAO, 0xEA7, 0xEA7 },
+ { SCRIPT_LAO, 0xEAA, 0xEAB },
+ { SCRIPT_LAO, 0xEAD, 0xEB9 },
+ { SCRIPT_LAO, 0xEBB, 0xEBD },
+ { SCRIPT_LAO, 0xEC0, 0xEC4 },
+ { SCRIPT_LAO, 0xEC6, 0xEC6 },
+ { SCRIPT_LAO, 0xEC8, 0xECD },
+ { SCRIPT_LAO, 0xED0, 0xED9 },
+ { SCRIPT_LAO, 0xEDC, 0xEDF },
+ { SCRIPT_TELUGU, 0xC00, 0xC03 },
+ { SCRIPT_TELUGU, 0xC05, 0xC0C },
+ { SCRIPT_TELUGU, 0xC0E, 0xC10 },
+ { SCRIPT_TELUGU, 0xC12, 0xC28 },
+ { SCRIPT_TELUGU, 0xC2A, 0xC39 },
+ { SCRIPT_TELUGU, 0xC3D, 0xC44 },
+ { SCRIPT_TELUGU, 0xC46, 0xC48 },
+ { SCRIPT_TELUGU, 0xC4A, 0xC4D },
+ { SCRIPT_TELUGU, 0xC55, 0xC56 },
+ { SCRIPT_TELUGU, 0xC58, 0xC5A },
+ { SCRIPT_TELUGU, 0xC60, 0xC63 },
+ { SCRIPT_TELUGU, 0xC66, 0xC6F },
+ { SCRIPT_TELUGU, 0xC78, 0xC7F },
+ { SCRIPT_KHMER, 0x1780, 0x17DD },
+ { SCRIPT_KHMER, 0x17E0, 0x17E9 },
+ { SCRIPT_KHMER, 0x17F0, 0x17F9 },
+ { SCRIPT_KHMER, 0x19E0, 0x19FF },
+ { SCRIPT_LATIN, 0x41, 0x5A },
+ { SCRIPT_LATIN, 0x61, 0x7A },
+ { SCRIPT_LATIN, 0xAA, 0xAA },
+ { SCRIPT_LATIN, 0xBA, 0xBA },
+ { SCRIPT_LATIN, 0xC0, 0xD6 },
+ { SCRIPT_LATIN, 0xD8, 0xF6 },
+ { SCRIPT_LATIN, 0xF8, 0x2B8 },
+ { SCRIPT_LATIN, 0x2E0, 0x2E4 },
+ { SCRIPT_LATIN, 0x1D00, 0x1D25 },
+ { SCRIPT_LATIN, 0x1D2C, 0x1D5C },
+ { SCRIPT_LATIN, 0x1D62, 0x1D65 },
+ { SCRIPT_LATIN, 0x1D6B, 0x1D77 },
+ { SCRIPT_LATIN, 0x1D79, 0x1DBE },
+ { SCRIPT_LATIN, 0x1E00, 0x1EFF },
+ { SCRIPT_LATIN, 0x2071, 0x2071 },
+ { SCRIPT_LATIN, 0x207F, 0x207F },
+ { SCRIPT_LATIN, 0x2090, 0x209C },
+ { SCRIPT_LATIN, 0x212A, 0x212B },
+ { SCRIPT_LATIN, 0x2132, 0x2132 },
+ { SCRIPT_LATIN, 0x214E, 0x214E },
+ { SCRIPT_LATIN, 0x2160, 0x2188 },
+ { SCRIPT_LATIN, 0x2C60, 0x2C7F },
+ { SCRIPT_LATIN, 0xA722, 0xA787 },
+ { SCRIPT_LATIN, 0xA78B, 0xA7AE },
+ { SCRIPT_LATIN, 0xA7B0, 0xA7B7 },
+ { SCRIPT_LATIN, 0xA7F7, 0xA7FF },
+ { SCRIPT_LATIN, 0xAB30, 0xAB5A },
+ { SCRIPT_LATIN, 0xAB5C, 0xAB64 },
+ { SCRIPT_LATIN, 0xFB00, 0xFB06 },
+ { SCRIPT_LATIN, 0xFF21, 0xFF3A },
+ { SCRIPT_LATIN, 0xFF41, 0xFF5A },
+ { SCRIPT_TIBETAN, 0xF00, 0xF47 },
+ { SCRIPT_TIBETAN, 0xF49, 0xF6C },
+ { SCRIPT_TIBETAN, 0xF71, 0xF97 },
+ { SCRIPT_TIBETAN, 0xF99, 0xFBC },
+ { SCRIPT_TIBETAN, 0xFBE, 0xFCC },
+ { SCRIPT_TIBETAN, 0xFCE, 0xFD4 },
+ { SCRIPT_TIBETAN, 0xFD9, 0xFDA },
+ { SCRIPT_MYANMAR, 0x1000, 0x109F },
+ { SCRIPT_MYANMAR, 0xA9E0, 0xA9FE },
+ { SCRIPT_MYANMAR, 0xAA60, 0xAA7F },
+ { SCRIPT_OTHER, 0x2EA, 0x2EB },
+ { SCRIPT_OTHER, 0x7C0, 0x7FA },
+ { SCRIPT_OTHER, 0x800, 0x82D },
+ { SCRIPT_OTHER, 0x830, 0x83E },
+ { SCRIPT_OTHER, 0x840, 0x85B },
+ { SCRIPT_OTHER, 0x85E, 0x85E },
+ { SCRIPT_OTHER, 0x13A0, 0x13F5 },
+ { SCRIPT_OTHER, 0x13F8, 0x13FD },
+ { SCRIPT_OTHER, 0x1400, 0x169C },
+ { SCRIPT_OTHER, 0x1700, 0x170C },
+ { SCRIPT_OTHER, 0x170E, 0x1714 },
+ { SCRIPT_OTHER, 0x1720, 0x1734 },
+ { SCRIPT_OTHER, 0x1740, 0x1753 },
+ { SCRIPT_OTHER, 0x1760, 0x176C },
+ { SCRIPT_OTHER, 0x176E, 0x1770 },
+ { SCRIPT_OTHER, 0x1772, 0x1773 },
+ { SCRIPT_OTHER, 0x18B0, 0x18F5 },
+ { SCRIPT_OTHER, 0x1900, 0x191E },
+ { SCRIPT_OTHER, 0x1920, 0x192B },
+ { SCRIPT_OTHER, 0x1930, 0x193B },
+ { SCRIPT_OTHER, 0x1940, 0x1940 },
+ { SCRIPT_OTHER, 0x1944, 0x196D },
+ { SCRIPT_OTHER, 0x1970, 0x1974 },
+ { SCRIPT_OTHER, 0x1980, 0x19AB },
+ { SCRIPT_OTHER, 0x19B0, 0x19C9 },
+ { SCRIPT_OTHER, 0x19D0, 0x19DA },
+ { SCRIPT_OTHER, 0x19DE, 0x19DF },
+ { SCRIPT_OTHER, 0x1A00, 0x1A1B },
+ { SCRIPT_OTHER, 0x1A1E, 0x1A5E },
+ { SCRIPT_OTHER, 0x1A60, 0x1A7C },
+ { SCRIPT_OTHER, 0x1A7F, 0x1A89 },
+ { SCRIPT_OTHER, 0x1A90, 0x1A99 },
+ { SCRIPT_OTHER, 0x1AA0, 0x1AAD },
+ { SCRIPT_OTHER, 0x1B00, 0x1B4B },
+ { SCRIPT_OTHER, 0x1B50, 0x1B7C },
+ { SCRIPT_OTHER, 0x1B80, 0x1BF3 },
+ { SCRIPT_OTHER, 0x1BFC, 0x1C37 },
+ { SCRIPT_OTHER, 0x1C3B, 0x1C49 },
+ { SCRIPT_OTHER, 0x1C4D, 0x1C7F },
+ { SCRIPT_OTHER, 0x1CC0, 0x1CC7 },
+ { SCRIPT_OTHER, 0x2800, 0x28FF },
+ { SCRIPT_OTHER, 0x2C00, 0x2C2E },
+ { SCRIPT_OTHER, 0x2C30, 0x2C5E },
+ { SCRIPT_OTHER, 0x2D30, 0x2D67 },
+ { SCRIPT_OTHER, 0x2D6F, 0x2D70 },
+ { SCRIPT_OTHER, 0x2D7F, 0x2D7F },
+ { SCRIPT_OTHER, 0x3105, 0x312D },
+ { SCRIPT_OTHER, 0x31A0, 0x31BA },
+ { SCRIPT_OTHER, 0xA000, 0xA48C },
+ { SCRIPT_OTHER, 0xA490, 0xA4C6 },
+ { SCRIPT_OTHER, 0xA4D0, 0xA62B },
+ { SCRIPT_OTHER, 0xA6A0, 0xA6F7 },
+ { SCRIPT_OTHER, 0xA800, 0xA82B },
+ { SCRIPT_OTHER, 0xA840, 0xA877 },
+ { SCRIPT_OTHER, 0xA880, 0xA8C5 },
+ { SCRIPT_OTHER, 0xA8CE, 0xA8D9 },
+ { SCRIPT_OTHER, 0xA900, 0xA92D },
+ { SCRIPT_OTHER, 0xA92F, 0xA953 },
+ { SCRIPT_OTHER, 0xA95F, 0xA95F },
+ { SCRIPT_OTHER, 0xA980, 0xA9CD },
+ { SCRIPT_OTHER, 0xA9D0, 0xA9D9 },
+ { SCRIPT_OTHER, 0xA9DE, 0xA9DF },
+ { SCRIPT_OTHER, 0xAA00, 0xAA36 },
+ { SCRIPT_OTHER, 0xAA40, 0xAA4D },
+ { SCRIPT_OTHER, 0xAA50, 0xAA59 },
+ { SCRIPT_OTHER, 0xAA5C, 0xAA5F },
+ { SCRIPT_OTHER, 0xAA80, 0xAAC2 },
+ { SCRIPT_OTHER, 0xAADB, 0xAAF6 },
+ { SCRIPT_OTHER, 0xAB70, 0xABED },
+ { SCRIPT_OTHER, 0xABF0, 0xABF9 },
+ { SCRIPT_HAN, 0x2E80, 0x2E99 },
+ { SCRIPT_HAN, 0x2E9B, 0x2EF3 },
+ { SCRIPT_HAN, 0x2F00, 0x2FD5 },
+ { SCRIPT_HAN, 0x3005, 0x3005 },
+ { SCRIPT_HAN, 0x3007, 0x3007 },
+ { SCRIPT_HAN, 0x3021, 0x3029 },
+ { SCRIPT_HAN, 0x3038, 0x303B },
+ { SCRIPT_HAN, 0x3400, 0x4DB5 },
+ { SCRIPT_HAN, 0x4E00, 0x9FD5 },
+ { SCRIPT_HAN, 0xF900, 0xFA6D },
+ { SCRIPT_HAN, 0xFA70, 0xFAD9 },
+ { SCRIPT_THAANA, 0x780, 0x7B1 },
+ { SCRIPT_HIRAGANA, 0x3041, 0x3096 },
+ { SCRIPT_HIRAGANA, 0x309D, 0x309F },
+ { SCRIPT_KATAKANA, 0x30A1, 0x30FA },
+ { SCRIPT_KATAKANA, 0x30FD, 0x30FF },
+ { SCRIPT_KATAKANA, 0x31F0, 0x31FF },
+ { SCRIPT_KATAKANA, 0x32D0, 0x32FE },
+ { SCRIPT_KATAKANA, 0x3300, 0x3357 },
+ { SCRIPT_KATAKANA, 0xFF66, 0xFF6F },
+ { SCRIPT_KATAKANA, 0xFF71, 0xFF9D },
+ { SCRIPT_ORIYA, 0xB01, 0xB03 },
+ { SCRIPT_ORIYA, 0xB05, 0xB0C },
+ { SCRIPT_ORIYA, 0xB0F, 0xB10 },
+ { SCRIPT_ORIYA, 0xB13, 0xB28 },
+ { SCRIPT_ORIYA, 0xB2A, 0xB30 },
+ { SCRIPT_ORIYA, 0xB32, 0xB33 },
+ { SCRIPT_ORIYA, 0xB35, 0xB39 },
+ { SCRIPT_ORIYA, 0xB3C, 0xB44 },
+ { SCRIPT_ORIYA, 0xB47, 0xB48 },
+ { SCRIPT_ORIYA, 0xB4B, 0xB4D },
+ { SCRIPT_ORIYA, 0xB56, 0xB57 },
+ { SCRIPT_ORIYA, 0xB5C, 0xB5D },
+ { SCRIPT_ORIYA, 0xB5F, 0xB63 },
+ { SCRIPT_ORIYA, 0xB66, 0xB77 },
+ { SCRIPT_BENGALI, 0x980, 0x983 },
+ { SCRIPT_BENGALI, 0x985, 0x98C },
+ { SCRIPT_BENGALI, 0x98F, 0x990 },
+ { SCRIPT_BENGALI, 0x993, 0x9A8 },
+ { SCRIPT_BENGALI, 0x9AA, 0x9B0 },
+ { SCRIPT_BENGALI, 0x9B2, 0x9B2 },
+ { SCRIPT_BENGALI, 0x9B6, 0x9B9 },
+ { SCRIPT_BENGALI, 0x9BC, 0x9C4 },
+ { SCRIPT_BENGALI, 0x9C7, 0x9C8 },
+ { SCRIPT_BENGALI, 0x9CB, 0x9CE },
+ { SCRIPT_BENGALI, 0x9D7, 0x9D7 },
+ { SCRIPT_BENGALI, 0x9DC, 0x9DD },
+ { SCRIPT_BENGALI, 0x9DF, 0x9E3 },
+ { SCRIPT_BENGALI, 0x9E6, 0x9FB },
+ { SCRIPT_RUNIC, 0x16A0, 0x16EA },
+ { SCRIPT_RUNIC, 0x16EE, 0x16F8 },
+ { SCRIPT_SINHALA, 0xD82, 0xD83 },
+ { SCRIPT_SINHALA, 0xD85, 0xD96 },
+ { SCRIPT_SINHALA, 0xD9A, 0xDB1 },
+ { SCRIPT_SINHALA, 0xDB3, 0xDBB },
+ { SCRIPT_SINHALA, 0xDBD, 0xDBD },
+ { SCRIPT_SINHALA, 0xDC0, 0xDC6 },
+ { SCRIPT_SINHALA, 0xDCA, 0xDCA },
+ { SCRIPT_SINHALA, 0xDCF, 0xDD4 },
+ { SCRIPT_SINHALA, 0xDD6, 0xDD6 },
+ { SCRIPT_SINHALA, 0xDD8, 0xDDF },
+ { SCRIPT_SINHALA, 0xDE6, 0xDEF },
+ { SCRIPT_SINHALA, 0xDF2, 0xDF4 },
+ { SCRIPT_COPTIC, 0x3E2, 0x3EF },
+ { SCRIPT_COPTIC, 0x2C80, 0x2CF3 },
+ { SCRIPT_COPTIC, 0x2CF9, 0x2CFF },
+ { SCRIPT_GEORGIAN, 0x10A0, 0x10C5 },
+ { SCRIPT_GEORGIAN, 0x10C7, 0x10C7 },
+ { SCRIPT_GEORGIAN, 0x10CD, 0x10CD },
+ { SCRIPT_GEORGIAN, 0x10D0, 0x10FA },
+ { SCRIPT_GEORGIAN, 0x10FC, 0x10FF },
+ { SCRIPT_GEORGIAN, 0x2D00, 0x2D25 },
+ { SCRIPT_GEORGIAN, 0x2D27, 0x2D27 },
+ { SCRIPT_GEORGIAN, 0x2D2D, 0x2D2D },
+ { SCRIPT_GREEK, 0x370, 0x373 },
+ { SCRIPT_GREEK, 0x375, 0x377 },
+ { SCRIPT_GREEK, 0x37A, 0x37D },
+ { SCRIPT_GREEK, 0x37F, 0x37F },
+ { SCRIPT_GREEK, 0x384, 0x384 },
+ { SCRIPT_GREEK, 0x386, 0x386 },
+ { SCRIPT_GREEK, 0x388, 0x38A },
+ { SCRIPT_GREEK, 0x38C, 0x38C },
+ { SCRIPT_GREEK, 0x38E, 0x3A1 },
+ { SCRIPT_GREEK, 0x3A3, 0x3E1 },
+ { SCRIPT_GREEK, 0x3F0, 0x3FF },
+ { SCRIPT_GREEK, 0x1D26, 0x1D2A },
+ { SCRIPT_GREEK, 0x1D5D, 0x1D61 },
+ { SCRIPT_GREEK, 0x1D66, 0x1D6A },
+ { SCRIPT_GREEK, 0x1DBF, 0x1DBF },
+ { SCRIPT_GREEK, 0x1F00, 0x1F15 },
+ { SCRIPT_GREEK, 0x1F18, 0x1F1D },
+ { SCRIPT_GREEK, 0x1F20, 0x1F45 },
+ { SCRIPT_GREEK, 0x1F48, 0x1F4D },
+ { SCRIPT_GREEK, 0x1F50, 0x1F57 },
+ { SCRIPT_GREEK, 0x1F59, 0x1F59 },
+ { SCRIPT_GREEK, 0x1F5B, 0x1F5B },
+ { SCRIPT_GREEK, 0x1F5D, 0x1F5D },
+ { SCRIPT_GREEK, 0x1F5F, 0x1F7D },
+ { SCRIPT_GREEK, 0x1F80, 0x1FB4 },
+ { SCRIPT_GREEK, 0x1FB6, 0x1FC4 },
+ { SCRIPT_GREEK, 0x1FC6, 0x1FD3 },
+ { SCRIPT_GREEK, 0x1FD6, 0x1FDB },
+ { SCRIPT_GREEK, 0x1FDD, 0x1FEF },
+ { SCRIPT_GREEK, 0x1FF2, 0x1FF4 },
+ { SCRIPT_GREEK, 0x1FF6, 0x1FFE },
+ { SCRIPT_GREEK, 0x2126, 0x2126 },
+ { SCRIPT_GREEK, 0xAB65, 0xAB65 },
+ };
+
+    // Fills `data[0 .. len)` with the script of every code point listed in
+    // ScriptRanges; positions not covered by any range are left at 0.
+    //
+    // data - output buffer holding at least `len` entries, indexed by code point
+    // len  - number of entries in `data`; ranges are clipped to [0, len)
+    void InitScriptData(ui8 data[], size_t len) {
+        memset(data, 0, len * sizeof(ui8));
+        for (const auto& range : ScriptRanges) {
+            Y_ASSERT(range.Start <= range.End);
+            Y_ASSERT((unsigned)range.Script < 0x100);
+
+            // A range lying entirely beyond the buffer contributes nothing.
+            const size_t first = range.Start;
+            if (first >= len)
+                continue;
+
+            // Clip to the last valid index (len - 1), not to len: the loop
+            // below is inclusive, so clipping to len would write past the end.
+            const size_t last = range.End < len ? static_cast<size_t>(range.End) : len - 1;
+            for (size_t j = first; j <= last; ++j) {
+                data[j] = static_cast<ui8>(range.Script);
+            }
+        }
+    }
+}
diff --git a/library/cpp/langs/langs.cpp b/library/cpp/langs/langs.cpp
new file mode 100644
index 0000000000..2c508e1602
--- /dev/null
+++ b/library/cpp/langs/langs.cpp
@@ -0,0 +1,330 @@
+#include "langs.h"
+
+#include <library/cpp/digest/lower_case/hash_ops.h>
+
+#include <util/generic/array_size.h>
+#include <util/generic/hash.h>
+#include <util/generic/singleton.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/yexception.h>
+#include <util/system/defaults.h>
+
+#include <array>
+#include <cctype>
+
+/*
+ * define language by ELanguage
+ */
+
+namespace {
+ struct TLanguageNameAndEnum { // one row of the LanguageNameAndEnum table
+ ELanguage Language; // enum value the row describes
+ EScript Script; // script recorded for the language by TLanguagesMap
+ const char* EnglishName; // full English name, e.g. "Russian"
+ const char* BiblioName; // bibliographic code, e.g. "rus"; this is what NameByLanguage() returns
+ const char* IsoName; // short code, e.g. "ru"; nullptr when the row has none
+ const char* Synonyms; // extra lookup names separated by spaces, commas or semicolons; may be nullptr
+ };
+
+ const TLanguageNameAndEnum LanguageNameAndEnum[] = { // columns: Language, Script, EnglishName, BiblioName, IsoName, Synonyms
+ {LANG_UNK, SCRIPT_OTHER, "Unknown", "unk", "mis", nullptr},
+ {LANG_RUS, SCRIPT_CYRILLIC, "Russian", "rus", "ru", "ru-RU"},
+ {LANG_ENG, SCRIPT_LATIN, "English", "eng", "en", "en-US, en-GB, en-CA, en-NZ, en-AU"},
+ {LANG_POL, SCRIPT_LATIN, "Polish", "pol", "pl", nullptr},
+ {LANG_HUN, SCRIPT_LATIN, "Hungarian", "hun", "hu", nullptr},
+ {LANG_UKR, SCRIPT_CYRILLIC, "Ukrainian", "ukr", "uk", "uk-UA"},
+ {LANG_GER, SCRIPT_LATIN, "German", "ger", "de", "deu"},
+ {LANG_FRE, SCRIPT_LATIN, "French", "fre", "fr", "fra, frn, fr-FR, fr-CA"},
+ {LANG_TAT, SCRIPT_CYRILLIC, "Tatar", "tat", "tt", nullptr},
+ {LANG_BEL, SCRIPT_CYRILLIC, "Belarusian", "bel", "be", "blr, Belorussian"},
+ {LANG_KAZ, SCRIPT_CYRILLIC, "Kazakh", "kaz", "kk", "kk-Cyrl"},
+ {LANG_ALB, SCRIPT_LATIN, "Albanian", "alb", "sq", nullptr},
+ {LANG_SPA, SCRIPT_LATIN, "Spanish", "spa", "es", nullptr},
+ {LANG_ITA, SCRIPT_LATIN, "Italian", "ita", "it", nullptr},
+ {LANG_ARM, SCRIPT_ARMENIAN, "Armenian", "arm", "hy", "hye"},
+ {LANG_DAN, SCRIPT_LATIN, "Danish", "dan", "da", nullptr},
+ {LANG_POR, SCRIPT_LATIN, "Portuguese", "por", "pt", nullptr},
+ {LANG_ICE, SCRIPT_LATIN, "Icelandic", "ice", "is", "isl"},
+ {LANG_SLO, SCRIPT_LATIN, "Slovak", "slo", "sk", "slk"},
+ {LANG_SLV, SCRIPT_LATIN, "Slovene", "slv", "sl", "Slovenian"},
+ {LANG_DUT, SCRIPT_LATIN, "Dutch", "dut", "nl", "nld"},
+ {LANG_BUL, SCRIPT_CYRILLIC, "Bulgarian", "bul", "bg", nullptr},
+ {LANG_CAT, SCRIPT_LATIN, "Catalan", "cat", "ca", nullptr},
+ {LANG_HRV, SCRIPT_LATIN, "Croatian", "hrv", "hr", "scr"},
+ {LANG_CZE, SCRIPT_LATIN, "Czech", "cze", "cs", "ces"},
+ {LANG_GRE, SCRIPT_GREEK, "Greek", "gre", "el", "ell"},
+ {LANG_HEB, SCRIPT_HEBREW, "Hebrew", "heb", "he", "iw"}, // 'iw' is old ISO-639 code
+ {LANG_NOR, SCRIPT_LATIN, "Norwegian", "nor", "no", nullptr},
+ {LANG_MAC, SCRIPT_CYRILLIC, "Macedonian", "mac", "mk", nullptr},
+ {LANG_SWE, SCRIPT_LATIN, "Swedish", "swe", "sv", nullptr},
+ {LANG_KOR, SCRIPT_HANGUL, "Korean", "kor", "ko", nullptr},
+ {LANG_LAT, SCRIPT_LATIN, "Latin", "lat", "la", nullptr},
+ {LANG_BASIC_RUS, SCRIPT_CYRILLIC, "Basic Russian", "basic-rus", "bas-ru", nullptr},
+ {LANG_BOS, SCRIPT_LATIN, "Bosnian", "bos", "bs", nullptr},
+ {LANG_MLT, SCRIPT_LATIN, "Maltese", "mlt", "mt", nullptr},
+
+ {LANG_EMPTY, SCRIPT_OTHER, "Empty", "empty", nullptr, nullptr},
+ {LANG_UNK_LAT, SCRIPT_LATIN, "Unknown Latin", "unklat", nullptr, nullptr},
+ {LANG_UNK_CYR, SCRIPT_CYRILLIC, "Unknown Cyrillic", "unkcyr", nullptr, nullptr},
+ {LANG_UNK_ALPHA, SCRIPT_OTHER, "Unknown Alpha", "unkalpha", nullptr, nullptr},
+
+ {LANG_FIN, SCRIPT_LATIN, "Finnish", "fin", "fi", nullptr},
+ {LANG_EST, SCRIPT_LATIN, "Estonian", "est", "et", nullptr},
+ {LANG_LAV, SCRIPT_LATIN, "Latvian", "lav", "lv", nullptr},
+ {LANG_LIT, SCRIPT_LATIN, "Lithuanian", "lit", "lt", nullptr},
+ {LANG_BAK, SCRIPT_CYRILLIC, "Bashkir", "bak", "ba", nullptr},
+ {LANG_TUR, SCRIPT_LATIN, "Turkish", "tur", "tr", nullptr},
+ {LANG_RUM, SCRIPT_LATIN, "Romanian", "rum", "ro", "ron"},
+ {LANG_MON, SCRIPT_CYRILLIC, "Mongolian", "mon", "mn", nullptr},
+ {LANG_UZB, SCRIPT_LATIN, "Uzbek", "uzb", "uz", "uz-Latn"},
+ {LANG_KIR, SCRIPT_CYRILLIC, "Kirghiz", "kir", "ky", "Kyrgyz"},
+ {LANG_TGK, SCRIPT_CYRILLIC, "Tajik", "tgk", "tg", nullptr},
+ {LANG_TUK, SCRIPT_LATIN, "Turkmen", "tuk", "tk", nullptr},
+ {LANG_SRP, SCRIPT_CYRILLIC, "Serbian", "srp", "sr", nullptr},
+ {LANG_AZE, SCRIPT_LATIN, "Azerbaijani", "aze", "az", "Azeri"},
+ {LANG_BASIC_ENG, SCRIPT_LATIN, "Basic English", "basic-eng", "bas-en", nullptr},
+ {LANG_GEO, SCRIPT_GEORGIAN, "Georgian", "geo", "ka", "kat"},
+ {LANG_ARA, SCRIPT_ARABIC, "Arabic", "ara", "ar", nullptr},
+ {LANG_PER, SCRIPT_ARABIC, "Persian", "per", "fa", "fas"},
+ {LANG_CHU, SCRIPT_CYRILLIC, "Church Slavonic", "chu", "cu", nullptr},
+ {LANG_CHI, SCRIPT_HAN, "Chinese", "chi", "zh", "zho"},
+ {LANG_JPN, SCRIPT_HIRAGANA, "Japanese", "jpn", "ja", nullptr},
+ {LANG_IND, SCRIPT_LATIN, "Indonesian", "ind", "id", "in"}, // 'in' is old ISO-639 code
+ {LANG_MAY, SCRIPT_LATIN, "Malay", "may", "ms", "msa"},
+ {LANG_THA, SCRIPT_THAI, "Thai", "tha", "th", nullptr},
+ {LANG_VIE, SCRIPT_LATIN, "Vietnamese", "vie", "vi", nullptr},
+ {LANG_GLE, SCRIPT_LATIN, "Irish", "gle", "ga", nullptr},
+ {LANG_TGL, SCRIPT_LATIN, "Tagalog", "tgl", "tl", "fil"},
+ {LANG_HIN, SCRIPT_DEVANAGARI, "Hindi", "hin", "hi", nullptr},
+ {LANG_AFR, SCRIPT_LATIN, "Afrikaans", "afr", "af", nullptr},
+ {LANG_URD, SCRIPT_ARABIC, "Urdu", "urd", "ur", nullptr},
+ {LANG_MYA, SCRIPT_MYANMAR, "Burmese", "mya", "my", nullptr},
+ {LANG_KHM, SCRIPT_KHMER, "Khmer", "khm", "km", nullptr},
+ {LANG_LAO, SCRIPT_LAO, "Lao", "lao", "lo", "Laotian, Laothian"},
+ {LANG_TAM, SCRIPT_TAMIL, "Tamil", "tam", "ta", nullptr},
+ {LANG_BEN, SCRIPT_BENGALI, "Bengali", "ben", "bn", nullptr},
+ {LANG_GUJ, SCRIPT_GUJARATI, "Gujarati", "guj", "gu", nullptr},
+ {LANG_KAN, SCRIPT_KANNADA, "Kannada", "kan", "kn", nullptr},
+ {LANG_PAN, SCRIPT_GURMUKHI, "Punjabi", "pan", "pa", nullptr},
+ {LANG_SIN, SCRIPT_SINHALA, "Sinhalese", "sin", "si", nullptr},
+ {LANG_SWA, SCRIPT_LATIN, "Swahili", "swa", "sw", nullptr},
+ {LANG_BAQ, SCRIPT_LATIN, "Basque", "baq", "eu", "eus"},
+ {LANG_WEL, SCRIPT_LATIN, "Welsh", "wel", "cy", "cym"},
+ {LANG_GLG, SCRIPT_LATIN, "Galician", "glg", "gl", nullptr},
+ {LANG_HAT, SCRIPT_LATIN, "Haitian Creole", "hat", "ht", "Haitian"},
+ {LANG_MLG, SCRIPT_LATIN, "Malagasy", "mlg", "mg", nullptr},
+ {LANG_CHV, SCRIPT_CYRILLIC, "Chuvash", "chv", "cv", nullptr},
+ {LANG_UDM, SCRIPT_CYRILLIC, "Udmurt", "udm", "udm", nullptr},
+ {LANG_KPV, SCRIPT_CYRILLIC, "Komi-Zyrian", "kpv", "kv", "Komi, kom"},
+ {LANG_MHR, SCRIPT_CYRILLIC, "Meadow Mari", "mhr", "mhr", "EasternMari, Mari, chm"},
+ {LANG_SJN, SCRIPT_LATIN, "Sindarin", "sjn", "sjn", nullptr},
+ {LANG_MRJ, SCRIPT_CYRILLIC, "Hill Mari", "mrj", "mrj", "WesternMari"},
+ {LANG_KOI, SCRIPT_CYRILLIC, "Komi-Permyak", "koi", "koi", nullptr},
+ {LANG_LTZ, SCRIPT_LATIN, "Luxembourgish", "ltz", "lb", "Luxemburgish"},
+ {LANG_GLA, SCRIPT_LATIN, "Scottish Gaelic", "gla", "gd", "Gaelic"},
+ {LANG_CEB, SCRIPT_LATIN, "Cebuano", "ceb", "ceb", "Bisaya, Binisaya, Visayan"},
+ {LANG_PUS, SCRIPT_ARABIC, "Pashto", "pus", "ps", nullptr},
+ {LANG_KMR, SCRIPT_LATIN, "Kurmanji", "kmr", "ku", "Kurdish"},
+ {LANG_AMH, SCRIPT_ETHIOPIC, "Amharic", "amh", "am", nullptr},
+ {LANG_ZUL, SCRIPT_LATIN, "Zulu", "zul", "zu", nullptr},
+ {LANG_IBO, SCRIPT_LATIN, "Igbo", "ibo", "ig", "Ibo"},
+ {LANG_YOR, SCRIPT_LATIN, "Yoruba", "yor", "yo", nullptr},
+ {LANG_COS, SCRIPT_LATIN, "Corsican", "cos", "co", nullptr},
+ {LANG_XHO, SCRIPT_LATIN, "Xhosa", "xho", "xh", nullptr},
+ {LANG_JAV, SCRIPT_LATIN, "Javanese", "jav", "jv", nullptr}, // Also SCRIPT_JAVANESE and SCRIPT_ARABIC
+ {LANG_NEP, SCRIPT_DEVANAGARI, "Nepali", "nep", "ne", nullptr},
+ {LANG_SND, SCRIPT_DEVANAGARI, "Sindhi", "snd", "sd", nullptr}, // Also SCRIPT_ARABIC and SCRIPT_GUJARATI
+ {LANG_SOM, SCRIPT_LATIN, "Somali", "som", "so", nullptr},
+ {LANG_EPO, SCRIPT_LATIN, "Esperanto", "epo", "eo", nullptr},
+ {LANG_TEL, SCRIPT_TELUGU, "Telugu", "tel", "te", nullptr},
+ {LANG_MAR, SCRIPT_DEVANAGARI, "Marathi", "mar", "mr", nullptr},
+ {LANG_HAU, SCRIPT_LATIN, "Hausa", "hau", "ha", nullptr},
+ {LANG_YID, SCRIPT_HEBREW, "Yiddish", "yid", "yi", nullptr},
+ {LANG_MAL, SCRIPT_MALAYALAM, "Malayalam", "mal", "ml", nullptr},
+ {LANG_MAO, SCRIPT_LATIN, "Maori", "mao", "mi", "mri"},
+ {LANG_SUN, SCRIPT_LATIN, "Sundanese", "sun", "su", nullptr},
+ {LANG_PAP, SCRIPT_LATIN, "Papiamento", "pap", "pap", nullptr},
+ {LANG_UZB_CYR, SCRIPT_CYRILLIC, "Cyrillic Uzbek", "uzbcyr", "uz-Cyrl", nullptr}, // https://tools.ietf.org/html/rfc5646
+ {LANG_TRANSCR_IPA, SCRIPT_LATIN, "International Phonetic Alphabet Transcription", "ipa", "tr-ipa", nullptr},
+ {LANG_EMJ, SCRIPT_LATIN, "Emoji", "emj", "emj", nullptr},
+ {LANG_UYG, SCRIPT_ARABIC, "Uyghur", "uig", "ug", nullptr},
+ {LANG_BRE, SCRIPT_LATIN, "Breton", "bre", "br", nullptr},
+ {LANG_SAH, SCRIPT_CYRILLIC, "Yakut", "sah", "sah", nullptr},
+ {LANG_KAZ_LAT, SCRIPT_LATIN, "Latin Kazakh", "kazlat", "kk-Latn", nullptr},
+ };
+
+ static_assert(static_cast<size_t>(LANG_MAX) == Y_ARRAY_SIZE(LanguageNameAndEnum), "Size doesn't match"); // the table must have exactly LANG_MAX rows
+
+    // Lookup tables built once from LanguageNameAndEnum: a name -> language
+    // hash plus per-language arrays of names and scripts indexed by ELanguage.
+    class TLanguagesMap {
+    private:
+        // Placeholder kept in the name arrays for languages that lack a given kind of name.
+        static const char* const EMPTY_NAME;
+
+        // Maps every registered name, code and synonym to its language.
+        // NOTE(review): TCIOps presumably makes hashing/equality case-insensitive - confirm.
+        using TNamesHash = THashMap<TStringBuf, ELanguage, TCIOps, TCIOps>;
+        TNamesHash Hash;
+
+        using TNamesArray = std::array<const char*, static_cast<size_t>(LANG_MAX)>;
+        TNamesArray BiblioNames;
+        TNamesArray IsoNames;
+        TNamesArray FullNames;
+
+        using TScripts = std::array<EScript, static_cast<size_t>(LANG_MAX)>;
+        TScripts Scripts;
+
+    private:
+        // True when `language` is a valid index into the per-language arrays.
+        static bool IsIndexable(ELanguage language) {
+            return language >= 0 && static_cast<size_t>(language) < static_cast<size_t>(LANG_MAX);
+        }
+
+        // Registers `name` for `language`. A name that is already present is
+        // left untouched; debug builds assert it maps to the same language.
+        void AddNameToHash(const TStringBuf& name, ELanguage language) {
+            const auto known = Hash.find(name);
+            if (known != Hash.end()) {
+                Y_ASSERT(known->second == language);
+                return;
+            }
+
+            Hash[name] = language;
+        }
+
+        // Stores `name` in `names[language]` and makes it searchable.
+        // Null and empty names are ignored.
+        void AddName(const char* name, ELanguage language, TNamesArray& names) {
+            if (name == nullptr || strlen(name) == 0)
+                return;
+
+            Y_ASSERT(names[language] == EMPTY_NAME);
+            names[language] = name;
+
+            AddNameToHash(name, language);
+        }
+
+        // Splits `syn` on spaces, commas and semicolons and registers every
+        // resulting token as a lookup name for `language`. `syn` may be null.
+        void AddSynonyms(const char* syn, ELanguage language) {
+            static const char* del = " ,;";
+            if (!syn)
+                return;
+            while (*syn) {
+                size_t len = strcspn(syn, del);
+                AddNameToHash(TStringBuf(syn, len), language);
+                syn += len;
+                // Skip the run of delimiters before the next token.
+                while (*syn && strchr(del, *syn))
+                    ++syn;
+            }
+        }
+
+    public:
+        TLanguagesMap() {
+            BiblioNames.fill(EMPTY_NAME);
+            IsoNames.fill(EMPTY_NAME);
+            FullNames.fill(EMPTY_NAME);
+            Scripts.fill(SCRIPT_OTHER);
+
+            for (const TLanguageNameAndEnum& val : LanguageNameAndEnum) {
+                const ELanguage language = val.Language;
+
+                AddName(val.BiblioName, language, BiblioNames);
+                AddName(val.IsoName, language, IsoNames);
+                AddName(val.EnglishName, language, FullNames);
+                AddSynonyms(val.Synonyms, language);
+
+                // Only the first non-default script seen for a language is kept.
+                if (Scripts[language] == SCRIPT_OTHER) {
+                    Scripts[language] = val.Script;
+                }
+            }
+        }
+
+    public:
+        // Returns the language registered under `name`, or `def` when the
+        // name is empty or unknown. If the full name is unknown, the part
+        // before the first '_' or '-' is tried as well.
+        inline ELanguage LanguageByName(const TStringBuf& name, ELanguage def) const {
+            if (!name)
+                return def;
+
+            TNamesHash::const_iterator i = Hash.find(name);
+            if (i == Hash.end()) {
+                // Try to extract the primary language code from constructions like "en-cockney" or "zh_Hant"
+                size_t dash_pos = name.find_first_of("_-");
+                if (dash_pos != TStringBuf::npos)
+                    i = Hash.find(name.substr(0, dash_pos));
+                if (i == Hash.end())
+                    return def;
+            }
+
+            return i->second;
+        }
+
+        // The three accessors below return nullptr for an out-of-range
+        // language and EMPTY_NAME ("") when no such name was registered.
+        inline const char* FullNameByLanguage(ELanguage language) const {
+            if (!IsIndexable(language))
+                return nullptr;
+
+            return FullNames[language];
+        }
+        inline const char* BiblioNameByLanguage(ELanguage language) const {
+            if (!IsIndexable(language))
+                return nullptr;
+
+            return BiblioNames[language];
+        }
+        inline const char* IsoNameByLanguage(ELanguage language) const {
+            if (!IsIndexable(language))
+                return nullptr;
+
+            return IsoNames[language];
+        }
+
+        // Script recorded for `language`. An out-of-range value (for example
+        // LANG_MAX) yields SCRIPT_UNKNOWN instead of indexing past the array.
+        inline EScript Script(ELanguage language) const {
+            if (!IsIndexable(language))
+                return SCRIPT_UNKNOWN;
+
+            return Scripts[language];
+        }
+    };
+}
+
+const char* const TLanguagesMap::EMPTY_NAME = ""; // placeholder the name arrays are filled with before real names are added
+
+// Full name of `language` as stored in the shared TLanguagesMap;
+// nullptr when `language` is outside the table.
+const char* FullNameByLanguage(ELanguage language) {
+    const TLanguagesMap* const languages = Singleton<TLanguagesMap>();
+    return languages->FullNameByLanguage(language);
+}
+// Bibliographic code of `language` (the BiblioName column);
+// nullptr when `language` is outside the table.
+const char* NameByLanguage(ELanguage language) {
+    const TLanguagesMap* const languages = Singleton<TLanguagesMap>();
+    return languages->BiblioNameByLanguage(language);
+}
+// ISO code of `language` (the IsoName column);
+// nullptr when `language` is outside the table.
+const char* IsoNameByLanguage(ELanguage language) {
+    const TLanguagesMap* const languages = Singleton<TLanguagesMap>();
+    return languages->IsoNameByLanguage(language);
+}
+
+// Looks `name` up among the registered names and synonyms;
+// returns LANG_MAX when nothing matches.
+ELanguage LanguageByNameStrict(const TStringBuf& name) {
+    const ELanguage notFound = LANG_MAX;
+    return Singleton<TLanguagesMap>()->LanguageByName(name, notFound);
+}
+
+// Same lookup as LanguageByNameStrict, but an unknown name raises
+// yexception instead of returning LANG_MAX.
+ELanguage LanguageByNameOrDie(const TStringBuf& name) {
+    const ELanguage found = LanguageByNameStrict(name);
+    if (found != LANG_MAX) {
+        return found;
+    }
+    ythrow yexception() << "LanguageByNameOrDie: invalid language '" << name << "'";
+}
+
+// Looks `name` up among the registered names and synonyms;
+// returns LANG_UNK when nothing matches.
+ELanguage LanguageByName(const TStringBuf& name) {
+    const ELanguage fallback = LANG_UNK;
+    return Singleton<TLanguagesMap>()->LanguageByName(name, fallback);
+}
+
+// Script recorded for `language` in the shared TLanguagesMap.
+EScript ScriptByLanguage(ELanguage language) {
+    const TLanguagesMap* const languages = Singleton<TLanguagesMap>();
+    return languages->Script(language);
+}
+
+namespace {
+    // Number of code points covered by the per-glyph script table.
+    const size_t MAX_GLYPH = 0x10000;
+
+    // Per-code-point script table, filled on construction by
+    // NCharsetInternal::InitScriptData.
+    class TScriptGlyphIndex {
+    public:
+        TScriptGlyphIndex() {
+            NCharsetInternal::InitScriptData(Data, MAX_GLYPH);
+        }
+
+        // Script stored for `glyph`; SCRIPT_UNKNOWN for code points at or
+        // above MAX_GLYPH, which the table does not cover.
+        EScript GetGlyphScript(wchar32 glyph) const {
+            const bool covered = glyph < MAX_GLYPH;
+            return covered ? static_cast<EScript>(Data[glyph]) : SCRIPT_UNKNOWN;
+        }
+
+    private:
+        ui8 Data[MAX_GLYPH];
+    };
+}
+
+// Script of a single code point, read from the shared TScriptGlyphIndex.
+EScript ScriptByGlyph(wchar32 glyph) {
+    const TScriptGlyphIndex* const index = HugeSingleton<TScriptGlyphIndex>();
+    return index->GetGlyphScript(glyph);
+}
+
+// Stream output for ELanguage: writes the value returned by NameByLanguage.
+template <>
+void Out<ELanguage>(IOutputStream& o, ELanguage lang) {
+    const char* const name = NameByLanguage(lang);
+    o << name;
+}
diff --git a/library/cpp/langs/langs.h b/library/cpp/langs/langs.h
new file mode 100644
index 0000000000..360ab6a832
--- /dev/null
+++ b/library/cpp/langs/langs.h
@@ -0,0 +1,229 @@
+#pragma once
+
+#include "scripts.h"
+
+#include <util/generic/strbuf.h>
+#include <util/system/defaults.h>
+
+#if defined(_win_)
+// LANG_LAO is #define in WinNT.h
+#undef LANG_LAO
+#endif
+
+// Language names are given according to ISO 639-2/B
+// Some languages are not present in ISO 639-2/B. Then ISO 639-3 is used.
+// http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+enum ELanguage { // Language identifiers; every entry except LANG_MAX carries an explicit numeric value.
+ LANG_UNK = 0, // Unknown
+ LANG_RUS = 1, // Russian
+ LANG_ENG = 2, // English
+ LANG_POL = 3, // Polish
+ LANG_HUN = 4, // Hungarian
+ LANG_UKR = 5, // Ukrainian
+ LANG_GER = 6, // German
+ LANG_FRE = 7, // French
+ LANG_TAT = 8, // Tatar
+ LANG_BEL = 9, // Belarusian
+ LANG_KAZ = 10, // Kazakh
+ LANG_ALB = 11, // Albanian
+ LANG_SPA = 12, // Spanish
+ LANG_ITA = 13, // Italian
+ LANG_ARM = 14, // Armenian
+ LANG_DAN = 15, // Danish
+ LANG_POR = 16, // Portuguese
+ LANG_ICE = 17, // Icelandic
+ LANG_SLO = 18, // Slovak
+ LANG_SLV = 19, // Slovene
+ LANG_DUT = 20, // Dutch (Netherlandish language)
+ LANG_BUL = 21, // Bulgarian
+ LANG_CAT = 22, // Catalan
+ LANG_HRV = 23, // Croatian
+ LANG_CZE = 24, // Czech
+ LANG_GRE = 25, // Greek
+ LANG_HEB = 26, // Hebrew
+ LANG_NOR = 27, // Norwegian
+ LANG_MAC = 28, // Macedonian
+ LANG_SWE = 29, // Swedish
+ LANG_KOR = 30, // Korean
+ LANG_LAT = 31, // Latin
+ LANG_BASIC_RUS = 32, // Simplified version of Russian (used at lemmer only)
+ LANG_BOS = 33, // Bosnian
+ LANG_MLT = 34, // Maltese
+ LANG_EMPTY = 35, // Indicate that document is empty
+ LANG_UNK_LAT = 36, // Any unrecognized latin language
+ LANG_UNK_CYR = 37, // Any unrecognized cyrillic language
+ LANG_UNK_ALPHA = 38, // Any unrecognized alphabetic language not fit into previous categories
+ LANG_FIN = 39, // Finnish
+ LANG_EST = 40, // Estonian
+ LANG_LAV = 41, // Latvian
+ LANG_LIT = 42, // Lithuanian
+ LANG_BAK = 43, // Bashkir
+ LANG_TUR = 44, // Turkish
+ LANG_RUM = 45, // Romanian (also Moldavian)
+ LANG_MON = 46, // Mongolian
+ LANG_UZB = 47, // Uzbek
+ LANG_KIR = 48, // Kirghiz
+ LANG_TGK = 49, // Tajik
+ LANG_TUK = 50, // Turkmen
+ LANG_SRP = 51, // Serbian
+ LANG_AZE = 52, // Azerbaijani
+ LANG_BASIC_ENG = 53, // Simplified version of English (used at lemmer only)
+ LANG_GEO = 54, // Georgian
+ LANG_ARA = 55, // Arabic
+ LANG_PER = 56, // Persian
+ LANG_CHU = 57, // Church Slavonic
+ LANG_CHI = 58, // Chinese
+ LANG_JPN = 59, // Japanese
+ LANG_IND = 60, // Indonesian
+ LANG_MAY = 61, // Malay
+ LANG_THA = 62, // Thai
+ LANG_VIE = 63, // Vietnamese
+ LANG_GLE = 64, // Irish (Gaelic)
+ LANG_TGL = 65, // Tagalog (Filipino)
+ LANG_HIN = 66, // Hindi
+ LANG_AFR = 67, // Afrikaans
+ LANG_URD = 68, // Urdu
+ LANG_MYA = 69, // Burmese
+ LANG_KHM = 70, // Khmer
+ LANG_LAO = 71, // Lao
+ LANG_TAM = 72, // Tamil
+ LANG_BEN = 73, // Bengali
+ LANG_GUJ = 74, // Gujarati
+ LANG_KAN = 75, // Kannada
+ LANG_PAN = 76, // Punjabi
+ LANG_SIN = 77, // Sinhalese
+ LANG_SWA = 78, // Swahili
+ LANG_BAQ = 79, // Basque
+ LANG_WEL = 80, // Welsh
+ LANG_GLG = 81, // Galician
+ LANG_HAT = 82, // Haitian Creole
+ LANG_MLG = 83, // Malagasy
+ LANG_CHV = 84, // Chuvash
+ LANG_UDM = 85, // Udmurt
+ LANG_KPV = 86, // Komi-Zyrian
+ LANG_MHR = 87, // Meadow Mari (Eastern Mari)
+ LANG_SJN = 88, // Sindarin
+ LANG_MRJ = 89, // Hill Mari (Western Mari)
+ LANG_KOI = 90, // Komi-Permyak
+ LANG_LTZ = 91, // Luxembourgish
+ LANG_GLA = 92, // Scottish Gaelic
+ LANG_CEB = 93, // Cebuano
+ LANG_PUS = 94, // Pashto
+ LANG_KMR = 95, // Kurmanji
+ LANG_AMH = 96, // Amharic
+ LANG_ZUL = 97, // Zulu
+ LANG_IBO = 98, // Igbo
+ LANG_YOR = 99, // Yoruba
+ LANG_COS = 100, // Corsican
+ LANG_XHO = 101, // Xhosa
+ LANG_JAV = 102, // Javanese
+ LANG_NEP = 103, // Nepali
+ LANG_SND = 104, // Sindhi
+ LANG_SOM = 105, // Somali
+ LANG_EPO = 106, // Esperanto
+ LANG_TEL = 107, // Telugu
+ LANG_MAR = 108, // Marathi
+ LANG_HAU = 109, // Hausa
+ LANG_YID = 110, // Yiddish
+ LANG_MAL = 111, // Malayalam
+ LANG_MAO = 112, // Maori
+ LANG_SUN = 113, // Sundanese
+ LANG_PAP = 114, // Papiamento
+ LANG_UZB_CYR = 115, // Cyrillic Uzbek
+ LANG_TRANSCR_IPA = 116, // International Phonetic Alphabet Transcription
+ LANG_EMJ = 117, // Emoji
+ LANG_UYG = 118, // Uyghur
+ LANG_BRE = 119, // Breton
+ LANG_SAH = 120, // Yakut
+ LANG_KAZ_LAT = 121, // Latin Kazakh
+ LANG_MAX // Sentinel one past the last language, not a language itself; LanguageByNameStrict returns it on failure
+};
+
+/**
+ * Converts string to corresponding enum. Will try to extract the primary language code from
+ * constructions like "en-cockney" or "zh_Hant". In case of failure will return `LANG_UNK`.
+ *
+ * @param name Language name
+ * @return Language enum
+ */
+ELanguage LanguageByName(const TStringBuf& name);
+
+/**
+ * Same as `LanguageByName`, but in case of failure will return `LANG_MAX`.
+ *
+ * @see LanguageByName
+ */
+ELanguage LanguageByNameStrict(const TStringBuf& name);
+
+/**
+ * Converts language enum to corresponding ISO 639-2/B alpha-3 code. For languages missing in ISO
+ * standard conversions are:
+ * - LANG_UNK: "unk"
+ * - LANG_BASIC_RUS: "basic-rus"
+ * - LANG_EMPTY: "empty"
+ * - LANG_UNK_LAT: "unklat"
+ * - LANG_UNK_CYR: "unkcyr"
+ * - LANG_UNK_ALPHA: "unkalpha"
+ * - LANG_BASIC_ENG: "basic-eng"
+ * - LANG_TRANSCR_IPA: "transcr-ipa"
+ * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be
+ * `nullptr`.
+ *
+ * @param language Language enum
+ * @return Language ISO 639-2/B alpha-3 code
+ */
+const char* NameByLanguage(ELanguage language);
+
+/**
+ * Converts language enum to corresponding ISO 639-1 alpha-2 code. For languages missing in ISO
+ * standard conversions are:
+ * - LANG_UNK: "mis"
+ * - LANG_BASIC_RUS: "bas-ru"
+ * - LANG_EMPTY: ""
+ * - LANG_UNK_LAT: ""
+ * - LANG_UNK_CYR: ""
+ * - LANG_UNK_ALPHA: ""
+ * - LANG_BASIC_ENG: "bas-en"
+ * - LANG_TRANSCR_IPA: "tr-ipa"
+ * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be
+ * `nullptr`.
+ *
+ * @param language Language enum
+ * @return Language ISO 639-1 alpha-2 code
+ */
+const char* IsoNameByLanguage(ELanguage language);
+
+/**
+ * Converts language enum to corresponding human-readable language name. E.g. "Russian" for
+ * `LANG_RUS` or "Basic Russian" for `LANG_BASIC_RUS`. If language is missing in `ELanguage` or if
+ * it is a `LANG_MAX` then return value will be `nullptr`.
+ *
+ * @param language Language enum
+ */
+const char* FullNameByLanguage(ELanguage language);
+
+/**
+ * Same as `LanguageByNameStrict` but in case of failure will throw `yexception`.
+ *
+ * @see LanguageByNameStrict
+ */
+ELanguage LanguageByNameOrDie(const TStringBuf& name);
+
+// True for the placeholder values LANG_UNK, LANG_UNK_LAT, LANG_UNK_CYR, LANG_UNK_ALPHA and
+// LANG_EMPTY; false for every other value.
+constexpr bool UnknownLanguage(const ELanguage language) noexcept {
+    switch (language) {
+        case LANG_UNK:
+        case LANG_UNK_LAT:
+        case LANG_UNK_CYR:
+        case LANG_UNK_ALPHA:
+        case LANG_EMPTY:
+            return true;
+        default:
+            return false;
+    }
+}
+
+EScript ScriptByLanguage(ELanguage language); // Script recorded for the language in the languages map (langs.cpp).
+EScript ScriptByGlyph(wchar32 glyph); // Script of a single glyph; glyphs >= 0x10000 yield SCRIPT_UNKNOWN.
+
+namespace NCharsetInternal {
+ void InitScriptData(ui8 data[], size_t len); // Used by langs.cpp to fill its per-glyph script table; defined outside this header, exact contract not visible here.
+}
+
+// True when ScriptByLanguage reports SCRIPT_LATIN for `language`.
+inline bool LatinScript(ELanguage language) {
+    const EScript script = ScriptByLanguage(language);
+    return script == SCRIPT_LATIN;
+}
+
+// True when ScriptByLanguage reports SCRIPT_CYRILLIC for `language`.
+inline bool CyrillicScript(ELanguage language) {
+    const EScript script = ScriptByLanguage(language);
+    return script == SCRIPT_CYRILLIC;
+}
diff --git a/library/cpp/langs/scripts.cpp b/library/cpp/langs/scripts.cpp
new file mode 100644
index 0000000000..41cc91d3ce
--- /dev/null
+++ b/library/cpp/langs/scripts.cpp
@@ -0,0 +1,158 @@
+#include "scripts.h"
+
+#include <library/cpp/digest/lower_case/hash_ops.h>
+
+#include <util/generic/hash.h>
+#include <util/generic/singleton.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/yexception.h>
+#include <util/system/defaults.h>
+
+#include <array>
+
+namespace {
+ struct TScriptNameAndEnum { // One row of the script table: an enum value plus its two names.
+ EScript Script;
+ const char* EnglishName; // English name, e.g. "Latin"
+ const char* IsoName; // Four-letter code, e.g. "Latn" (scripts.h refers to ISO 15924)
+ };
+
+ const TScriptNameAndEnum ScriptNameAndEnum[] = { // Source rows from which TScriptsMap builds its name tables and hash.
+ {SCRIPT_UNKNOWN, "Unknown", "Zzzz"},
+ {SCRIPT_LATIN, "Latin", "Latn"},
+ {SCRIPT_CYRILLIC, "Cyrillic", "Cyrl"},
+
+ {SCRIPT_GREEK, "Greek", "Grek"},
+ {SCRIPT_ARABIC, "Arabic", "Arab"},
+ {SCRIPT_HEBREW, "Hebrew", "Hebr"},
+ {SCRIPT_ARMENIAN, "Armenian", "Armn"},
+ {SCRIPT_GEORGIAN, "Georgian", "Geor"},
+
+ {SCRIPT_HAN, "Han", "Hans"}, // We use the more common Simplified variant (as opposed to Traditional 'Hant')
+ {SCRIPT_KATAKANA, "Katakana", "Kana"},
+ {SCRIPT_HIRAGANA, "Hiragana", "Hira"},
+ {SCRIPT_HANGUL, "Hangul", "Hang"},
+
+ {SCRIPT_DEVANAGARI, "Devanagari", "Deva"},
+ {SCRIPT_BENGALI, "Bengali", "Beng"},
+ {SCRIPT_GUJARATI, "Gujarati", "Gujr"},
+ {SCRIPT_GURMUKHI, "Gurmukhi", "Guru"},
+ {SCRIPT_KANNADA, "Kannada", "Knda"},
+ {SCRIPT_MALAYALAM, "Malayalam", "Mlym"},
+ {SCRIPT_ORIYA, "Oriya", "Orya"},
+ {SCRIPT_TAMIL, "Tamil", "Taml"},
+ {SCRIPT_TELUGU, "Telugu", "Telu"},
+ {SCRIPT_THAANA, "Thaana", "Thaa"},
+ {SCRIPT_SINHALA, "Sinhala", "Sinh"},
+
+ {SCRIPT_MYANMAR, "Myanmar", "Mymr"},
+ {SCRIPT_THAI, "Thai", "Thai"},
+ {SCRIPT_LAO, "Lao", "Laoo"},
+ {SCRIPT_KHMER, "Khmer", "Khmr"},
+ {SCRIPT_TIBETAN, "Tibetan", "Tibt"},
+ {SCRIPT_MONGOLIAN, "Mongolian", "Mong"},
+
+ {SCRIPT_ETHIOPIC, "Ethiopic", "Ethi"},
+ {SCRIPT_RUNIC, "Runic", "Runr"},
+ {SCRIPT_COPTIC, "Coptic", "Copt"},
+ {SCRIPT_SYRIAC, "Syriac", "Syrc"},
+
+ {SCRIPT_OTHER, "Other", "Zyyy"},
+ };
+
+ static_assert(static_cast<size_t>(SCRIPT_MAX) == Y_ARRAY_SIZE(ScriptNameAndEnum), "Size doesn't match"); // The table must have exactly SCRIPT_MAX rows.
+
+ // Lookup tables between EScript values and their names, built once from ScriptNameAndEnum.
+ class TScriptsMap {
+ private:
+     // Placeholder kept in name slots that have not been assigned a name.
+     static const char* const EMPTY_NAME;
+
+     // Name -> script index. TCIOps supplies both hashing and equality
+     // (presumably case-insensitive; confirm in library/cpp/digest/lower_case).
+     using TNamesHash = THashMap<TStringBuf, EScript, TCIOps, TCIOps>;
+     TNamesHash Hash;
+
+     // Script -> name tables, indexed by the enum value.
+     using TNamesArray = std::array<const char*, static_cast<size_t>(SCRIPT_MAX)>;
+     TNamesArray IsoNames;
+     TNamesArray FullNames;
+
+ private:
+     // Registers `name` in the hash unless it is already there; a repeated name must map to the same script.
+     void AddNameToHash(const TStringBuf& name, EScript script) {
+         const auto known = Hash.find(name);
+         if (known == Hash.end()) {
+             Hash[name] = script;
+             return;
+         }
+
+         Y_ASSERT(known->second == script);
+     }
+
+     // Stores a non-empty `name` in `names[script]` and makes it searchable; null or empty names are skipped.
+     void AddName(const char* name, EScript script, TNamesArray& names) {
+         if (!name || *name == '\0')
+             return;
+
+         Y_ASSERT(names[script] == EMPTY_NAME);
+         names[script] = name;
+
+         AddNameToHash(name, script);
+     }
+
+     // Shared range check for the name getters: nullptr outside [0, table size).
+     static const char* NameOrNull(const TNamesArray& names, EScript script) {
+         const bool inRange = script >= 0 && static_cast<size_t>(script) < names.size();
+         return inRange ? names[script] : nullptr;
+     }
+
+ public:
+     // Resets both tables to EMPTY_NAME, then registers the ISO and English name of every table row.
+     TScriptsMap() {
+         IsoNames.fill(EMPTY_NAME);
+         FullNames.fill(EMPTY_NAME);
+
+         for (const TScriptNameAndEnum& row : ScriptNameAndEnum) {
+             AddName(row.IsoName, row.Script, IsoNames);
+             AddName(row.EnglishName, row.Script, FullNames);
+         }
+     }
+
+ public:
+     // Script registered under `name`; `def` when `name` is empty or not registered.
+     inline EScript ScriptByName(const TStringBuf& name, EScript def) const {
+         if (!name)
+             return def;
+
+         const TNamesHash::const_iterator found = Hash.find(name);
+         return found != Hash.end() ? found->second : def;
+     }
+
+     // English name of `script`, or nullptr when `script` is out of range.
+     inline const char* FullNameByScript(EScript script) const {
+         return NameOrNull(FullNames, script);
+     }
+
+     // ISO name of `script`, or nullptr when `script` is out of range.
+     inline const char* IsoNameByScript(EScript script) const {
+         return NameOrNull(IsoNames, script);
+     }
+ };
+}
+
+const char* const TScriptsMap::EMPTY_NAME = ""; // Placeholder the TScriptsMap constructor fills its name tables with before names are assigned.
+
+// English name of `script` from the TScriptsMap singleton; nullptr when `script` is out of range.
+const char* FullNameByScript(EScript script) {
+    const TScriptsMap& scripts = *Singleton<TScriptsMap>();
+    return scripts.FullNameByScript(script);
+}
+
+// ISO name of `script` from the TScriptsMap singleton; nullptr when `script` is out of range.
+const char* IsoNameByScript(EScript script) {
+    const TScriptsMap& scripts = *Singleton<TScriptsMap>();
+    return scripts.IsoNameByScript(script);
+}
+
+// Script registered under `name`; SCRIPT_UNKNOWN when `name` is empty or not registered.
+EScript ScriptByName(const TStringBuf& name) {
+    const TScriptsMap& scripts = *Singleton<TScriptsMap>();
+    return scripts.ScriptByName(name, SCRIPT_UNKNOWN);
+}
+
+// Resolves `name` to a script, throwing yexception when the name is empty or not registered.
+// SCRIPT_MAX is used as the "not found" marker (the same approach LanguageByNameOrDie takes with
+// LANG_MAX), so the registered names that map to SCRIPT_UNKNOWN ("Unknown", "Zzzz") are returned
+// rather than being mistaken for a failed lookup.
+EScript ScriptByNameOrDie(const TStringBuf& name) {
+    const EScript result = Singleton<TScriptsMap>()->ScriptByName(name, SCRIPT_MAX);
+    if (result == SCRIPT_MAX) {
+        ythrow yexception() << "ScriptByNameOrDie: invalid script '" << name << "'";
+    }
+    return result;
+}
diff --git a/library/cpp/langs/scripts.h b/library/cpp/langs/scripts.h
new file mode 100644
index 0000000000..4c47a33d2c
--- /dev/null
+++ b/library/cpp/langs/scripts.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <util/generic/strbuf.h>
+
+// Writing systems, a.k.a. scripts
+//
+enum EScript { // Script identifiers; values are consecutive starting from SCRIPT_UNKNOWN = 0.
+ SCRIPT_UNKNOWN = 0,
+ SCRIPT_LATIN,
+ SCRIPT_CYRILLIC,
+
+ SCRIPT_GREEK,
+ SCRIPT_ARABIC,
+ SCRIPT_HEBREW,
+ SCRIPT_ARMENIAN,
+ SCRIPT_GEORGIAN,
+
+ SCRIPT_HAN,
+ SCRIPT_KATAKANA,
+ SCRIPT_HIRAGANA,
+ SCRIPT_HANGUL,
+
+ SCRIPT_DEVANAGARI,
+ SCRIPT_BENGALI,
+ SCRIPT_GUJARATI,
+ SCRIPT_GURMUKHI,
+ SCRIPT_KANNADA,
+ SCRIPT_MALAYALAM,
+ SCRIPT_ORIYA,
+ SCRIPT_TAMIL,
+ SCRIPT_TELUGU,
+ SCRIPT_THAANA,
+ SCRIPT_SINHALA,
+
+ SCRIPT_MYANMAR,
+ SCRIPT_THAI,
+ SCRIPT_LAO,
+ SCRIPT_KHMER,
+ SCRIPT_TIBETAN,
+ SCRIPT_MONGOLIAN,
+
+ SCRIPT_ETHIOPIC,
+ SCRIPT_RUNIC,
+ SCRIPT_COPTIC,
+ SCRIPT_SYRIAC,
+
+ SCRIPT_OTHER,
+ SCRIPT_MAX // Number of scripts, not a script itself; sizes the name tables in scripts.cpp.
+};
+
+// According to ISO 15924 codes. See https://en.wikipedia.org/wiki/ISO_15924
+//
+EScript ScriptByName(const TStringBuf& name); // Accepts an ISO code or an English name; SCRIPT_UNKNOWN if empty or not registered.
+EScript ScriptByNameOrDie(const TStringBuf& name); // Same lookup, but throws yexception when the name cannot be resolved.
+const char* IsoNameByScript(EScript script); // ISO code, e.g. "Latn"; nullptr if script is out of range.
+const char* FullNameByScript(EScript script); // English name, e.g. "Latin"; nullptr if script is out of range.