diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/langs | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/langs')
-rw-r--r-- | library/cpp/langs/README.md | 8 | ||||
-rw-r--r-- | library/cpp/langs/generated/uniscripts.cpp | 458 | ||||
-rw-r--r-- | library/cpp/langs/langs.cpp | 330 | ||||
-rw-r--r-- | library/cpp/langs/langs.h | 229 | ||||
-rw-r--r-- | library/cpp/langs/scripts.cpp | 158 | ||||
-rw-r--r-- | library/cpp/langs/scripts.h | 56 |
6 files changed, 1239 insertions, 0 deletions
diff --git a/library/cpp/langs/README.md b/library/cpp/langs/README.md new file mode 100644 index 0000000000..537ae31e1b --- /dev/null +++ b/library/cpp/langs/README.md @@ -0,0 +1,8 @@ +Здесь описаны константы для [языков](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h) и [письменностей](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/scripts.h) (скриптов в терминах Unicode). + +В терминах этих констант языков работают [документная](https://a.yandex-team.ru/arc/trunk/arcadia/kernel/recshell/recshell.h) и [запросная](https://a.yandex-team.ru/arc/trunk/arcadia/dict/recognize/queryrec) распознавалки языка. + +Имеется [набор функций](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h?rev=r6909333#L142-214) для преобразования констант в двухбуквенный или трехбуквенный код и обратного получения константы по строке с учетом синонимов. Есть [функции](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h?rev=r6909333#L216-217) для определения письменности по языку и по символу. + +В списке констант представлены не все языки и письменности, а лишь те, которые представляли интерес для поиска Яндекса и машинного перевода. +Имеется несколько псевдоязыков типа `LANG_UZB_CYR` или `LANG_KAZ_LAT`. 
diff --git a/library/cpp/langs/generated/uniscripts.cpp b/library/cpp/langs/generated/uniscripts.cpp new file mode 100644 index 0000000000..59cc6a70c2 --- /dev/null +++ b/library/cpp/langs/generated/uniscripts.cpp @@ -0,0 +1,458 @@ +// Generated from http://www.unicode.org/Public/UNIDATA/Scripts.txt +// The best way to alter this file is to modify uniscripts.py +#include <library/cpp/langs/langs.h> +#include <util/system/yassert.h> + +#include <cstring> + +namespace NCharsetInternal { + struct TScriptRange { + EScript Script; + wchar32 Start; + wchar32 End; + }; + + const TScriptRange ScriptRanges[] = { + { SCRIPT_ETHIOPIC, 0x1200, 0x1248 }, + { SCRIPT_ETHIOPIC, 0x124A, 0x124D }, + { SCRIPT_ETHIOPIC, 0x1250, 0x1256 }, + { SCRIPT_ETHIOPIC, 0x1258, 0x1258 }, + { SCRIPT_ETHIOPIC, 0x125A, 0x125D }, + { SCRIPT_ETHIOPIC, 0x1260, 0x1288 }, + { SCRIPT_ETHIOPIC, 0x128A, 0x128D }, + { SCRIPT_ETHIOPIC, 0x1290, 0x12B0 }, + { SCRIPT_ETHIOPIC, 0x12B2, 0x12B5 }, + { SCRIPT_ETHIOPIC, 0x12B8, 0x12BE }, + { SCRIPT_ETHIOPIC, 0x12C0, 0x12C0 }, + { SCRIPT_ETHIOPIC, 0x12C2, 0x12C5 }, + { SCRIPT_ETHIOPIC, 0x12C8, 0x12D6 }, + { SCRIPT_ETHIOPIC, 0x12D8, 0x1310 }, + { SCRIPT_ETHIOPIC, 0x1312, 0x1315 }, + { SCRIPT_ETHIOPIC, 0x1318, 0x135A }, + { SCRIPT_ETHIOPIC, 0x135D, 0x137C }, + { SCRIPT_ETHIOPIC, 0x1380, 0x1399 }, + { SCRIPT_ETHIOPIC, 0x2D80, 0x2D96 }, + { SCRIPT_ETHIOPIC, 0x2DA0, 0x2DA6 }, + { SCRIPT_ETHIOPIC, 0x2DA8, 0x2DAE }, + { SCRIPT_ETHIOPIC, 0x2DB0, 0x2DB6 }, + { SCRIPT_ETHIOPIC, 0x2DB8, 0x2DBE }, + { SCRIPT_ETHIOPIC, 0x2DC0, 0x2DC6 }, + { SCRIPT_ETHIOPIC, 0x2DC8, 0x2DCE }, + { SCRIPT_ETHIOPIC, 0x2DD0, 0x2DD6 }, + { SCRIPT_ETHIOPIC, 0x2DD8, 0x2DDE }, + { SCRIPT_ETHIOPIC, 0xAB01, 0xAB06 }, + { SCRIPT_ETHIOPIC, 0xAB09, 0xAB0E }, + { SCRIPT_ETHIOPIC, 0xAB11, 0xAB16 }, + { SCRIPT_ETHIOPIC, 0xAB20, 0xAB26 }, + { SCRIPT_ETHIOPIC, 0xAB28, 0xAB2E }, + { SCRIPT_ARABIC, 0x600, 0x604 }, + { SCRIPT_ARABIC, 0x606, 0x60B }, + { SCRIPT_ARABIC, 0x60D, 0x61A }, + { SCRIPT_ARABIC, 0x61E, 0x61E }, 
+ { SCRIPT_ARABIC, 0x620, 0x63F }, + { SCRIPT_ARABIC, 0x641, 0x64A }, + { SCRIPT_ARABIC, 0x656, 0x66F }, + { SCRIPT_ARABIC, 0x671, 0x6DC }, + { SCRIPT_ARABIC, 0x6DE, 0x6FF }, + { SCRIPT_ARABIC, 0x750, 0x77F }, + { SCRIPT_ARABIC, 0x8A0, 0x8B4 }, + { SCRIPT_ARABIC, 0x8B6, 0x8BD }, + { SCRIPT_ARABIC, 0x8D4, 0x8E1 }, + { SCRIPT_ARABIC, 0x8E3, 0x8FF }, + { SCRIPT_ARABIC, 0xFB50, 0xFBC1 }, + { SCRIPT_ARABIC, 0xFBD3, 0xFD3D }, + { SCRIPT_ARABIC, 0xFD50, 0xFD8F }, + { SCRIPT_ARABIC, 0xFD92, 0xFDC7 }, + { SCRIPT_ARABIC, 0xFDF0, 0xFDFD }, + { SCRIPT_ARABIC, 0xFE70, 0xFE74 }, + { SCRIPT_ARABIC, 0xFE76, 0xFEFC }, + { SCRIPT_MONGOLIAN, 0x1800, 0x1801 }, + { SCRIPT_MONGOLIAN, 0x1804, 0x1804 }, + { SCRIPT_MONGOLIAN, 0x1806, 0x180E }, + { SCRIPT_MONGOLIAN, 0x1810, 0x1819 }, + { SCRIPT_MONGOLIAN, 0x1820, 0x1877 }, + { SCRIPT_MONGOLIAN, 0x1880, 0x18AA }, + { SCRIPT_TAMIL, 0xB82, 0xB83 }, + { SCRIPT_TAMIL, 0xB85, 0xB8A }, + { SCRIPT_TAMIL, 0xB8E, 0xB90 }, + { SCRIPT_TAMIL, 0xB92, 0xB95 }, + { SCRIPT_TAMIL, 0xB99, 0xB9A }, + { SCRIPT_TAMIL, 0xB9C, 0xB9C }, + { SCRIPT_TAMIL, 0xB9E, 0xB9F }, + { SCRIPT_TAMIL, 0xBA3, 0xBA4 }, + { SCRIPT_TAMIL, 0xBA8, 0xBAA }, + { SCRIPT_TAMIL, 0xBAE, 0xBB9 }, + { SCRIPT_TAMIL, 0xBBE, 0xBC2 }, + { SCRIPT_TAMIL, 0xBC6, 0xBC8 }, + { SCRIPT_TAMIL, 0xBCA, 0xBCD }, + { SCRIPT_TAMIL, 0xBD0, 0xBD0 }, + { SCRIPT_TAMIL, 0xBD7, 0xBD7 }, + { SCRIPT_TAMIL, 0xBE6, 0xBFA }, + { SCRIPT_GUJARATI, 0xA81, 0xA83 }, + { SCRIPT_GUJARATI, 0xA85, 0xA8D }, + { SCRIPT_GUJARATI, 0xA8F, 0xA91 }, + { SCRIPT_GUJARATI, 0xA93, 0xAA8 }, + { SCRIPT_GUJARATI, 0xAAA, 0xAB0 }, + { SCRIPT_GUJARATI, 0xAB2, 0xAB3 }, + { SCRIPT_GUJARATI, 0xAB5, 0xAB9 }, + { SCRIPT_GUJARATI, 0xABC, 0xAC5 }, + { SCRIPT_GUJARATI, 0xAC7, 0xAC9 }, + { SCRIPT_GUJARATI, 0xACB, 0xACD }, + { SCRIPT_GUJARATI, 0xAD0, 0xAD0 }, + { SCRIPT_GUJARATI, 0xAE0, 0xAE3 }, + { SCRIPT_GUJARATI, 0xAE6, 0xAF1 }, + { SCRIPT_GUJARATI, 0xAF9, 0xAF9 }, + { SCRIPT_MALAYALAM, 0xD01, 0xD03 }, + { SCRIPT_MALAYALAM, 0xD05, 0xD0C }, + { 
SCRIPT_MALAYALAM, 0xD0E, 0xD10 }, + { SCRIPT_MALAYALAM, 0xD12, 0xD3A }, + { SCRIPT_MALAYALAM, 0xD3D, 0xD44 }, + { SCRIPT_MALAYALAM, 0xD46, 0xD48 }, + { SCRIPT_MALAYALAM, 0xD4A, 0xD4F }, + { SCRIPT_MALAYALAM, 0xD54, 0xD63 }, + { SCRIPT_MALAYALAM, 0xD66, 0xD7F }, + { SCRIPT_ARMENIAN, 0x531, 0x556 }, + { SCRIPT_ARMENIAN, 0x559, 0x55F }, + { SCRIPT_ARMENIAN, 0x561, 0x587 }, + { SCRIPT_ARMENIAN, 0x58A, 0x58A }, + { SCRIPT_ARMENIAN, 0x58D, 0x58F }, + { SCRIPT_ARMENIAN, 0xFB13, 0xFB17 }, + { SCRIPT_HANGUL, 0x1100, 0x11FF }, + { SCRIPT_HANGUL, 0x302E, 0x302F }, + { SCRIPT_HANGUL, 0x3131, 0x318E }, + { SCRIPT_HANGUL, 0x3200, 0x321E }, + { SCRIPT_HANGUL, 0x3260, 0x327E }, + { SCRIPT_HANGUL, 0xA960, 0xA97C }, + { SCRIPT_HANGUL, 0xAC00, 0xD7A3 }, + { SCRIPT_HANGUL, 0xD7B0, 0xD7C6 }, + { SCRIPT_HANGUL, 0xD7CB, 0xD7FB }, + { SCRIPT_HANGUL, 0xFFA0, 0xFFBE }, + { SCRIPT_HANGUL, 0xFFC2, 0xFFC7 }, + { SCRIPT_HANGUL, 0xFFCA, 0xFFCF }, + { SCRIPT_HANGUL, 0xFFD2, 0xFFD7 }, + { SCRIPT_HANGUL, 0xFFDA, 0xFFDC }, + { SCRIPT_GURMUKHI, 0xA01, 0xA03 }, + { SCRIPT_GURMUKHI, 0xA05, 0xA0A }, + { SCRIPT_GURMUKHI, 0xA0F, 0xA10 }, + { SCRIPT_GURMUKHI, 0xA13, 0xA28 }, + { SCRIPT_GURMUKHI, 0xA2A, 0xA30 }, + { SCRIPT_GURMUKHI, 0xA32, 0xA33 }, + { SCRIPT_GURMUKHI, 0xA35, 0xA36 }, + { SCRIPT_GURMUKHI, 0xA38, 0xA39 }, + { SCRIPT_GURMUKHI, 0xA3C, 0xA3C }, + { SCRIPT_GURMUKHI, 0xA3E, 0xA42 }, + { SCRIPT_GURMUKHI, 0xA47, 0xA48 }, + { SCRIPT_GURMUKHI, 0xA4B, 0xA4D }, + { SCRIPT_GURMUKHI, 0xA51, 0xA51 }, + { SCRIPT_GURMUKHI, 0xA59, 0xA5C }, + { SCRIPT_GURMUKHI, 0xA5E, 0xA5E }, + { SCRIPT_GURMUKHI, 0xA66, 0xA75 }, + { SCRIPT_CYRILLIC, 0x400, 0x484 }, + { SCRIPT_CYRILLIC, 0x487, 0x52F }, + { SCRIPT_CYRILLIC, 0x1C80, 0x1C88 }, + { SCRIPT_CYRILLIC, 0x1D2B, 0x1D2B }, + { SCRIPT_CYRILLIC, 0x1D78, 0x1D78 }, + { SCRIPT_CYRILLIC, 0x2DE0, 0x2DFF }, + { SCRIPT_CYRILLIC, 0xA640, 0xA69F }, + { SCRIPT_CYRILLIC, 0xFE2E, 0xFE2F }, + { SCRIPT_DEVANAGARI, 0x900, 0x950 }, + { SCRIPT_DEVANAGARI, 0x953, 0x963 }, + { 
SCRIPT_DEVANAGARI, 0x966, 0x97F }, + { SCRIPT_DEVANAGARI, 0xA8E0, 0xA8FD }, + { SCRIPT_HEBREW, 0x591, 0x5C7 }, + { SCRIPT_HEBREW, 0x5D0, 0x5EA }, + { SCRIPT_HEBREW, 0x5F0, 0x5F4 }, + { SCRIPT_HEBREW, 0xFB1D, 0xFB36 }, + { SCRIPT_HEBREW, 0xFB38, 0xFB3C }, + { SCRIPT_HEBREW, 0xFB3E, 0xFB3E }, + { SCRIPT_HEBREW, 0xFB40, 0xFB41 }, + { SCRIPT_HEBREW, 0xFB43, 0xFB44 }, + { SCRIPT_HEBREW, 0xFB46, 0xFB4F }, + { SCRIPT_THAI, 0xE01, 0xE3A }, + { SCRIPT_THAI, 0xE40, 0xE5B }, + { SCRIPT_SYRIAC, 0x700, 0x70D }, + { SCRIPT_SYRIAC, 0x70F, 0x74A }, + { SCRIPT_SYRIAC, 0x74D, 0x74F }, + { SCRIPT_KANNADA, 0xC80, 0xC83 }, + { SCRIPT_KANNADA, 0xC85, 0xC8C }, + { SCRIPT_KANNADA, 0xC8E, 0xC90 }, + { SCRIPT_KANNADA, 0xC92, 0xCA8 }, + { SCRIPT_KANNADA, 0xCAA, 0xCB3 }, + { SCRIPT_KANNADA, 0xCB5, 0xCB9 }, + { SCRIPT_KANNADA, 0xCBC, 0xCC4 }, + { SCRIPT_KANNADA, 0xCC6, 0xCC8 }, + { SCRIPT_KANNADA, 0xCCA, 0xCCD }, + { SCRIPT_KANNADA, 0xCD5, 0xCD6 }, + { SCRIPT_KANNADA, 0xCDE, 0xCDE }, + { SCRIPT_KANNADA, 0xCE0, 0xCE3 }, + { SCRIPT_KANNADA, 0xCE6, 0xCEF }, + { SCRIPT_KANNADA, 0xCF1, 0xCF2 }, + { SCRIPT_LAO, 0xE81, 0xE82 }, + { SCRIPT_LAO, 0xE84, 0xE84 }, + { SCRIPT_LAO, 0xE87, 0xE88 }, + { SCRIPT_LAO, 0xE8A, 0xE8A }, + { SCRIPT_LAO, 0xE8D, 0xE8D }, + { SCRIPT_LAO, 0xE94, 0xE97 }, + { SCRIPT_LAO, 0xE99, 0xE9F }, + { SCRIPT_LAO, 0xEA1, 0xEA3 }, + { SCRIPT_LAO, 0xEA5, 0xEA5 }, + { SCRIPT_LAO, 0xEA7, 0xEA7 }, + { SCRIPT_LAO, 0xEAA, 0xEAB }, + { SCRIPT_LAO, 0xEAD, 0xEB9 }, + { SCRIPT_LAO, 0xEBB, 0xEBD }, + { SCRIPT_LAO, 0xEC0, 0xEC4 }, + { SCRIPT_LAO, 0xEC6, 0xEC6 }, + { SCRIPT_LAO, 0xEC8, 0xECD }, + { SCRIPT_LAO, 0xED0, 0xED9 }, + { SCRIPT_LAO, 0xEDC, 0xEDF }, + { SCRIPT_TELUGU, 0xC00, 0xC03 }, + { SCRIPT_TELUGU, 0xC05, 0xC0C }, + { SCRIPT_TELUGU, 0xC0E, 0xC10 }, + { SCRIPT_TELUGU, 0xC12, 0xC28 }, + { SCRIPT_TELUGU, 0xC2A, 0xC39 }, + { SCRIPT_TELUGU, 0xC3D, 0xC44 }, + { SCRIPT_TELUGU, 0xC46, 0xC48 }, + { SCRIPT_TELUGU, 0xC4A, 0xC4D }, + { SCRIPT_TELUGU, 0xC55, 0xC56 }, + { SCRIPT_TELUGU, 0xC58, 
0xC5A }, + { SCRIPT_TELUGU, 0xC60, 0xC63 }, + { SCRIPT_TELUGU, 0xC66, 0xC6F }, + { SCRIPT_TELUGU, 0xC78, 0xC7F }, + { SCRIPT_KHMER, 0x1780, 0x17DD }, + { SCRIPT_KHMER, 0x17E0, 0x17E9 }, + { SCRIPT_KHMER, 0x17F0, 0x17F9 }, + { SCRIPT_KHMER, 0x19E0, 0x19FF }, + { SCRIPT_LATIN, 0x41, 0x5A }, + { SCRIPT_LATIN, 0x61, 0x7A }, + { SCRIPT_LATIN, 0xAA, 0xAA }, + { SCRIPT_LATIN, 0xBA, 0xBA }, + { SCRIPT_LATIN, 0xC0, 0xD6 }, + { SCRIPT_LATIN, 0xD8, 0xF6 }, + { SCRIPT_LATIN, 0xF8, 0x2B8 }, + { SCRIPT_LATIN, 0x2E0, 0x2E4 }, + { SCRIPT_LATIN, 0x1D00, 0x1D25 }, + { SCRIPT_LATIN, 0x1D2C, 0x1D5C }, + { SCRIPT_LATIN, 0x1D62, 0x1D65 }, + { SCRIPT_LATIN, 0x1D6B, 0x1D77 }, + { SCRIPT_LATIN, 0x1D79, 0x1DBE }, + { SCRIPT_LATIN, 0x1E00, 0x1EFF }, + { SCRIPT_LATIN, 0x2071, 0x2071 }, + { SCRIPT_LATIN, 0x207F, 0x207F }, + { SCRIPT_LATIN, 0x2090, 0x209C }, + { SCRIPT_LATIN, 0x212A, 0x212B }, + { SCRIPT_LATIN, 0x2132, 0x2132 }, + { SCRIPT_LATIN, 0x214E, 0x214E }, + { SCRIPT_LATIN, 0x2160, 0x2188 }, + { SCRIPT_LATIN, 0x2C60, 0x2C7F }, + { SCRIPT_LATIN, 0xA722, 0xA787 }, + { SCRIPT_LATIN, 0xA78B, 0xA7AE }, + { SCRIPT_LATIN, 0xA7B0, 0xA7B7 }, + { SCRIPT_LATIN, 0xA7F7, 0xA7FF }, + { SCRIPT_LATIN, 0xAB30, 0xAB5A }, + { SCRIPT_LATIN, 0xAB5C, 0xAB64 }, + { SCRIPT_LATIN, 0xFB00, 0xFB06 }, + { SCRIPT_LATIN, 0xFF21, 0xFF3A }, + { SCRIPT_LATIN, 0xFF41, 0xFF5A }, + { SCRIPT_TIBETAN, 0xF00, 0xF47 }, + { SCRIPT_TIBETAN, 0xF49, 0xF6C }, + { SCRIPT_TIBETAN, 0xF71, 0xF97 }, + { SCRIPT_TIBETAN, 0xF99, 0xFBC }, + { SCRIPT_TIBETAN, 0xFBE, 0xFCC }, + { SCRIPT_TIBETAN, 0xFCE, 0xFD4 }, + { SCRIPT_TIBETAN, 0xFD9, 0xFDA }, + { SCRIPT_MYANMAR, 0x1000, 0x109F }, + { SCRIPT_MYANMAR, 0xA9E0, 0xA9FE }, + { SCRIPT_MYANMAR, 0xAA60, 0xAA7F }, + { SCRIPT_OTHER, 0x2EA, 0x2EB }, + { SCRIPT_OTHER, 0x7C0, 0x7FA }, + { SCRIPT_OTHER, 0x800, 0x82D }, + { SCRIPT_OTHER, 0x830, 0x83E }, + { SCRIPT_OTHER, 0x840, 0x85B }, + { SCRIPT_OTHER, 0x85E, 0x85E }, + { SCRIPT_OTHER, 0x13A0, 0x13F5 }, + { SCRIPT_OTHER, 0x13F8, 0x13FD }, + { 
SCRIPT_OTHER, 0x1400, 0x169C }, + { SCRIPT_OTHER, 0x1700, 0x170C }, + { SCRIPT_OTHER, 0x170E, 0x1714 }, + { SCRIPT_OTHER, 0x1720, 0x1734 }, + { SCRIPT_OTHER, 0x1740, 0x1753 }, + { SCRIPT_OTHER, 0x1760, 0x176C }, + { SCRIPT_OTHER, 0x176E, 0x1770 }, + { SCRIPT_OTHER, 0x1772, 0x1773 }, + { SCRIPT_OTHER, 0x18B0, 0x18F5 }, + { SCRIPT_OTHER, 0x1900, 0x191E }, + { SCRIPT_OTHER, 0x1920, 0x192B }, + { SCRIPT_OTHER, 0x1930, 0x193B }, + { SCRIPT_OTHER, 0x1940, 0x1940 }, + { SCRIPT_OTHER, 0x1944, 0x196D }, + { SCRIPT_OTHER, 0x1970, 0x1974 }, + { SCRIPT_OTHER, 0x1980, 0x19AB }, + { SCRIPT_OTHER, 0x19B0, 0x19C9 }, + { SCRIPT_OTHER, 0x19D0, 0x19DA }, + { SCRIPT_OTHER, 0x19DE, 0x19DF }, + { SCRIPT_OTHER, 0x1A00, 0x1A1B }, + { SCRIPT_OTHER, 0x1A1E, 0x1A5E }, + { SCRIPT_OTHER, 0x1A60, 0x1A7C }, + { SCRIPT_OTHER, 0x1A7F, 0x1A89 }, + { SCRIPT_OTHER, 0x1A90, 0x1A99 }, + { SCRIPT_OTHER, 0x1AA0, 0x1AAD }, + { SCRIPT_OTHER, 0x1B00, 0x1B4B }, + { SCRIPT_OTHER, 0x1B50, 0x1B7C }, + { SCRIPT_OTHER, 0x1B80, 0x1BF3 }, + { SCRIPT_OTHER, 0x1BFC, 0x1C37 }, + { SCRIPT_OTHER, 0x1C3B, 0x1C49 }, + { SCRIPT_OTHER, 0x1C4D, 0x1C7F }, + { SCRIPT_OTHER, 0x1CC0, 0x1CC7 }, + { SCRIPT_OTHER, 0x2800, 0x28FF }, + { SCRIPT_OTHER, 0x2C00, 0x2C2E }, + { SCRIPT_OTHER, 0x2C30, 0x2C5E }, + { SCRIPT_OTHER, 0x2D30, 0x2D67 }, + { SCRIPT_OTHER, 0x2D6F, 0x2D70 }, + { SCRIPT_OTHER, 0x2D7F, 0x2D7F }, + { SCRIPT_OTHER, 0x3105, 0x312D }, + { SCRIPT_OTHER, 0x31A0, 0x31BA }, + { SCRIPT_OTHER, 0xA000, 0xA48C }, + { SCRIPT_OTHER, 0xA490, 0xA4C6 }, + { SCRIPT_OTHER, 0xA4D0, 0xA62B }, + { SCRIPT_OTHER, 0xA6A0, 0xA6F7 }, + { SCRIPT_OTHER, 0xA800, 0xA82B }, + { SCRIPT_OTHER, 0xA840, 0xA877 }, + { SCRIPT_OTHER, 0xA880, 0xA8C5 }, + { SCRIPT_OTHER, 0xA8CE, 0xA8D9 }, + { SCRIPT_OTHER, 0xA900, 0xA92D }, + { SCRIPT_OTHER, 0xA92F, 0xA953 }, + { SCRIPT_OTHER, 0xA95F, 0xA95F }, + { SCRIPT_OTHER, 0xA980, 0xA9CD }, + { SCRIPT_OTHER, 0xA9D0, 0xA9D9 }, + { SCRIPT_OTHER, 0xA9DE, 0xA9DF }, + { SCRIPT_OTHER, 0xAA00, 0xAA36 }, + { SCRIPT_OTHER, 
0xAA40, 0xAA4D }, + { SCRIPT_OTHER, 0xAA50, 0xAA59 }, + { SCRIPT_OTHER, 0xAA5C, 0xAA5F }, + { SCRIPT_OTHER, 0xAA80, 0xAAC2 }, + { SCRIPT_OTHER, 0xAADB, 0xAAF6 }, + { SCRIPT_OTHER, 0xAB70, 0xABED }, + { SCRIPT_OTHER, 0xABF0, 0xABF9 }, + { SCRIPT_HAN, 0x2E80, 0x2E99 }, + { SCRIPT_HAN, 0x2E9B, 0x2EF3 }, + { SCRIPT_HAN, 0x2F00, 0x2FD5 }, + { SCRIPT_HAN, 0x3005, 0x3005 }, + { SCRIPT_HAN, 0x3007, 0x3007 }, + { SCRIPT_HAN, 0x3021, 0x3029 }, + { SCRIPT_HAN, 0x3038, 0x303B }, + { SCRIPT_HAN, 0x3400, 0x4DB5 }, + { SCRIPT_HAN, 0x4E00, 0x9FD5 }, + { SCRIPT_HAN, 0xF900, 0xFA6D }, + { SCRIPT_HAN, 0xFA70, 0xFAD9 }, + { SCRIPT_THAANA, 0x780, 0x7B1 }, + { SCRIPT_HIRAGANA, 0x3041, 0x3096 }, + { SCRIPT_HIRAGANA, 0x309D, 0x309F }, + { SCRIPT_KATAKANA, 0x30A1, 0x30FA }, + { SCRIPT_KATAKANA, 0x30FD, 0x30FF }, + { SCRIPT_KATAKANA, 0x31F0, 0x31FF }, + { SCRIPT_KATAKANA, 0x32D0, 0x32FE }, + { SCRIPT_KATAKANA, 0x3300, 0x3357 }, + { SCRIPT_KATAKANA, 0xFF66, 0xFF6F }, + { SCRIPT_KATAKANA, 0xFF71, 0xFF9D }, + { SCRIPT_ORIYA, 0xB01, 0xB03 }, + { SCRIPT_ORIYA, 0xB05, 0xB0C }, + { SCRIPT_ORIYA, 0xB0F, 0xB10 }, + { SCRIPT_ORIYA, 0xB13, 0xB28 }, + { SCRIPT_ORIYA, 0xB2A, 0xB30 }, + { SCRIPT_ORIYA, 0xB32, 0xB33 }, + { SCRIPT_ORIYA, 0xB35, 0xB39 }, + { SCRIPT_ORIYA, 0xB3C, 0xB44 }, + { SCRIPT_ORIYA, 0xB47, 0xB48 }, + { SCRIPT_ORIYA, 0xB4B, 0xB4D }, + { SCRIPT_ORIYA, 0xB56, 0xB57 }, + { SCRIPT_ORIYA, 0xB5C, 0xB5D }, + { SCRIPT_ORIYA, 0xB5F, 0xB63 }, + { SCRIPT_ORIYA, 0xB66, 0xB77 }, + { SCRIPT_BENGALI, 0x980, 0x983 }, + { SCRIPT_BENGALI, 0x985, 0x98C }, + { SCRIPT_BENGALI, 0x98F, 0x990 }, + { SCRIPT_BENGALI, 0x993, 0x9A8 }, + { SCRIPT_BENGALI, 0x9AA, 0x9B0 }, + { SCRIPT_BENGALI, 0x9B2, 0x9B2 }, + { SCRIPT_BENGALI, 0x9B6, 0x9B9 }, + { SCRIPT_BENGALI, 0x9BC, 0x9C4 }, + { SCRIPT_BENGALI, 0x9C7, 0x9C8 }, + { SCRIPT_BENGALI, 0x9CB, 0x9CE }, + { SCRIPT_BENGALI, 0x9D7, 0x9D7 }, + { SCRIPT_BENGALI, 0x9DC, 0x9DD }, + { SCRIPT_BENGALI, 0x9DF, 0x9E3 }, + { SCRIPT_BENGALI, 0x9E6, 0x9FB }, + { SCRIPT_RUNIC, 0x16A0, 
0x16EA }, + { SCRIPT_RUNIC, 0x16EE, 0x16F8 }, + { SCRIPT_SINHALA, 0xD82, 0xD83 }, + { SCRIPT_SINHALA, 0xD85, 0xD96 }, + { SCRIPT_SINHALA, 0xD9A, 0xDB1 }, + { SCRIPT_SINHALA, 0xDB3, 0xDBB }, + { SCRIPT_SINHALA, 0xDBD, 0xDBD }, + { SCRIPT_SINHALA, 0xDC0, 0xDC6 }, + { SCRIPT_SINHALA, 0xDCA, 0xDCA }, + { SCRIPT_SINHALA, 0xDCF, 0xDD4 }, + { SCRIPT_SINHALA, 0xDD6, 0xDD6 }, + { SCRIPT_SINHALA, 0xDD8, 0xDDF }, + { SCRIPT_SINHALA, 0xDE6, 0xDEF }, + { SCRIPT_SINHALA, 0xDF2, 0xDF4 }, + { SCRIPT_COPTIC, 0x3E2, 0x3EF }, + { SCRIPT_COPTIC, 0x2C80, 0x2CF3 }, + { SCRIPT_COPTIC, 0x2CF9, 0x2CFF }, + { SCRIPT_GEORGIAN, 0x10A0, 0x10C5 }, + { SCRIPT_GEORGIAN, 0x10C7, 0x10C7 }, + { SCRIPT_GEORGIAN, 0x10CD, 0x10CD }, + { SCRIPT_GEORGIAN, 0x10D0, 0x10FA }, + { SCRIPT_GEORGIAN, 0x10FC, 0x10FF }, + { SCRIPT_GEORGIAN, 0x2D00, 0x2D25 }, + { SCRIPT_GEORGIAN, 0x2D27, 0x2D27 }, + { SCRIPT_GEORGIAN, 0x2D2D, 0x2D2D }, + { SCRIPT_GREEK, 0x370, 0x373 }, + { SCRIPT_GREEK, 0x375, 0x377 }, + { SCRIPT_GREEK, 0x37A, 0x37D }, + { SCRIPT_GREEK, 0x37F, 0x37F }, + { SCRIPT_GREEK, 0x384, 0x384 }, + { SCRIPT_GREEK, 0x386, 0x386 }, + { SCRIPT_GREEK, 0x388, 0x38A }, + { SCRIPT_GREEK, 0x38C, 0x38C }, + { SCRIPT_GREEK, 0x38E, 0x3A1 }, + { SCRIPT_GREEK, 0x3A3, 0x3E1 }, + { SCRIPT_GREEK, 0x3F0, 0x3FF }, + { SCRIPT_GREEK, 0x1D26, 0x1D2A }, + { SCRIPT_GREEK, 0x1D5D, 0x1D61 }, + { SCRIPT_GREEK, 0x1D66, 0x1D6A }, + { SCRIPT_GREEK, 0x1DBF, 0x1DBF }, + { SCRIPT_GREEK, 0x1F00, 0x1F15 }, + { SCRIPT_GREEK, 0x1F18, 0x1F1D }, + { SCRIPT_GREEK, 0x1F20, 0x1F45 }, + { SCRIPT_GREEK, 0x1F48, 0x1F4D }, + { SCRIPT_GREEK, 0x1F50, 0x1F57 }, + { SCRIPT_GREEK, 0x1F59, 0x1F59 }, + { SCRIPT_GREEK, 0x1F5B, 0x1F5B }, + { SCRIPT_GREEK, 0x1F5D, 0x1F5D }, + { SCRIPT_GREEK, 0x1F5F, 0x1F7D }, + { SCRIPT_GREEK, 0x1F80, 0x1FB4 }, + { SCRIPT_GREEK, 0x1FB6, 0x1FC4 }, + { SCRIPT_GREEK, 0x1FC6, 0x1FD3 }, + { SCRIPT_GREEK, 0x1FD6, 0x1FDB }, + { SCRIPT_GREEK, 0x1FDD, 0x1FEF }, + { SCRIPT_GREEK, 0x1FF2, 0x1FF4 }, + { SCRIPT_GREEK, 0x1FF6, 0x1FFE }, + { 
SCRIPT_GREEK, 0x2126, 0x2126 }, + { SCRIPT_GREEK, 0xAB65, 0xAB65 }, + }; + + void InitScriptData(ui8 data[], size_t len) { /* Fills data[0, len) with the script of every code point listed in ScriptRanges; entries no range covers stay 0. */ + memset (data, 0, len * sizeof(ui8)); + for (auto range : ScriptRanges) { + Y_ASSERT(range.Start <= range.End); + Y_ASSERT((unsigned)range.Script < 0x100); /* the script id is stored in a single ui8 below */ + size_t end = static_cast<size_t>(range.End) + 1; /* exclusive bound: clamping it to len can never address data[len] */ + if (end > len) + end = len; + for (size_t j = range.Start; j < end; ++j) { + data[j] = (ui8)range.Script; + } + } + } +} diff --git a/library/cpp/langs/langs.cpp b/library/cpp/langs/langs.cpp new file mode 100644 index 0000000000..2c508e1602 --- /dev/null +++ b/library/cpp/langs/langs.cpp @@ -0,0 +1,330 @@ +#include "langs.h" + +#include <library/cpp/digest/lower_case/hash_ops.h> + +#include <util/generic/array_size.h> +#include <util/generic/hash.h> +#include <util/generic/singleton.h> +#include <util/generic/strbuf.h> +#include <util/generic/yexception.h> +#include <util/system/defaults.h> + +#include <array> +#include <cctype> + +/* + * define language by ELanguage + */ + +namespace { + struct TLanguageNameAndEnum { + ELanguage Language; + EScript Script; + const char* EnglishName; + const char* BiblioName; + const char* IsoName; + const char* Synonyms; + }; + + const TLanguageNameAndEnum LanguageNameAndEnum[] = { + {LANG_UNK, SCRIPT_OTHER, "Unknown", "unk", "mis", nullptr}, + {LANG_RUS, SCRIPT_CYRILLIC, "Russian", "rus", "ru", "ru-RU"}, + {LANG_ENG, SCRIPT_LATIN, "English", "eng", "en", "en-US, en-GB, en-CA, en-NZ, en-AU"}, + {LANG_POL, SCRIPT_LATIN, "Polish", "pol", "pl", nullptr}, + {LANG_HUN, SCRIPT_LATIN, "Hungarian", "hun", "hu", nullptr}, + {LANG_UKR, SCRIPT_CYRILLIC, "Ukrainian", "ukr", "uk", "uk-UA"}, + {LANG_GER, SCRIPT_LATIN, "German", "ger", "de", "deu"}, + {LANG_FRE, SCRIPT_LATIN, "French", "fre", "fr", "fra, frn, fr-FR, fr-CA"}, + {LANG_TAT, SCRIPT_CYRILLIC, "Tatar", "tat", "tt", nullptr}, + {LANG_BEL, SCRIPT_CYRILLIC, "Belarusian", "bel", "be", "blr, Belorussian"}, + {LANG_KAZ, SCRIPT_CYRILLIC, "Kazakh", "kaz", "kk", "kk-Cyrl"}, + {LANG_ALB, 
SCRIPT_LATIN, "Albanian", "alb", "sq", nullptr}, + {LANG_SPA, SCRIPT_LATIN, "Spanish", "spa", "es", nullptr}, + {LANG_ITA, SCRIPT_LATIN, "Italian", "ita", "it", nullptr}, + {LANG_ARM, SCRIPT_ARMENIAN, "Armenian", "arm", "hy", "hye"}, + {LANG_DAN, SCRIPT_LATIN, "Danish", "dan", "da", nullptr}, + {LANG_POR, SCRIPT_LATIN, "Portuguese", "por", "pt", nullptr}, + {LANG_ICE, SCRIPT_LATIN, "Icelandic", "ice", "is", "isl"}, + {LANG_SLO, SCRIPT_LATIN, "Slovak", "slo", "sk", "slk"}, + {LANG_SLV, SCRIPT_LATIN, "Slovene", "slv", "sl", "Slovenian"}, + {LANG_DUT, SCRIPT_LATIN, "Dutch", "dut", "nl", "nld"}, + {LANG_BUL, SCRIPT_CYRILLIC, "Bulgarian", "bul", "bg", nullptr}, + {LANG_CAT, SCRIPT_LATIN, "Catalan", "cat", "ca", nullptr}, + {LANG_HRV, SCRIPT_LATIN, "Croatian", "hrv", "hr", "scr"}, + {LANG_CZE, SCRIPT_LATIN, "Czech", "cze", "cs", "ces"}, + {LANG_GRE, SCRIPT_GREEK, "Greek", "gre", "el", "ell"}, + {LANG_HEB, SCRIPT_HEBREW, "Hebrew", "heb", "he", "iw"}, // 'iw' is old ISO-639 code + {LANG_NOR, SCRIPT_LATIN, "Norwegian", "nor", "no", nullptr}, + {LANG_MAC, SCRIPT_CYRILLIC, "Macedonian", "mac", "mk", nullptr}, + {LANG_SWE, SCRIPT_LATIN, "Swedish", "swe", "sv", nullptr}, + {LANG_KOR, SCRIPT_HANGUL, "Korean", "kor", "ko", nullptr}, + {LANG_LAT, SCRIPT_LATIN, "Latin", "lat", "la", nullptr}, + {LANG_BASIC_RUS, SCRIPT_CYRILLIC, "Basic Russian", "basic-rus", "bas-ru", nullptr}, + {LANG_BOS, SCRIPT_LATIN, "Bosnian", "bos", "bs", nullptr}, + {LANG_MLT, SCRIPT_LATIN, "Maltese", "mlt", "mt", nullptr}, + + {LANG_EMPTY, SCRIPT_OTHER, "Empty", "empty", nullptr, nullptr}, + {LANG_UNK_LAT, SCRIPT_LATIN, "Unknown Latin", "unklat", nullptr, nullptr}, + {LANG_UNK_CYR, SCRIPT_CYRILLIC, "Unknown Cyrillic", "unkcyr", nullptr, nullptr}, + {LANG_UNK_ALPHA, SCRIPT_OTHER, "Unknown Alpha", "unkalpha", nullptr, nullptr}, + + {LANG_FIN, SCRIPT_LATIN, "Finnish", "fin", "fi", nullptr}, + {LANG_EST, SCRIPT_LATIN, "Estonian", "est", "et", nullptr}, + {LANG_LAV, SCRIPT_LATIN, "Latvian", "lav", "lv", nullptr}, 
+ {LANG_LIT, SCRIPT_LATIN, "Lithuanian", "lit", "lt", nullptr}, + {LANG_BAK, SCRIPT_CYRILLIC, "Bashkir", "bak", "ba", nullptr}, + {LANG_TUR, SCRIPT_LATIN, "Turkish", "tur", "tr", nullptr}, + {LANG_RUM, SCRIPT_LATIN, "Romanian", "rum", "ro", "ron"}, + {LANG_MON, SCRIPT_CYRILLIC, "Mongolian", "mon", "mn", nullptr}, + {LANG_UZB, SCRIPT_LATIN, "Uzbek", "uzb", "uz", "uz-Latn"}, + {LANG_KIR, SCRIPT_CYRILLIC, "Kirghiz", "kir", "ky", "Kyrgyz"}, + {LANG_TGK, SCRIPT_CYRILLIC, "Tajik", "tgk", "tg", nullptr}, + {LANG_TUK, SCRIPT_LATIN, "Turkmen", "tuk", "tk", nullptr}, + {LANG_SRP, SCRIPT_CYRILLIC, "Serbian", "srp", "sr", nullptr}, + {LANG_AZE, SCRIPT_LATIN, "Azerbaijani", "aze", "az", "Azeri"}, + {LANG_BASIC_ENG, SCRIPT_LATIN, "Basic English", "basic-eng", "bas-en", nullptr}, + {LANG_GEO, SCRIPT_GEORGIAN, "Georgian", "geo", "ka", "kat"}, + {LANG_ARA, SCRIPT_ARABIC, "Arabic", "ara", "ar", nullptr}, + {LANG_PER, SCRIPT_ARABIC, "Persian", "per", "fa", "fas"}, + {LANG_CHU, SCRIPT_CYRILLIC, "Church Slavonic", "chu", "cu", nullptr}, + {LANG_CHI, SCRIPT_HAN, "Chinese", "chi", "zh", "zho"}, + {LANG_JPN, SCRIPT_HIRAGANA, "Japanese", "jpn", "ja", nullptr}, + {LANG_IND, SCRIPT_LATIN, "Indonesian", "ind", "id", "in"}, // 'in' is old ISO-639 code + {LANG_MAY, SCRIPT_LATIN, "Malay", "may", "ms", "msa"}, + {LANG_THA, SCRIPT_THAI, "Thai", "tha", "th", nullptr}, + {LANG_VIE, SCRIPT_LATIN, "Vietnamese", "vie", "vi", nullptr}, + {LANG_GLE, SCRIPT_LATIN, "Irish", "gle", "ga", nullptr}, + {LANG_TGL, SCRIPT_LATIN, "Tagalog", "tgl", "tl", "fil"}, + {LANG_HIN, SCRIPT_DEVANAGARI, "Hindi", "hin", "hi", nullptr}, + {LANG_AFR, SCRIPT_LATIN, "Afrikaans", "afr", "af", nullptr}, + {LANG_URD, SCRIPT_ARABIC, "Urdu", "urd", "ur", nullptr}, + {LANG_MYA, SCRIPT_MYANMAR, "Burmese", "mya", "my", nullptr}, + {LANG_KHM, SCRIPT_KHMER, "Khmer", "khm", "km", nullptr}, + {LANG_LAO, SCRIPT_LAO, "Lao", "lao", "lo", "Laotian, Laothian"}, + {LANG_TAM, SCRIPT_TAMIL, "Tamil", "tam", "ta", nullptr}, + {LANG_BEN, 
SCRIPT_BENGALI, "Bengali", "ben", "bn", nullptr}, + {LANG_GUJ, SCRIPT_GUJARATI, "Gujarati", "guj", "gu", nullptr}, + {LANG_KAN, SCRIPT_KANNADA, "Kannada", "kan", "kn", nullptr}, + {LANG_PAN, SCRIPT_GURMUKHI, "Punjabi", "pan", "pa", nullptr}, + {LANG_SIN, SCRIPT_SINHALA, "Sinhalese", "sin", "si", nullptr}, + {LANG_SWA, SCRIPT_LATIN, "Swahili", "swa", "sw", nullptr}, + {LANG_BAQ, SCRIPT_LATIN, "Basque", "baq", "eu", "eus"}, + {LANG_WEL, SCRIPT_LATIN, "Welsh", "wel", "cy", "cym"}, + {LANG_GLG, SCRIPT_LATIN, "Galician", "glg", "gl", nullptr}, + {LANG_HAT, SCRIPT_LATIN, "Haitian Creole", "hat", "ht", "Haitian"}, + {LANG_MLG, SCRIPT_LATIN, "Malagasy", "mlg", "mg", nullptr}, + {LANG_CHV, SCRIPT_CYRILLIC, "Chuvash", "chv", "cv", nullptr}, + {LANG_UDM, SCRIPT_CYRILLIC, "Udmurt", "udm", "udm", nullptr}, + {LANG_KPV, SCRIPT_CYRILLIC, "Komi-Zyrian", "kpv", "kv", "Komi, kom"}, + {LANG_MHR, SCRIPT_CYRILLIC, "Meadow Mari", "mhr", "mhr", "EasternMari, Mari, chm"}, + {LANG_SJN, SCRIPT_LATIN, "Sindarin", "sjn", "sjn", nullptr}, + {LANG_MRJ, SCRIPT_CYRILLIC, "Hill Mari", "mrj", "mrj", "WesternMari"}, + {LANG_KOI, SCRIPT_CYRILLIC, "Komi-Permyak", "koi", "koi", nullptr}, + {LANG_LTZ, SCRIPT_LATIN, "Luxembourgish", "ltz", "lb", "Luxemburgish"}, + {LANG_GLA, SCRIPT_LATIN, "Scottish Gaelic", "gla", "gd", "Gaelic"}, + {LANG_CEB, SCRIPT_LATIN, "Cebuano", "ceb", "ceb", "Bisaya, Binisaya, Visayan"}, + {LANG_PUS, SCRIPT_ARABIC, "Pashto", "pus", "ps", nullptr}, + {LANG_KMR, SCRIPT_LATIN, "Kurmanji", "kmr", "ku", "Kurdish"}, + {LANG_AMH, SCRIPT_ETHIOPIC, "Amharic", "amh", "am", nullptr}, + {LANG_ZUL, SCRIPT_LATIN, "Zulu", "zul", "zu", nullptr}, + {LANG_IBO, SCRIPT_LATIN, "Igbo", "ibo", "ig", "Ibo"}, + {LANG_YOR, SCRIPT_LATIN, "Yoruba", "yor", "yo", nullptr}, + {LANG_COS, SCRIPT_LATIN, "Corsican", "cos", "co", nullptr}, + {LANG_XHO, SCRIPT_LATIN, "Xhosa", "xho", "xh", nullptr}, + {LANG_JAV, SCRIPT_LATIN, "Javanese", "jav", "jv", nullptr}, // Also SCRIPT_JAVANESE and SCRIPT_ARABIC + {LANG_NEP, 
SCRIPT_DEVANAGARI, "Nepali", "nep", "ne", nullptr}, + {LANG_SND, SCRIPT_DEVANAGARI, "Sindhi", "snd", "sd", nullptr}, // Also SCRIPT_ARABIC and SCRIPT_GUJARATI + {LANG_SOM, SCRIPT_LATIN, "Somali", "som", "so", nullptr}, + {LANG_EPO, SCRIPT_LATIN, "Esperanto", "epo", "eo", nullptr}, + {LANG_TEL, SCRIPT_TELUGU, "Telugu", "tel", "te", nullptr}, + {LANG_MAR, SCRIPT_DEVANAGARI, "Marathi", "mar", "mr", nullptr}, + {LANG_HAU, SCRIPT_LATIN, "Hausa", "hau", "ha", nullptr}, + {LANG_YID, SCRIPT_HEBREW, "Yiddish", "yid", "yi", nullptr}, + {LANG_MAL, SCRIPT_MALAYALAM, "Malayalam", "mal", "ml", nullptr}, + {LANG_MAO, SCRIPT_LATIN, "Maori", "mao", "mi", "mri"}, + {LANG_SUN, SCRIPT_LATIN, "Sundanese", "sun", "su", nullptr}, + {LANG_PAP, SCRIPT_LATIN, "Papiamento", "pap", "pap", nullptr}, + {LANG_UZB_CYR, SCRIPT_CYRILLIC, "Cyrillic Uzbek", "uzbcyr", "uz-Cyrl", nullptr}, // https://tools.ietf.org/html/rfc5646 + {LANG_TRANSCR_IPA, SCRIPT_LATIN, "International Phonetic Alphabet Transcription", "ipa", "tr-ipa", nullptr}, + {LANG_EMJ, SCRIPT_LATIN, "Emoji", "emj", "emj", nullptr}, + {LANG_UYG, SCRIPT_ARABIC, "Uyghur", "uig", "ug", nullptr}, + {LANG_BRE, SCRIPT_LATIN, "Breton", "bre", "br", nullptr}, + {LANG_SAH, SCRIPT_CYRILLIC, "Yakut", "sah", "sah", nullptr}, + {LANG_KAZ_LAT, SCRIPT_LATIN, "Latin Kazakh", "kazlat", "kk-Latn", nullptr}, + }; + + static_assert(static_cast<size_t>(LANG_MAX) == Y_ARRAY_SIZE(LanguageNameAndEnum), "Size doesn't match"); + + class TLanguagesMap { + private: + static const char* const EMPTY_NAME; + + using TNamesHash = THashMap<TStringBuf, ELanguage, TCIOps, TCIOps>; + TNamesHash Hash; + + using TNamesArray = std::array<const char*, static_cast<size_t>(LANG_MAX)>; + TNamesArray BiblioNames; + TNamesArray IsoNames; + TNamesArray FullNames; + + using TScripts = std::array<EScript, static_cast<size_t>(LANG_MAX)>; + TScripts Scripts; + + private: + void AddNameToHash(const TStringBuf& name, ELanguage language) { + if (Hash.find(name) != Hash.end()) { + 
Y_ASSERT(Hash.find(name)->second == language); + return; + } + + Hash[name] = language; + } + + void AddName(const char* name, ELanguage language, TNamesArray& names) { /* Stores name as the entry for language in names and registers it in Hash; a null or empty name is ignored. */ + if (name == nullptr || strlen(name) == 0) + return; + + Y_ASSERT(names[language] == EMPTY_NAME); + names[language] = name; + + AddNameToHash(name, language); + } + + void AddSynonyms(const char* syn, ELanguage language) { /* Registers each token of syn, split on any character of del, as a lookup name for language; a null syn is ignored. */ + static const char* del = " ,;"; + if (!syn) + return; + while (*syn) { + size_t len = strcspn(syn, del); + AddNameToHash(TStringBuf(syn, len), language); + syn += len; + while (*syn && strchr(del, *syn)) + ++syn; + } + } + + public: + TLanguagesMap() { /* Builds the per-language name and script arrays and the name hash from LanguageNameAndEnum. */ + BiblioNames.fill(EMPTY_NAME); + IsoNames.fill(EMPTY_NAME); + FullNames.fill(EMPTY_NAME); + Scripts.fill(SCRIPT_OTHER); + + for (size_t i = 0; i != Y_ARRAY_SIZE(LanguageNameAndEnum); ++i) { + const TLanguageNameAndEnum& val = LanguageNameAndEnum[i]; + + ELanguage language = val.Language; + + AddName(val.BiblioName, language, BiblioNames); + AddName(val.IsoName, language, IsoNames); + AddName(val.EnglishName, language, FullNames); + AddSynonyms(val.Synonyms, language); + + if (Scripts[language] == SCRIPT_OTHER) { /* only an entry still holding the SCRIPT_OTHER fill value is overwritten */ + Scripts[language] = val.Script; + } + } + } + + public: + inline ELanguage LanguageByName(const TStringBuf& name, ELanguage def) const { /* Looks name up in Hash; on a miss retries with the part before the first '_' or '-'; returns def for an empty name or when the lookup still misses. */ + if (!name) + return def; + + TNamesHash::const_iterator i = Hash.find(name); + if (i == Hash.end()) { + // Try to extract the primary language code from constructions like "en-cockney" or "zh_Hant" + size_t dash_pos = name.find_first_of("_-"); + if (dash_pos != TStringBuf::npos) + i = Hash.find(name.substr(0, dash_pos)); + if (i == Hash.end()) + return def; + } + + return i->second; + } + + inline const char* FullNameByLanguage(ELanguage language) const { /* Returns FullNames[language], or nullptr when language is outside the array. */ + if (language < 0 || static_cast<size_t>(language) >= FullNames.size()) + return nullptr; + + return FullNames[language]; + } + inline const char* BiblioNameByLanguage(ELanguage language) const { + if (language < 0 || 
static_cast<size_t>(language) >= BiblioNames.size()) + return nullptr; + + return BiblioNames[language]; + } + inline const char* IsoNameByLanguage(ELanguage language) const { + if (language < 0 || static_cast<size_t>(language) >= IsoNames.size()) + return nullptr; + + return IsoNames[language]; + } + + inline EScript Script(ELanguage language) const { /* Range-checked like the name accessors above: an out-of-range language (e.g. LANG_MAX, which LanguageByNameStrict returns for unknown names) yields SCRIPT_UNKNOWN instead of reading past Scripts. */ + return (language < 0 || static_cast<size_t>(language) >= Scripts.size()) ? SCRIPT_UNKNOWN : Scripts[language]; + } + }; +} + +const char* const TLanguagesMap::EMPTY_NAME = ""; + +const char* FullNameByLanguage(ELanguage language) { + return Singleton<TLanguagesMap>()->FullNameByLanguage(language); +} +const char* NameByLanguage(ELanguage language) { + return Singleton<TLanguagesMap>()->BiblioNameByLanguage(language); +} +const char* IsoNameByLanguage(ELanguage language) { + return Singleton<TLanguagesMap>()->IsoNameByLanguage(language); +} + +ELanguage LanguageByNameStrict(const TStringBuf& name) { + return Singleton<TLanguagesMap>()->LanguageByName(name, LANG_MAX); +} + +ELanguage LanguageByNameOrDie(const TStringBuf& name) { + ELanguage result = LanguageByNameStrict(name); + if (result == LANG_MAX) { + ythrow yexception() << "LanguageByNameOrDie: invalid language '" << name << "'"; + } + return result; +} + +ELanguage LanguageByName(const TStringBuf& name) { + return Singleton<TLanguagesMap>()->LanguageByName(name, LANG_UNK); +} + +EScript ScriptByLanguage(ELanguage language) { + return Singleton<TLanguagesMap>()->Script(language); +} + +namespace { + const size_t MAX_GLYPH = 0x10000; + class TScriptGlyphIndex { + public: + TScriptGlyphIndex() { + NCharsetInternal::InitScriptData(Data, MAX_GLYPH); + } + + EScript GetGlyphScript(wchar32 glyph) const { + if (glyph >= MAX_GLYPH) + return SCRIPT_UNKNOWN; + return (EScript)Data[glyph]; + } + + private: + ui8 Data[MAX_GLYPH]; + }; +} + +EScript ScriptByGlyph(wchar32 glyph) { + return HugeSingleton<TScriptGlyphIndex>()->GetGlyphScript(glyph); +} + +template <> +void Out<ELanguage>(IOutputStream& o, ELanguage lang) { + o << NameByLanguage(lang); +} diff 
--git a/library/cpp/langs/langs.h b/library/cpp/langs/langs.h
new file mode 100644
index 0000000000..360ab6a832
--- /dev/null
+++ b/library/cpp/langs/langs.h
@@ -0,0 +1,229 @@
+#pragma once
+
+#include "scripts.h"
+
+#include <util/generic/strbuf.h>
+#include <util/system/defaults.h>
+
+#if defined(_win_)
+// LANG_LAO is #define in WinNT.h
+#undef LANG_LAO
+#endif
+
+// Language names are given according to ISO 639-2/B
+// Some languages are not present in ISO 639-2/B. Then ISO 639-3 is used.
+// http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+//
+// Every enumerator carries an explicit numeric value; LANG_MAX follows the
+// last one and is used as an end marker / "not found" value, not as a language.
+enum ELanguage {
+    LANG_UNK = 0, // Unknown
+    LANG_RUS = 1, // Russian
+    LANG_ENG = 2, // English
+    LANG_POL = 3, // Polish
+    LANG_HUN = 4, // Hungarian
+    LANG_UKR = 5, // Ukrainian
+    LANG_GER = 6, // German
+    LANG_FRE = 7, // French
+    LANG_TAT = 8, // Tatar
+    LANG_BEL = 9, // Belarusian
+    LANG_KAZ = 10, // Kazakh
+    LANG_ALB = 11, // Albanian
+    LANG_SPA = 12, // Spanish
+    LANG_ITA = 13, // Italian
+    LANG_ARM = 14, // Armenian
+    LANG_DAN = 15, // Danish
+    LANG_POR = 16, // Portuguese
+    LANG_ICE = 17, // Icelandic
+    LANG_SLO = 18, // Slovak
+    LANG_SLV = 19, // Slovene
+    LANG_DUT = 20, // Dutch (Netherlandish language)
+    LANG_BUL = 21, // Bulgarian
+    LANG_CAT = 22, // Catalan
+    LANG_HRV = 23, // Croatian
+    LANG_CZE = 24, // Czech
+    LANG_GRE = 25, // Greek
+    LANG_HEB = 26, // Hebrew
+    LANG_NOR = 27, // Norwegian
+    LANG_MAC = 28, // Macedonian
+    LANG_SWE = 29, // Swedish
+    LANG_KOR = 30, // Korean
+    LANG_LAT = 31, // Latin
+    LANG_BASIC_RUS = 32, // Simplified version of Russian (used at lemmer only)
+    LANG_BOS = 33, // Bosnian
+    LANG_MLT = 34, // Maltese
+    LANG_EMPTY = 35, // Indicate that document is empty
+    LANG_UNK_LAT = 36, // Any unrecognized latin language
+    LANG_UNK_CYR = 37, // Any unrecognized cyrillic language
+    LANG_UNK_ALPHA = 38, // Any unrecognized alphabetic language not fit into previous categories
+    LANG_FIN = 39, // Finnish
+    LANG_EST = 40, // Estonian
+    LANG_LAV = 41, // Latvian
+    LANG_LIT = 42, // Lithuanian
+    LANG_BAK = 43, // Bashkir
+    LANG_TUR = 44, // Turkish
+    LANG_RUM = 45, // Romanian (also Moldavian)
+    LANG_MON = 46, // Mongolian
+    LANG_UZB = 47, // Uzbek
+    LANG_KIR = 48, // Kirghiz
+    LANG_TGK = 49, // Tajik
+    LANG_TUK = 50, // Turkmen
+    LANG_SRP = 51, // Serbian
+    LANG_AZE = 52, // Azerbaijani
+    LANG_BASIC_ENG = 53, // Simplified version of English (used at lemmer only)
+    LANG_GEO = 54, // Georgian
+    LANG_ARA = 55, // Arabic
+    LANG_PER = 56, // Persian
+    LANG_CHU = 57, // Church Slavonic
+    LANG_CHI = 58, // Chinese
+    LANG_JPN = 59, // Japanese
+    LANG_IND = 60, // Indonesian
+    LANG_MAY = 61, // Malay
+    LANG_THA = 62, // Thai
+    LANG_VIE = 63, // Vietnamese
+    LANG_GLE = 64, // Irish (Gaelic)
+    LANG_TGL = 65, // Tagalog (Filipino)
+    LANG_HIN = 66, // Hindi
+    LANG_AFR = 67, // Afrikaans
+    LANG_URD = 68, // Urdu
+    LANG_MYA = 69, // Burmese
+    LANG_KHM = 70, // Khmer
+    LANG_LAO = 71, // Lao
+    LANG_TAM = 72, // Tamil
+    LANG_BEN = 73, // Bengali
+    LANG_GUJ = 74, // Gujarati
+    LANG_KAN = 75, // Kannada
+    LANG_PAN = 76, // Punjabi
+    LANG_SIN = 77, // Sinhalese
+    LANG_SWA = 78, // Swahili
+    LANG_BAQ = 79, // Basque
+    LANG_WEL = 80, // Welsh
+    LANG_GLG = 81, // Galician
+    LANG_HAT = 82, // Haitian Creole
+    LANG_MLG = 83, // Malagasy
+    LANG_CHV = 84, // Chuvash
+    LANG_UDM = 85, // Udmurt
+    LANG_KPV = 86, // Komi-Zyrian
+    LANG_MHR = 87, // Meadow Mari (Eastern Mari)
+    LANG_SJN = 88, // Sindarin
+    LANG_MRJ = 89, // Hill Mari (Western Mari)
+    LANG_KOI = 90, // Komi-Permyak
+    LANG_LTZ = 91, // Luxembourgish
+    LANG_GLA = 92, // Scottish Gaelic
+    LANG_CEB = 93, // Cebuano
+    LANG_PUS = 94, // Pashto
+    LANG_KMR = 95, // Kurmanji
+    LANG_AMH = 96, // Amharic
+    LANG_ZUL = 97, // Zulu
+    LANG_IBO = 98, // Igbo
+    LANG_YOR = 99, // Yoruba
+    LANG_COS = 100, // Corsican
+    LANG_XHO = 101, // Xhosa
+    LANG_JAV = 102, // Javanese
+    LANG_NEP = 103, // Nepali
+    LANG_SND = 104, // Sindhi
+    LANG_SOM = 105, // Somali
+    LANG_EPO = 106, // Esperanto
+    LANG_TEL = 107, // Telugu
+    LANG_MAR = 108, // Marathi
+    LANG_HAU = 109, // Hausa
+    LANG_YID = 110, // Yiddish
+    LANG_MAL = 111, // Malayalam
+    LANG_MAO = 112, // Maori
+    LANG_SUN = 113, // Sundanese
+    LANG_PAP = 114, // Papiamento
+    LANG_UZB_CYR = 115, // Cyrillic Uzbek
+    LANG_TRANSCR_IPA = 116, // International Phonetic Alphabet Transcription
+    LANG_EMJ = 117, // Emoji
+    LANG_UYG = 118, // Uyghur
+    LANG_BRE = 119, // Breton
+    LANG_SAH = 120, // Yakut
+    LANG_KAZ_LAT = 121, // Latin Kazakh
+    LANG_MAX // End marker: one past the last language; returned by the strict lookup on failure
+};
+
+/**
+ * Converts string to corresponding enum. Will try to extract the primary language code from
+ * constructions like "en-cockney" or "zh_Hant". In case of failure will return `LANG_UNK`.
+ *
+ * @param name Language name
+ * @return Language enum
+ */
+ELanguage LanguageByName(const TStringBuf& name);
+
+/**
+ * Same as `LanguageByName`, but in case of failure will return `LANG_MAX`.
+ *
+ * @see LanguageByName
+ */
+ELanguage LanguageByNameStrict(const TStringBuf& name);
+
+/**
+ * Converts language enum to corresponding ISO 639-2/B alpha-3 code. For languages missing in ISO
+ * standard the conversions are:
+ * - LANG_UNK: "unk"
+ * - LANG_BASIC_RUS: "basic-rus"
+ * - LANG_EMPTY: "empty"
+ * - LANG_UNK_LAT: "unklat"
+ * - LANG_UNK_CYR: "unkcyr"
+ * - LANG_UNK_ALPHA: "unkalpha"
+ * - LANG_BASIC_ENG: "basic-eng"
+ * - LANG_TRANSCR_IPA: "transcr-ipa"
+ * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be
+ * `nullptr`.
+ *
+ * @param language Language enum
+ * @return Language ISO 639-2/B alpha-3 code
+ */
+const char* NameByLanguage(ELanguage language);
+
+/**
+ * Converts language enum to corresponding ISO 639-1 alpha-2 code. For languages missing in ISO
+ * standard the conversions are:
+ * - LANG_UNK: "mis"
+ * - LANG_BASIC_RUS: "bas-ru"
+ * - LANG_EMPTY: ""
+ * - LANG_UNK_LAT: ""
+ * - LANG_UNK_CYR: ""
+ * - LANG_UNK_ALPHA: ""
+ * - LANG_BASIC_ENG: "bas-en"
+ * - LANG_TRANSCR_IPA: "tr-ipa"
+ * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be
+ * `nullptr`.
+ *
+ * @param language Language enum
+ * @return Language ISO 639-1 alpha-2 code
+ */
+const char* IsoNameByLanguage(ELanguage language);
+
+/**
+ * Converts language enum to corresponding human-readable language name. E.g. "Russian" for
+ * `LANG_RUS` or "Basic Russian" for `LANG_BASIC_RUS`. If language is missing in `ELanguage` or if
+ * it is a `LANG_MAX` then return value will be `nullptr`.
+ *
+ * @param language Language enum
+ * @return Human-readable language name
+ */
+const char* FullNameByLanguage(ELanguage language);
+
+/**
+ * Same as `LanguageByNameStrict` but in case of failure will throw `yexception`.
+ *
+ * @see LanguageByNameStrict
+ */
+ELanguage LanguageByNameOrDie(const TStringBuf& name);
+
+/**
+ * Tells whether `language` is one of the "no concrete language" placeholders:
+ * `LANG_UNK`, `LANG_UNK_LAT`, `LANG_UNK_CYR`, `LANG_UNK_ALPHA` or `LANG_EMPTY`.
+ */
+constexpr bool UnknownLanguage(const ELanguage language) noexcept {
+    return language == LANG_UNK || language == LANG_UNK_LAT || language == LANG_UNK_CYR || language == LANG_UNK_ALPHA || language == LANG_EMPTY;
+}
+
+/**
+ * Returns the writing system associated with a language.
+ *
+ * @param language Language enum; expected to be a valid value below `LANG_MAX`
+ */
+EScript ScriptByLanguage(ELanguage language);
+
+/**
+ * Returns the writing system of a single code point.
+ *
+ * @param glyph Code point to classify
+ */
+EScript ScriptByGlyph(wchar32 glyph);
+
+namespace NCharsetInternal {
+    // Fills `data[0..len)` for the glyph -> script lookup used by `ScriptByGlyph`.
+    // NOTE(review): the definition is not in this header (presumably in
+    // generated/uniscripts.cpp) -- confirm the exact contents written.
+    void InitScriptData(ui8 data[], size_t len);
+}
+
+// True when the script of `language` is SCRIPT_LATIN.
+inline bool LatinScript(ELanguage language) {
+    return ScriptByLanguage(language) == SCRIPT_LATIN;
+}
+
+// True when the script of `language` is SCRIPT_CYRILLIC.
+inline bool CyrillicScript(ELanguage language) {
+    return ScriptByLanguage(language) == SCRIPT_CYRILLIC;
+}
diff --git a/library/cpp/langs/scripts.cpp b/library/cpp/langs/scripts.cpp
new file mode 100644
index 0000000000..41cc91d3ce
--- /dev/null
+++ b/library/cpp/langs/scripts.cpp
@@ -0,0 +1,158 @@
+#include "scripts.h"
+
+#include <library/cpp/digest/lower_case/hash_ops.h>
+
+#include <util/generic/hash.h>
+#include <util/generic/singleton.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/yexception.h>
+#include <util/system/defaults.h>
+
+#include <array>
+
+namespace {
+    // One row of the script table: the enum value and the two names it is known by.
+    struct TScriptNameAndEnum {
+        EScript Script;
+        const char* EnglishName;
+        const char* IsoName;
+    };
+
+    // Source of truth for script names; the static_assert below ties its
+    // length to SCRIPT_MAX.
+    const TScriptNameAndEnum ScriptNameAndEnum[] = {
+        {SCRIPT_UNKNOWN, "Unknown", "Zzzz"},
+        {SCRIPT_LATIN, "Latin", "Latn"},
+        {SCRIPT_CYRILLIC, "Cyrillic", "Cyrl"},
+
+        {SCRIPT_GREEK, "Greek", "Grek"},
+        {SCRIPT_ARABIC, "Arabic", "Arab"},
+        {SCRIPT_HEBREW, "Hebrew", "Hebr"},
+        {SCRIPT_ARMENIAN, "Armenian", "Armn"},
+        {SCRIPT_GEORGIAN, "Georgian", "Geor"},
+
+        {SCRIPT_HAN, "Han", "Hans"}, // We use more common Simplified variant (as opposed to Traditional 'Hant')
+        {SCRIPT_KATAKANA, "Katakana", "Kana"},
+        {SCRIPT_HIRAGANA, "Hiragana", "Hira"},
+        {SCRIPT_HANGUL, "Hangul", "Hang"},
+
+        {SCRIPT_DEVANAGARI, "Devanagari", "Deva"},
+        {SCRIPT_BENGALI, "Bengali", "Beng"},
+        {SCRIPT_GUJARATI, "Gujarati", "Gujr"},
+        {SCRIPT_GURMUKHI, "Gurmukhi", "Guru"},
+        {SCRIPT_KANNADA, "Kannada", "Knda"},
+        {SCRIPT_MALAYALAM, "Malayalam", "Mlym"},
+        {SCRIPT_ORIYA, "Oriya", "Orya"},
+        {SCRIPT_TAMIL, "Tamil", "Taml"},
+        {SCRIPT_TELUGU, "Telugu", "Telu"},
+        {SCRIPT_THAANA, "Thaana", "Thaa"},
+        {SCRIPT_SINHALA, "Sinhala", "Sinh"},
+
+        {SCRIPT_MYANMAR, "Myanmar", "Mymr"},
+        {SCRIPT_THAI, "Thai", "Thai"},
+        {SCRIPT_LAO, "Lao", "Laoo"},
+        {SCRIPT_KHMER, "Khmer", "Khmr"},
+        {SCRIPT_TIBETAN, "Tibetan", "Tibt"},
+        {SCRIPT_MONGOLIAN, "Mongolian", "Mong"},
+
+        {SCRIPT_ETHIOPIC, "Ethiopic", "Ethi"},
+        {SCRIPT_RUNIC, "Runic", "Runr"},
+        {SCRIPT_COPTIC, "Coptic", "Copt"},
+        {SCRIPT_SYRIAC, "Syriac", "Syrc"},
+
+        {SCRIPT_OTHER, "Other", "Zyyy"},
+    };
+
+    static_assert(static_cast<size_t>(SCRIPT_MAX) == Y_ARRAY_SIZE(ScriptNameAndEnum), "Size doesn't match");
+
+    // Lookup structures built once from ScriptNameAndEnum:
+    // name -> script (hash) and script -> name (two arrays indexed by EScript).
+    class TScriptsMap {
+    private:
+        // Placeholder stored in array slots that have not received a name.
+        static const char* const EMPTY_NAME;
+
+        // NOTE(review): TCIOps presumably makes hashing/comparison
+        // case-insensitive -- confirm in lower_case/hash_ops.h.
+        using TNamesHash = THashMap<TStringBuf, EScript, TCIOps, TCIOps>;
+        TNamesHash Hash;
+
+        using TNamesArray = std::array<const char*, static_cast<size_t>(SCRIPT_MAX)>;
+        TNamesArray IsoNames;
+        TNamesArray FullNames;
+
+    private:
+        // Registers name -> script. A name that is already present is left
+        // untouched; debug builds assert that it maps to the same script.
+        void AddNameToHash(const TStringBuf& name, EScript script) {
+            // Single lookup; the iterator serves both the check and the assert.
+            const auto found = Hash.find(name);
+            if (found != Hash.end()) {
+                Y_ASSERT(found->second == script);
+                return;
+            }
+
+            Hash[name] = script;
+        }
+
+        // Stores `name` in `names[script]` and registers it in the hash.
+        // Null or empty names are skipped.
+        void AddName(const char* name, EScript script, TNamesArray& names) {
+            // First-character test instead of strlen(): same result, no scan,
+            // and no reliance on <cstring> being pulled in by another header.
+            if (name == nullptr || *name == '\0')
+                return;
+
+            // Each slot may be filled only once.
+            Y_ASSERT(names[script] == EMPTY_NAME);
+            names[script] = name;
+
+            AddNameToHash(name, script);
+        }
+
+    public:
+        TScriptsMap() {
+            IsoNames.fill(EMPTY_NAME);
+            FullNames.fill(EMPTY_NAME);
+
+            for (const auto& val : ScriptNameAndEnum) {
+                EScript script = val.Script;
+
+                AddName(val.IsoName, script, IsoNames);
+                AddName(val.EnglishName, script, FullNames);
+            }
+        }
+
+    public:
+        // Returns the script registered under `name` (ISO code or English
+        // name), or `def` when `name` is empty or not registered.
+        inline EScript ScriptByName(const TStringBuf& name, EScript def) const {
+            if (!name)
+                return def;
+
+            TNamesHash::const_iterator i = Hash.find(name);
+            if (i == Hash.end()) {
+                return def;
+            }
+
+            return i->second;
+        }
+
+        // English name of `script`; nullptr when the value is outside the table.
+        inline const char* FullNameByScript(EScript script) const {
+            if (script < 0 || static_cast<size_t>(script) >= FullNames.size())
+                return nullptr;
+
+            return FullNames[script];
+        }
+
+        // ISO code of `script`; nullptr when the value is outside the table.
+        inline const char* IsoNameByScript(EScript script) const {
+            if (script < 0 || static_cast<size_t>(script) >= IsoNames.size())
+                return nullptr;
+
+            return IsoNames[script];
+        }
+    };
+}
+
+const char* const TScriptsMap::EMPTY_NAME = "";
+
+const char* FullNameByScript(EScript script) {
+    return Singleton<TScriptsMap>()->FullNameByScript(script);
+}
+
+const char* IsoNameByScript(EScript script) {
+    return Singleton<TScriptsMap>()->IsoNameByScript(script);
+}
+
+// Lenient lookup: empty or unregistered names yield SCRIPT_UNKNOWN.
+EScript ScriptByName(const TStringBuf& name) {
+    return Singleton<TScriptsMap>()->ScriptByName(name, SCRIPT_UNKNOWN);
+}
+
+// Strict lookup: throws yexception when `name` is empty or not registered.
+// SCRIPT_MAX is used as the "not found" marker because it never appears in the
+// table. SCRIPT_UNKNOWN cannot serve as the marker: "Unknown" and "Zzzz" are
+// registered names that legitimately resolve to it and must not throw.
+EScript ScriptByNameOrDie(const TStringBuf& name) {
+    const EScript result = Singleton<TScriptsMap>()->ScriptByName(name, SCRIPT_MAX);
+    if (result == SCRIPT_MAX) {
+        ythrow yexception() << "ScriptByNameOrDie: invalid script '" << name << "'";
+    }
+    return result;
+}
diff --git a/library/cpp/langs/scripts.h b/library/cpp/langs/scripts.h
new file mode 100644
index 0000000000..4c47a33d2c
--- /dev/null
+++ b/library/cpp/langs/scripts.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <util/generic/strbuf.h>
+
+// Writing systems, a.k.a. scripts
+//
+// Values are consecutive starting at 0; SCRIPT_MAX is the number of scripts
+// and is not a script itself.
+enum EScript {
+    SCRIPT_UNKNOWN = 0,
+    SCRIPT_LATIN,
+    SCRIPT_CYRILLIC,
+
+    SCRIPT_GREEK,
+    SCRIPT_ARABIC,
+    SCRIPT_HEBREW,
+    SCRIPT_ARMENIAN,
+    SCRIPT_GEORGIAN,
+
+    SCRIPT_HAN,
+    SCRIPT_KATAKANA,
+    SCRIPT_HIRAGANA,
+    SCRIPT_HANGUL,
+
+    SCRIPT_DEVANAGARI,
+    SCRIPT_BENGALI,
+    SCRIPT_GUJARATI,
+    SCRIPT_GURMUKHI,
+    SCRIPT_KANNADA,
+    SCRIPT_MALAYALAM,
+    SCRIPT_ORIYA,
+    SCRIPT_TAMIL,
+    SCRIPT_TELUGU,
+    SCRIPT_THAANA,
+    SCRIPT_SINHALA,
+
+    SCRIPT_MYANMAR,
+    SCRIPT_THAI,
+    SCRIPT_LAO,
+    SCRIPT_KHMER,
+    SCRIPT_TIBETAN,
+    SCRIPT_MONGOLIAN,
+
+    SCRIPT_ETHIOPIC,
+    SCRIPT_RUNIC,
+    SCRIPT_COPTIC,
+    SCRIPT_SYRIAC,
+
+    SCRIPT_OTHER,
+    SCRIPT_MAX
+};
+
+// According to ISO 15924 codes. See https://en.wikipedia.org/wiki/ISO_15924
+//
+// ScriptByName:      ISO code or English name -> script; SCRIPT_UNKNOWN when
+//                    the name is empty or not registered.
+// ScriptByNameOrDie: same lookup, but throws yexception for an empty or
+//                    unregistered name.
+// IsoNameByScript /
+// FullNameByScript:  script -> ISO code / English name; nullptr when the value
+//                    is outside [0, SCRIPT_MAX).
+EScript ScriptByName(const TStringBuf& name);
+EScript ScriptByNameOrDie(const TStringBuf& name);
+const char* IsoNameByScript(EScript script);
+const char* FullNameByScript(EScript script);
|