diff options
author | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
commit | b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch) | |
tree | 2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/langs | |
parent | 559174a9144de40d6bb3997ea4073c82289b4974 (diff) | |
download | ydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz |
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/langs')
-rw-r--r-- | library/cpp/langs/README.md | 8 | ||||
-rw-r--r-- | library/cpp/langs/generated/uniscripts.cpp | 458 | ||||
-rw-r--r-- | library/cpp/langs/langs.cpp | 330 | ||||
-rw-r--r-- | library/cpp/langs/langs.h | 229 | ||||
-rw-r--r-- | library/cpp/langs/scripts.cpp | 158 | ||||
-rw-r--r-- | library/cpp/langs/scripts.h | 56 |
6 files changed, 0 insertions, 1239 deletions
diff --git a/library/cpp/langs/README.md b/library/cpp/langs/README.md deleted file mode 100644 index 537ae31e1bb..00000000000 --- a/library/cpp/langs/README.md +++ /dev/null @@ -1,8 +0,0 @@ -Здесь описаны константы для [языков](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h) и [письменностей](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/scripts.h) (скриптов в терминах Unicode). - -В терминах этих констант языков работают [документная](https://a.yandex-team.ru/arc/trunk/arcadia/kernel/recshell/recshell.h) и [запросная](https://a.yandex-team.ru/arc/trunk/arcadia/dict/recognize/queryrec) распознавалки языка. - -Имеется [набор функций](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h?rev=r6909333#L142-214) для преобразования констант в двухбуквенный или трехбуквенный код и обратного получения константы по строке с учетом синонимов. Есть [функции](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs/langs.h?rev=r6909333#L216-217) для определения письменности по языку и по символу). - -В списке констант представлены не все языки и письменности, а лишь те, которые представляли интерес для поиска Яндекса и машинного перевода. -Имеется несколько псевдоязыков типа `LANG_UZB_CYR` или `LANG_KAZ_LAT`. 
diff --git a/library/cpp/langs/generated/uniscripts.cpp b/library/cpp/langs/generated/uniscripts.cpp deleted file mode 100644 index 59cc6a70c25..00000000000 --- a/library/cpp/langs/generated/uniscripts.cpp +++ /dev/null @@ -1,458 +0,0 @@ -// Generated from http://www.unicode.org/Public/UNIDATA/Scripts.txt -// The best way to alter this file is to modify uniscripts.py -#include <library/cpp/langs/langs.h> -#include <util/system/yassert.h> - -#include <cstring> - -namespace NCharsetInternal { - struct TScriptRange { - EScript Script; - wchar32 Start; - wchar32 End; - }; - - const TScriptRange ScriptRanges[] = { - { SCRIPT_ETHIOPIC, 0x1200, 0x1248 }, - { SCRIPT_ETHIOPIC, 0x124A, 0x124D }, - { SCRIPT_ETHIOPIC, 0x1250, 0x1256 }, - { SCRIPT_ETHIOPIC, 0x1258, 0x1258 }, - { SCRIPT_ETHIOPIC, 0x125A, 0x125D }, - { SCRIPT_ETHIOPIC, 0x1260, 0x1288 }, - { SCRIPT_ETHIOPIC, 0x128A, 0x128D }, - { SCRIPT_ETHIOPIC, 0x1290, 0x12B0 }, - { SCRIPT_ETHIOPIC, 0x12B2, 0x12B5 }, - { SCRIPT_ETHIOPIC, 0x12B8, 0x12BE }, - { SCRIPT_ETHIOPIC, 0x12C0, 0x12C0 }, - { SCRIPT_ETHIOPIC, 0x12C2, 0x12C5 }, - { SCRIPT_ETHIOPIC, 0x12C8, 0x12D6 }, - { SCRIPT_ETHIOPIC, 0x12D8, 0x1310 }, - { SCRIPT_ETHIOPIC, 0x1312, 0x1315 }, - { SCRIPT_ETHIOPIC, 0x1318, 0x135A }, - { SCRIPT_ETHIOPIC, 0x135D, 0x137C }, - { SCRIPT_ETHIOPIC, 0x1380, 0x1399 }, - { SCRIPT_ETHIOPIC, 0x2D80, 0x2D96 }, - { SCRIPT_ETHIOPIC, 0x2DA0, 0x2DA6 }, - { SCRIPT_ETHIOPIC, 0x2DA8, 0x2DAE }, - { SCRIPT_ETHIOPIC, 0x2DB0, 0x2DB6 }, - { SCRIPT_ETHIOPIC, 0x2DB8, 0x2DBE }, - { SCRIPT_ETHIOPIC, 0x2DC0, 0x2DC6 }, - { SCRIPT_ETHIOPIC, 0x2DC8, 0x2DCE }, - { SCRIPT_ETHIOPIC, 0x2DD0, 0x2DD6 }, - { SCRIPT_ETHIOPIC, 0x2DD8, 0x2DDE }, - { SCRIPT_ETHIOPIC, 0xAB01, 0xAB06 }, - { SCRIPT_ETHIOPIC, 0xAB09, 0xAB0E }, - { SCRIPT_ETHIOPIC, 0xAB11, 0xAB16 }, - { SCRIPT_ETHIOPIC, 0xAB20, 0xAB26 }, - { SCRIPT_ETHIOPIC, 0xAB28, 0xAB2E }, - { SCRIPT_ARABIC, 0x600, 0x604 }, - { SCRIPT_ARABIC, 0x606, 0x60B }, - { SCRIPT_ARABIC, 0x60D, 0x61A }, - { SCRIPT_ARABIC, 0x61E, 
0x61E }, - { SCRIPT_ARABIC, 0x620, 0x63F }, - { SCRIPT_ARABIC, 0x641, 0x64A }, - { SCRIPT_ARABIC, 0x656, 0x66F }, - { SCRIPT_ARABIC, 0x671, 0x6DC }, - { SCRIPT_ARABIC, 0x6DE, 0x6FF }, - { SCRIPT_ARABIC, 0x750, 0x77F }, - { SCRIPT_ARABIC, 0x8A0, 0x8B4 }, - { SCRIPT_ARABIC, 0x8B6, 0x8BD }, - { SCRIPT_ARABIC, 0x8D4, 0x8E1 }, - { SCRIPT_ARABIC, 0x8E3, 0x8FF }, - { SCRIPT_ARABIC, 0xFB50, 0xFBC1 }, - { SCRIPT_ARABIC, 0xFBD3, 0xFD3D }, - { SCRIPT_ARABIC, 0xFD50, 0xFD8F }, - { SCRIPT_ARABIC, 0xFD92, 0xFDC7 }, - { SCRIPT_ARABIC, 0xFDF0, 0xFDFD }, - { SCRIPT_ARABIC, 0xFE70, 0xFE74 }, - { SCRIPT_ARABIC, 0xFE76, 0xFEFC }, - { SCRIPT_MONGOLIAN, 0x1800, 0x1801 }, - { SCRIPT_MONGOLIAN, 0x1804, 0x1804 }, - { SCRIPT_MONGOLIAN, 0x1806, 0x180E }, - { SCRIPT_MONGOLIAN, 0x1810, 0x1819 }, - { SCRIPT_MONGOLIAN, 0x1820, 0x1877 }, - { SCRIPT_MONGOLIAN, 0x1880, 0x18AA }, - { SCRIPT_TAMIL, 0xB82, 0xB83 }, - { SCRIPT_TAMIL, 0xB85, 0xB8A }, - { SCRIPT_TAMIL, 0xB8E, 0xB90 }, - { SCRIPT_TAMIL, 0xB92, 0xB95 }, - { SCRIPT_TAMIL, 0xB99, 0xB9A }, - { SCRIPT_TAMIL, 0xB9C, 0xB9C }, - { SCRIPT_TAMIL, 0xB9E, 0xB9F }, - { SCRIPT_TAMIL, 0xBA3, 0xBA4 }, - { SCRIPT_TAMIL, 0xBA8, 0xBAA }, - { SCRIPT_TAMIL, 0xBAE, 0xBB9 }, - { SCRIPT_TAMIL, 0xBBE, 0xBC2 }, - { SCRIPT_TAMIL, 0xBC6, 0xBC8 }, - { SCRIPT_TAMIL, 0xBCA, 0xBCD }, - { SCRIPT_TAMIL, 0xBD0, 0xBD0 }, - { SCRIPT_TAMIL, 0xBD7, 0xBD7 }, - { SCRIPT_TAMIL, 0xBE6, 0xBFA }, - { SCRIPT_GUJARATI, 0xA81, 0xA83 }, - { SCRIPT_GUJARATI, 0xA85, 0xA8D }, - { SCRIPT_GUJARATI, 0xA8F, 0xA91 }, - { SCRIPT_GUJARATI, 0xA93, 0xAA8 }, - { SCRIPT_GUJARATI, 0xAAA, 0xAB0 }, - { SCRIPT_GUJARATI, 0xAB2, 0xAB3 }, - { SCRIPT_GUJARATI, 0xAB5, 0xAB9 }, - { SCRIPT_GUJARATI, 0xABC, 0xAC5 }, - { SCRIPT_GUJARATI, 0xAC7, 0xAC9 }, - { SCRIPT_GUJARATI, 0xACB, 0xACD }, - { SCRIPT_GUJARATI, 0xAD0, 0xAD0 }, - { SCRIPT_GUJARATI, 0xAE0, 0xAE3 }, - { SCRIPT_GUJARATI, 0xAE6, 0xAF1 }, - { SCRIPT_GUJARATI, 0xAF9, 0xAF9 }, - { SCRIPT_MALAYALAM, 0xD01, 0xD03 }, - { SCRIPT_MALAYALAM, 0xD05, 0xD0C }, - { 
SCRIPT_MALAYALAM, 0xD0E, 0xD10 }, - { SCRIPT_MALAYALAM, 0xD12, 0xD3A }, - { SCRIPT_MALAYALAM, 0xD3D, 0xD44 }, - { SCRIPT_MALAYALAM, 0xD46, 0xD48 }, - { SCRIPT_MALAYALAM, 0xD4A, 0xD4F }, - { SCRIPT_MALAYALAM, 0xD54, 0xD63 }, - { SCRIPT_MALAYALAM, 0xD66, 0xD7F }, - { SCRIPT_ARMENIAN, 0x531, 0x556 }, - { SCRIPT_ARMENIAN, 0x559, 0x55F }, - { SCRIPT_ARMENIAN, 0x561, 0x587 }, - { SCRIPT_ARMENIAN, 0x58A, 0x58A }, - { SCRIPT_ARMENIAN, 0x58D, 0x58F }, - { SCRIPT_ARMENIAN, 0xFB13, 0xFB17 }, - { SCRIPT_HANGUL, 0x1100, 0x11FF }, - { SCRIPT_HANGUL, 0x302E, 0x302F }, - { SCRIPT_HANGUL, 0x3131, 0x318E }, - { SCRIPT_HANGUL, 0x3200, 0x321E }, - { SCRIPT_HANGUL, 0x3260, 0x327E }, - { SCRIPT_HANGUL, 0xA960, 0xA97C }, - { SCRIPT_HANGUL, 0xAC00, 0xD7A3 }, - { SCRIPT_HANGUL, 0xD7B0, 0xD7C6 }, - { SCRIPT_HANGUL, 0xD7CB, 0xD7FB }, - { SCRIPT_HANGUL, 0xFFA0, 0xFFBE }, - { SCRIPT_HANGUL, 0xFFC2, 0xFFC7 }, - { SCRIPT_HANGUL, 0xFFCA, 0xFFCF }, - { SCRIPT_HANGUL, 0xFFD2, 0xFFD7 }, - { SCRIPT_HANGUL, 0xFFDA, 0xFFDC }, - { SCRIPT_GURMUKHI, 0xA01, 0xA03 }, - { SCRIPT_GURMUKHI, 0xA05, 0xA0A }, - { SCRIPT_GURMUKHI, 0xA0F, 0xA10 }, - { SCRIPT_GURMUKHI, 0xA13, 0xA28 }, - { SCRIPT_GURMUKHI, 0xA2A, 0xA30 }, - { SCRIPT_GURMUKHI, 0xA32, 0xA33 }, - { SCRIPT_GURMUKHI, 0xA35, 0xA36 }, - { SCRIPT_GURMUKHI, 0xA38, 0xA39 }, - { SCRIPT_GURMUKHI, 0xA3C, 0xA3C }, - { SCRIPT_GURMUKHI, 0xA3E, 0xA42 }, - { SCRIPT_GURMUKHI, 0xA47, 0xA48 }, - { SCRIPT_GURMUKHI, 0xA4B, 0xA4D }, - { SCRIPT_GURMUKHI, 0xA51, 0xA51 }, - { SCRIPT_GURMUKHI, 0xA59, 0xA5C }, - { SCRIPT_GURMUKHI, 0xA5E, 0xA5E }, - { SCRIPT_GURMUKHI, 0xA66, 0xA75 }, - { SCRIPT_CYRILLIC, 0x400, 0x484 }, - { SCRIPT_CYRILLIC, 0x487, 0x52F }, - { SCRIPT_CYRILLIC, 0x1C80, 0x1C88 }, - { SCRIPT_CYRILLIC, 0x1D2B, 0x1D2B }, - { SCRIPT_CYRILLIC, 0x1D78, 0x1D78 }, - { SCRIPT_CYRILLIC, 0x2DE0, 0x2DFF }, - { SCRIPT_CYRILLIC, 0xA640, 0xA69F }, - { SCRIPT_CYRILLIC, 0xFE2E, 0xFE2F }, - { SCRIPT_DEVANAGARI, 0x900, 0x950 }, - { SCRIPT_DEVANAGARI, 0x953, 0x963 }, - { 
SCRIPT_DEVANAGARI, 0x966, 0x97F }, - { SCRIPT_DEVANAGARI, 0xA8E0, 0xA8FD }, - { SCRIPT_HEBREW, 0x591, 0x5C7 }, - { SCRIPT_HEBREW, 0x5D0, 0x5EA }, - { SCRIPT_HEBREW, 0x5F0, 0x5F4 }, - { SCRIPT_HEBREW, 0xFB1D, 0xFB36 }, - { SCRIPT_HEBREW, 0xFB38, 0xFB3C }, - { SCRIPT_HEBREW, 0xFB3E, 0xFB3E }, - { SCRIPT_HEBREW, 0xFB40, 0xFB41 }, - { SCRIPT_HEBREW, 0xFB43, 0xFB44 }, - { SCRIPT_HEBREW, 0xFB46, 0xFB4F }, - { SCRIPT_THAI, 0xE01, 0xE3A }, - { SCRIPT_THAI, 0xE40, 0xE5B }, - { SCRIPT_SYRIAC, 0x700, 0x70D }, - { SCRIPT_SYRIAC, 0x70F, 0x74A }, - { SCRIPT_SYRIAC, 0x74D, 0x74F }, - { SCRIPT_KANNADA, 0xC80, 0xC83 }, - { SCRIPT_KANNADA, 0xC85, 0xC8C }, - { SCRIPT_KANNADA, 0xC8E, 0xC90 }, - { SCRIPT_KANNADA, 0xC92, 0xCA8 }, - { SCRIPT_KANNADA, 0xCAA, 0xCB3 }, - { SCRIPT_KANNADA, 0xCB5, 0xCB9 }, - { SCRIPT_KANNADA, 0xCBC, 0xCC4 }, - { SCRIPT_KANNADA, 0xCC6, 0xCC8 }, - { SCRIPT_KANNADA, 0xCCA, 0xCCD }, - { SCRIPT_KANNADA, 0xCD5, 0xCD6 }, - { SCRIPT_KANNADA, 0xCDE, 0xCDE }, - { SCRIPT_KANNADA, 0xCE0, 0xCE3 }, - { SCRIPT_KANNADA, 0xCE6, 0xCEF }, - { SCRIPT_KANNADA, 0xCF1, 0xCF2 }, - { SCRIPT_LAO, 0xE81, 0xE82 }, - { SCRIPT_LAO, 0xE84, 0xE84 }, - { SCRIPT_LAO, 0xE87, 0xE88 }, - { SCRIPT_LAO, 0xE8A, 0xE8A }, - { SCRIPT_LAO, 0xE8D, 0xE8D }, - { SCRIPT_LAO, 0xE94, 0xE97 }, - { SCRIPT_LAO, 0xE99, 0xE9F }, - { SCRIPT_LAO, 0xEA1, 0xEA3 }, - { SCRIPT_LAO, 0xEA5, 0xEA5 }, - { SCRIPT_LAO, 0xEA7, 0xEA7 }, - { SCRIPT_LAO, 0xEAA, 0xEAB }, - { SCRIPT_LAO, 0xEAD, 0xEB9 }, - { SCRIPT_LAO, 0xEBB, 0xEBD }, - { SCRIPT_LAO, 0xEC0, 0xEC4 }, - { SCRIPT_LAO, 0xEC6, 0xEC6 }, - { SCRIPT_LAO, 0xEC8, 0xECD }, - { SCRIPT_LAO, 0xED0, 0xED9 }, - { SCRIPT_LAO, 0xEDC, 0xEDF }, - { SCRIPT_TELUGU, 0xC00, 0xC03 }, - { SCRIPT_TELUGU, 0xC05, 0xC0C }, - { SCRIPT_TELUGU, 0xC0E, 0xC10 }, - { SCRIPT_TELUGU, 0xC12, 0xC28 }, - { SCRIPT_TELUGU, 0xC2A, 0xC39 }, - { SCRIPT_TELUGU, 0xC3D, 0xC44 }, - { SCRIPT_TELUGU, 0xC46, 0xC48 }, - { SCRIPT_TELUGU, 0xC4A, 0xC4D }, - { SCRIPT_TELUGU, 0xC55, 0xC56 }, - { SCRIPT_TELUGU, 0xC58, 
0xC5A }, - { SCRIPT_TELUGU, 0xC60, 0xC63 }, - { SCRIPT_TELUGU, 0xC66, 0xC6F }, - { SCRIPT_TELUGU, 0xC78, 0xC7F }, - { SCRIPT_KHMER, 0x1780, 0x17DD }, - { SCRIPT_KHMER, 0x17E0, 0x17E9 }, - { SCRIPT_KHMER, 0x17F0, 0x17F9 }, - { SCRIPT_KHMER, 0x19E0, 0x19FF }, - { SCRIPT_LATIN, 0x41, 0x5A }, - { SCRIPT_LATIN, 0x61, 0x7A }, - { SCRIPT_LATIN, 0xAA, 0xAA }, - { SCRIPT_LATIN, 0xBA, 0xBA }, - { SCRIPT_LATIN, 0xC0, 0xD6 }, - { SCRIPT_LATIN, 0xD8, 0xF6 }, - { SCRIPT_LATIN, 0xF8, 0x2B8 }, - { SCRIPT_LATIN, 0x2E0, 0x2E4 }, - { SCRIPT_LATIN, 0x1D00, 0x1D25 }, - { SCRIPT_LATIN, 0x1D2C, 0x1D5C }, - { SCRIPT_LATIN, 0x1D62, 0x1D65 }, - { SCRIPT_LATIN, 0x1D6B, 0x1D77 }, - { SCRIPT_LATIN, 0x1D79, 0x1DBE }, - { SCRIPT_LATIN, 0x1E00, 0x1EFF }, - { SCRIPT_LATIN, 0x2071, 0x2071 }, - { SCRIPT_LATIN, 0x207F, 0x207F }, - { SCRIPT_LATIN, 0x2090, 0x209C }, - { SCRIPT_LATIN, 0x212A, 0x212B }, - { SCRIPT_LATIN, 0x2132, 0x2132 }, - { SCRIPT_LATIN, 0x214E, 0x214E }, - { SCRIPT_LATIN, 0x2160, 0x2188 }, - { SCRIPT_LATIN, 0x2C60, 0x2C7F }, - { SCRIPT_LATIN, 0xA722, 0xA787 }, - { SCRIPT_LATIN, 0xA78B, 0xA7AE }, - { SCRIPT_LATIN, 0xA7B0, 0xA7B7 }, - { SCRIPT_LATIN, 0xA7F7, 0xA7FF }, - { SCRIPT_LATIN, 0xAB30, 0xAB5A }, - { SCRIPT_LATIN, 0xAB5C, 0xAB64 }, - { SCRIPT_LATIN, 0xFB00, 0xFB06 }, - { SCRIPT_LATIN, 0xFF21, 0xFF3A }, - { SCRIPT_LATIN, 0xFF41, 0xFF5A }, - { SCRIPT_TIBETAN, 0xF00, 0xF47 }, - { SCRIPT_TIBETAN, 0xF49, 0xF6C }, - { SCRIPT_TIBETAN, 0xF71, 0xF97 }, - { SCRIPT_TIBETAN, 0xF99, 0xFBC }, - { SCRIPT_TIBETAN, 0xFBE, 0xFCC }, - { SCRIPT_TIBETAN, 0xFCE, 0xFD4 }, - { SCRIPT_TIBETAN, 0xFD9, 0xFDA }, - { SCRIPT_MYANMAR, 0x1000, 0x109F }, - { SCRIPT_MYANMAR, 0xA9E0, 0xA9FE }, - { SCRIPT_MYANMAR, 0xAA60, 0xAA7F }, - { SCRIPT_OTHER, 0x2EA, 0x2EB }, - { SCRIPT_OTHER, 0x7C0, 0x7FA }, - { SCRIPT_OTHER, 0x800, 0x82D }, - { SCRIPT_OTHER, 0x830, 0x83E }, - { SCRIPT_OTHER, 0x840, 0x85B }, - { SCRIPT_OTHER, 0x85E, 0x85E }, - { SCRIPT_OTHER, 0x13A0, 0x13F5 }, - { SCRIPT_OTHER, 0x13F8, 0x13FD }, - { 
SCRIPT_OTHER, 0x1400, 0x169C }, - { SCRIPT_OTHER, 0x1700, 0x170C }, - { SCRIPT_OTHER, 0x170E, 0x1714 }, - { SCRIPT_OTHER, 0x1720, 0x1734 }, - { SCRIPT_OTHER, 0x1740, 0x1753 }, - { SCRIPT_OTHER, 0x1760, 0x176C }, - { SCRIPT_OTHER, 0x176E, 0x1770 }, - { SCRIPT_OTHER, 0x1772, 0x1773 }, - { SCRIPT_OTHER, 0x18B0, 0x18F5 }, - { SCRIPT_OTHER, 0x1900, 0x191E }, - { SCRIPT_OTHER, 0x1920, 0x192B }, - { SCRIPT_OTHER, 0x1930, 0x193B }, - { SCRIPT_OTHER, 0x1940, 0x1940 }, - { SCRIPT_OTHER, 0x1944, 0x196D }, - { SCRIPT_OTHER, 0x1970, 0x1974 }, - { SCRIPT_OTHER, 0x1980, 0x19AB }, - { SCRIPT_OTHER, 0x19B0, 0x19C9 }, - { SCRIPT_OTHER, 0x19D0, 0x19DA }, - { SCRIPT_OTHER, 0x19DE, 0x19DF }, - { SCRIPT_OTHER, 0x1A00, 0x1A1B }, - { SCRIPT_OTHER, 0x1A1E, 0x1A5E }, - { SCRIPT_OTHER, 0x1A60, 0x1A7C }, - { SCRIPT_OTHER, 0x1A7F, 0x1A89 }, - { SCRIPT_OTHER, 0x1A90, 0x1A99 }, - { SCRIPT_OTHER, 0x1AA0, 0x1AAD }, - { SCRIPT_OTHER, 0x1B00, 0x1B4B }, - { SCRIPT_OTHER, 0x1B50, 0x1B7C }, - { SCRIPT_OTHER, 0x1B80, 0x1BF3 }, - { SCRIPT_OTHER, 0x1BFC, 0x1C37 }, - { SCRIPT_OTHER, 0x1C3B, 0x1C49 }, - { SCRIPT_OTHER, 0x1C4D, 0x1C7F }, - { SCRIPT_OTHER, 0x1CC0, 0x1CC7 }, - { SCRIPT_OTHER, 0x2800, 0x28FF }, - { SCRIPT_OTHER, 0x2C00, 0x2C2E }, - { SCRIPT_OTHER, 0x2C30, 0x2C5E }, - { SCRIPT_OTHER, 0x2D30, 0x2D67 }, - { SCRIPT_OTHER, 0x2D6F, 0x2D70 }, - { SCRIPT_OTHER, 0x2D7F, 0x2D7F }, - { SCRIPT_OTHER, 0x3105, 0x312D }, - { SCRIPT_OTHER, 0x31A0, 0x31BA }, - { SCRIPT_OTHER, 0xA000, 0xA48C }, - { SCRIPT_OTHER, 0xA490, 0xA4C6 }, - { SCRIPT_OTHER, 0xA4D0, 0xA62B }, - { SCRIPT_OTHER, 0xA6A0, 0xA6F7 }, - { SCRIPT_OTHER, 0xA800, 0xA82B }, - { SCRIPT_OTHER, 0xA840, 0xA877 }, - { SCRIPT_OTHER, 0xA880, 0xA8C5 }, - { SCRIPT_OTHER, 0xA8CE, 0xA8D9 }, - { SCRIPT_OTHER, 0xA900, 0xA92D }, - { SCRIPT_OTHER, 0xA92F, 0xA953 }, - { SCRIPT_OTHER, 0xA95F, 0xA95F }, - { SCRIPT_OTHER, 0xA980, 0xA9CD }, - { SCRIPT_OTHER, 0xA9D0, 0xA9D9 }, - { SCRIPT_OTHER, 0xA9DE, 0xA9DF }, - { SCRIPT_OTHER, 0xAA00, 0xAA36 }, - { SCRIPT_OTHER, 
0xAA40, 0xAA4D }, - { SCRIPT_OTHER, 0xAA50, 0xAA59 }, - { SCRIPT_OTHER, 0xAA5C, 0xAA5F }, - { SCRIPT_OTHER, 0xAA80, 0xAAC2 }, - { SCRIPT_OTHER, 0xAADB, 0xAAF6 }, - { SCRIPT_OTHER, 0xAB70, 0xABED }, - { SCRIPT_OTHER, 0xABF0, 0xABF9 }, - { SCRIPT_HAN, 0x2E80, 0x2E99 }, - { SCRIPT_HAN, 0x2E9B, 0x2EF3 }, - { SCRIPT_HAN, 0x2F00, 0x2FD5 }, - { SCRIPT_HAN, 0x3005, 0x3005 }, - { SCRIPT_HAN, 0x3007, 0x3007 }, - { SCRIPT_HAN, 0x3021, 0x3029 }, - { SCRIPT_HAN, 0x3038, 0x303B }, - { SCRIPT_HAN, 0x3400, 0x4DB5 }, - { SCRIPT_HAN, 0x4E00, 0x9FD5 }, - { SCRIPT_HAN, 0xF900, 0xFA6D }, - { SCRIPT_HAN, 0xFA70, 0xFAD9 }, - { SCRIPT_THAANA, 0x780, 0x7B1 }, - { SCRIPT_HIRAGANA, 0x3041, 0x3096 }, - { SCRIPT_HIRAGANA, 0x309D, 0x309F }, - { SCRIPT_KATAKANA, 0x30A1, 0x30FA }, - { SCRIPT_KATAKANA, 0x30FD, 0x30FF }, - { SCRIPT_KATAKANA, 0x31F0, 0x31FF }, - { SCRIPT_KATAKANA, 0x32D0, 0x32FE }, - { SCRIPT_KATAKANA, 0x3300, 0x3357 }, - { SCRIPT_KATAKANA, 0xFF66, 0xFF6F }, - { SCRIPT_KATAKANA, 0xFF71, 0xFF9D }, - { SCRIPT_ORIYA, 0xB01, 0xB03 }, - { SCRIPT_ORIYA, 0xB05, 0xB0C }, - { SCRIPT_ORIYA, 0xB0F, 0xB10 }, - { SCRIPT_ORIYA, 0xB13, 0xB28 }, - { SCRIPT_ORIYA, 0xB2A, 0xB30 }, - { SCRIPT_ORIYA, 0xB32, 0xB33 }, - { SCRIPT_ORIYA, 0xB35, 0xB39 }, - { SCRIPT_ORIYA, 0xB3C, 0xB44 }, - { SCRIPT_ORIYA, 0xB47, 0xB48 }, - { SCRIPT_ORIYA, 0xB4B, 0xB4D }, - { SCRIPT_ORIYA, 0xB56, 0xB57 }, - { SCRIPT_ORIYA, 0xB5C, 0xB5D }, - { SCRIPT_ORIYA, 0xB5F, 0xB63 }, - { SCRIPT_ORIYA, 0xB66, 0xB77 }, - { SCRIPT_BENGALI, 0x980, 0x983 }, - { SCRIPT_BENGALI, 0x985, 0x98C }, - { SCRIPT_BENGALI, 0x98F, 0x990 }, - { SCRIPT_BENGALI, 0x993, 0x9A8 }, - { SCRIPT_BENGALI, 0x9AA, 0x9B0 }, - { SCRIPT_BENGALI, 0x9B2, 0x9B2 }, - { SCRIPT_BENGALI, 0x9B6, 0x9B9 }, - { SCRIPT_BENGALI, 0x9BC, 0x9C4 }, - { SCRIPT_BENGALI, 0x9C7, 0x9C8 }, - { SCRIPT_BENGALI, 0x9CB, 0x9CE }, - { SCRIPT_BENGALI, 0x9D7, 0x9D7 }, - { SCRIPT_BENGALI, 0x9DC, 0x9DD }, - { SCRIPT_BENGALI, 0x9DF, 0x9E3 }, - { SCRIPT_BENGALI, 0x9E6, 0x9FB }, - { SCRIPT_RUNIC, 0x16A0, 
0x16EA }, - { SCRIPT_RUNIC, 0x16EE, 0x16F8 }, - { SCRIPT_SINHALA, 0xD82, 0xD83 }, - { SCRIPT_SINHALA, 0xD85, 0xD96 }, - { SCRIPT_SINHALA, 0xD9A, 0xDB1 }, - { SCRIPT_SINHALA, 0xDB3, 0xDBB }, - { SCRIPT_SINHALA, 0xDBD, 0xDBD }, - { SCRIPT_SINHALA, 0xDC0, 0xDC6 }, - { SCRIPT_SINHALA, 0xDCA, 0xDCA }, - { SCRIPT_SINHALA, 0xDCF, 0xDD4 }, - { SCRIPT_SINHALA, 0xDD6, 0xDD6 }, - { SCRIPT_SINHALA, 0xDD8, 0xDDF }, - { SCRIPT_SINHALA, 0xDE6, 0xDEF }, - { SCRIPT_SINHALA, 0xDF2, 0xDF4 }, - { SCRIPT_COPTIC, 0x3E2, 0x3EF }, - { SCRIPT_COPTIC, 0x2C80, 0x2CF3 }, - { SCRIPT_COPTIC, 0x2CF9, 0x2CFF }, - { SCRIPT_GEORGIAN, 0x10A0, 0x10C5 }, - { SCRIPT_GEORGIAN, 0x10C7, 0x10C7 }, - { SCRIPT_GEORGIAN, 0x10CD, 0x10CD }, - { SCRIPT_GEORGIAN, 0x10D0, 0x10FA }, - { SCRIPT_GEORGIAN, 0x10FC, 0x10FF }, - { SCRIPT_GEORGIAN, 0x2D00, 0x2D25 }, - { SCRIPT_GEORGIAN, 0x2D27, 0x2D27 }, - { SCRIPT_GEORGIAN, 0x2D2D, 0x2D2D }, - { SCRIPT_GREEK, 0x370, 0x373 }, - { SCRIPT_GREEK, 0x375, 0x377 }, - { SCRIPT_GREEK, 0x37A, 0x37D }, - { SCRIPT_GREEK, 0x37F, 0x37F }, - { SCRIPT_GREEK, 0x384, 0x384 }, - { SCRIPT_GREEK, 0x386, 0x386 }, - { SCRIPT_GREEK, 0x388, 0x38A }, - { SCRIPT_GREEK, 0x38C, 0x38C }, - { SCRIPT_GREEK, 0x38E, 0x3A1 }, - { SCRIPT_GREEK, 0x3A3, 0x3E1 }, - { SCRIPT_GREEK, 0x3F0, 0x3FF }, - { SCRIPT_GREEK, 0x1D26, 0x1D2A }, - { SCRIPT_GREEK, 0x1D5D, 0x1D61 }, - { SCRIPT_GREEK, 0x1D66, 0x1D6A }, - { SCRIPT_GREEK, 0x1DBF, 0x1DBF }, - { SCRIPT_GREEK, 0x1F00, 0x1F15 }, - { SCRIPT_GREEK, 0x1F18, 0x1F1D }, - { SCRIPT_GREEK, 0x1F20, 0x1F45 }, - { SCRIPT_GREEK, 0x1F48, 0x1F4D }, - { SCRIPT_GREEK, 0x1F50, 0x1F57 }, - { SCRIPT_GREEK, 0x1F59, 0x1F59 }, - { SCRIPT_GREEK, 0x1F5B, 0x1F5B }, - { SCRIPT_GREEK, 0x1F5D, 0x1F5D }, - { SCRIPT_GREEK, 0x1F5F, 0x1F7D }, - { SCRIPT_GREEK, 0x1F80, 0x1FB4 }, - { SCRIPT_GREEK, 0x1FB6, 0x1FC4 }, - { SCRIPT_GREEK, 0x1FC6, 0x1FD3 }, - { SCRIPT_GREEK, 0x1FD6, 0x1FDB }, - { SCRIPT_GREEK, 0x1FDD, 0x1FEF }, - { SCRIPT_GREEK, 0x1FF2, 0x1FF4 }, - { SCRIPT_GREEK, 0x1FF6, 0x1FFE }, - { 
SCRIPT_GREEK, 0x2126, 0x2126 }, - { SCRIPT_GREEK, 0xAB65, 0xAB65 }, - }; - - void InitScriptData(ui8 data[], size_t len) { - memset (data, 0, len * sizeof(ui8)); - for (auto range : ScriptRanges) { - Y_ASSERT(range.Start <= range.End); - Y_ASSERT((unsigned)range.Script < 0x100); - size_t end = range.End; - if (end >= len) - end = len; - for (size_t j = range.Start; j <= end; ++j) { - data[j] = (ui8)range.Script; - } - } - } -} diff --git a/library/cpp/langs/langs.cpp b/library/cpp/langs/langs.cpp deleted file mode 100644 index 2c508e16022..00000000000 --- a/library/cpp/langs/langs.cpp +++ /dev/null @@ -1,330 +0,0 @@ -#include "langs.h" - -#include <library/cpp/digest/lower_case/hash_ops.h> - -#include <util/generic/array_size.h> -#include <util/generic/hash.h> -#include <util/generic/singleton.h> -#include <util/generic/strbuf.h> -#include <util/generic/yexception.h> -#include <util/system/defaults.h> - -#include <array> -#include <cctype> - -/* - * define language by ELanguage - */ - -namespace { - struct TLanguageNameAndEnum { - ELanguage Language; - EScript Script; - const char* EnglishName; - const char* BiblioName; - const char* IsoName; - const char* Synonyms; - }; - - const TLanguageNameAndEnum LanguageNameAndEnum[] = { - {LANG_UNK, SCRIPT_OTHER, "Unknown", "unk", "mis", nullptr}, - {LANG_RUS, SCRIPT_CYRILLIC, "Russian", "rus", "ru", "ru-RU"}, - {LANG_ENG, SCRIPT_LATIN, "English", "eng", "en", "en-US, en-GB, en-CA, en-NZ, en-AU"}, - {LANG_POL, SCRIPT_LATIN, "Polish", "pol", "pl", nullptr}, - {LANG_HUN, SCRIPT_LATIN, "Hungarian", "hun", "hu", nullptr}, - {LANG_UKR, SCRIPT_CYRILLIC, "Ukrainian", "ukr", "uk", "uk-UA"}, - {LANG_GER, SCRIPT_LATIN, "German", "ger", "de", "deu"}, - {LANG_FRE, SCRIPT_LATIN, "French", "fre", "fr", "fra, frn, fr-FR, fr-CA"}, - {LANG_TAT, SCRIPT_CYRILLIC, "Tatar", "tat", "tt", nullptr}, - {LANG_BEL, SCRIPT_CYRILLIC, "Belarusian", "bel", "be", "blr, Belorussian"}, - {LANG_KAZ, SCRIPT_CYRILLIC, "Kazakh", "kaz", "kk", "kk-Cyrl"}, - 
{LANG_ALB, SCRIPT_LATIN, "Albanian", "alb", "sq", nullptr}, - {LANG_SPA, SCRIPT_LATIN, "Spanish", "spa", "es", nullptr}, - {LANG_ITA, SCRIPT_LATIN, "Italian", "ita", "it", nullptr}, - {LANG_ARM, SCRIPT_ARMENIAN, "Armenian", "arm", "hy", "hye"}, - {LANG_DAN, SCRIPT_LATIN, "Danish", "dan", "da", nullptr}, - {LANG_POR, SCRIPT_LATIN, "Portuguese", "por", "pt", nullptr}, - {LANG_ICE, SCRIPT_LATIN, "Icelandic", "ice", "is", "isl"}, - {LANG_SLO, SCRIPT_LATIN, "Slovak", "slo", "sk", "slk"}, - {LANG_SLV, SCRIPT_LATIN, "Slovene", "slv", "sl", "Slovenian"}, - {LANG_DUT, SCRIPT_LATIN, "Dutch", "dut", "nl", "nld"}, - {LANG_BUL, SCRIPT_CYRILLIC, "Bulgarian", "bul", "bg", nullptr}, - {LANG_CAT, SCRIPT_LATIN, "Catalan", "cat", "ca", nullptr}, - {LANG_HRV, SCRIPT_LATIN, "Croatian", "hrv", "hr", "scr"}, - {LANG_CZE, SCRIPT_LATIN, "Czech", "cze", "cs", "ces"}, - {LANG_GRE, SCRIPT_GREEK, "Greek", "gre", "el", "ell"}, - {LANG_HEB, SCRIPT_HEBREW, "Hebrew", "heb", "he", "iw"}, // 'iw' is old ISO-639 code - {LANG_NOR, SCRIPT_LATIN, "Norwegian", "nor", "no", nullptr}, - {LANG_MAC, SCRIPT_CYRILLIC, "Macedonian", "mac", "mk", nullptr}, - {LANG_SWE, SCRIPT_LATIN, "Swedish", "swe", "sv", nullptr}, - {LANG_KOR, SCRIPT_HANGUL, "Korean", "kor", "ko", nullptr}, - {LANG_LAT, SCRIPT_LATIN, "Latin", "lat", "la", nullptr}, - {LANG_BASIC_RUS, SCRIPT_CYRILLIC, "Basic Russian", "basic-rus", "bas-ru", nullptr}, - {LANG_BOS, SCRIPT_LATIN, "Bosnian", "bos", "bs", nullptr}, - {LANG_MLT, SCRIPT_LATIN, "Maltese", "mlt", "mt", nullptr}, - - {LANG_EMPTY, SCRIPT_OTHER, "Empty", "empty", nullptr, nullptr}, - {LANG_UNK_LAT, SCRIPT_LATIN, "Unknown Latin", "unklat", nullptr, nullptr}, - {LANG_UNK_CYR, SCRIPT_CYRILLIC, "Unknown Cyrillic", "unkcyr", nullptr, nullptr}, - {LANG_UNK_ALPHA, SCRIPT_OTHER, "Unknown Alpha", "unkalpha", nullptr, nullptr}, - - {LANG_FIN, SCRIPT_LATIN, "Finnish", "fin", "fi", nullptr}, - {LANG_EST, SCRIPT_LATIN, "Estonian", "est", "et", nullptr}, - {LANG_LAV, SCRIPT_LATIN, "Latvian", "lav", 
"lv", nullptr}, - {LANG_LIT, SCRIPT_LATIN, "Lithuanian", "lit", "lt", nullptr}, - {LANG_BAK, SCRIPT_CYRILLIC, "Bashkir", "bak", "ba", nullptr}, - {LANG_TUR, SCRIPT_LATIN, "Turkish", "tur", "tr", nullptr}, - {LANG_RUM, SCRIPT_LATIN, "Romanian", "rum", "ro", "ron"}, - {LANG_MON, SCRIPT_CYRILLIC, "Mongolian", "mon", "mn", nullptr}, - {LANG_UZB, SCRIPT_LATIN, "Uzbek", "uzb", "uz", "uz-Latn"}, - {LANG_KIR, SCRIPT_CYRILLIC, "Kirghiz", "kir", "ky", "Kyrgyz"}, - {LANG_TGK, SCRIPT_CYRILLIC, "Tajik", "tgk", "tg", nullptr}, - {LANG_TUK, SCRIPT_LATIN, "Turkmen", "tuk", "tk", nullptr}, - {LANG_SRP, SCRIPT_CYRILLIC, "Serbian", "srp", "sr", nullptr}, - {LANG_AZE, SCRIPT_LATIN, "Azerbaijani", "aze", "az", "Azeri"}, - {LANG_BASIC_ENG, SCRIPT_LATIN, "Basic English", "basic-eng", "bas-en", nullptr}, - {LANG_GEO, SCRIPT_GEORGIAN, "Georgian", "geo", "ka", "kat"}, - {LANG_ARA, SCRIPT_ARABIC, "Arabic", "ara", "ar", nullptr}, - {LANG_PER, SCRIPT_ARABIC, "Persian", "per", "fa", "fas"}, - {LANG_CHU, SCRIPT_CYRILLIC, "Church Slavonic", "chu", "cu", nullptr}, - {LANG_CHI, SCRIPT_HAN, "Chinese", "chi", "zh", "zho"}, - {LANG_JPN, SCRIPT_HIRAGANA, "Japanese", "jpn", "ja", nullptr}, - {LANG_IND, SCRIPT_LATIN, "Indonesian", "ind", "id", "in"}, // 'in' is old ISO-639 code - {LANG_MAY, SCRIPT_LATIN, "Malay", "may", "ms", "msa"}, - {LANG_THA, SCRIPT_THAI, "Thai", "tha", "th", nullptr}, - {LANG_VIE, SCRIPT_LATIN, "Vietnamese", "vie", "vi", nullptr}, - {LANG_GLE, SCRIPT_LATIN, "Irish", "gle", "ga", nullptr}, - {LANG_TGL, SCRIPT_LATIN, "Tagalog", "tgl", "tl", "fil"}, - {LANG_HIN, SCRIPT_DEVANAGARI, "Hindi", "hin", "hi", nullptr}, - {LANG_AFR, SCRIPT_LATIN, "Afrikaans", "afr", "af", nullptr}, - {LANG_URD, SCRIPT_ARABIC, "Urdu", "urd", "ur", nullptr}, - {LANG_MYA, SCRIPT_MYANMAR, "Burmese", "mya", "my", nullptr}, - {LANG_KHM, SCRIPT_KHMER, "Khmer", "khm", "km", nullptr}, - {LANG_LAO, SCRIPT_LAO, "Lao", "lao", "lo", "Laotian, Laothian"}, - {LANG_TAM, SCRIPT_TAMIL, "Tamil", "tam", "ta", nullptr}, - 
{LANG_BEN, SCRIPT_BENGALI, "Bengali", "ben", "bn", nullptr}, - {LANG_GUJ, SCRIPT_GUJARATI, "Gujarati", "guj", "gu", nullptr}, - {LANG_KAN, SCRIPT_KANNADA, "Kannada", "kan", "kn", nullptr}, - {LANG_PAN, SCRIPT_GURMUKHI, "Punjabi", "pan", "pa", nullptr}, - {LANG_SIN, SCRIPT_SINHALA, "Sinhalese", "sin", "si", nullptr}, - {LANG_SWA, SCRIPT_LATIN, "Swahili", "swa", "sw", nullptr}, - {LANG_BAQ, SCRIPT_LATIN, "Basque", "baq", "eu", "eus"}, - {LANG_WEL, SCRIPT_LATIN, "Welsh", "wel", "cy", "cym"}, - {LANG_GLG, SCRIPT_LATIN, "Galician", "glg", "gl", nullptr}, - {LANG_HAT, SCRIPT_LATIN, "Haitian Creole", "hat", "ht", "Haitian"}, - {LANG_MLG, SCRIPT_LATIN, "Malagasy", "mlg", "mg", nullptr}, - {LANG_CHV, SCRIPT_CYRILLIC, "Chuvash", "chv", "cv", nullptr}, - {LANG_UDM, SCRIPT_CYRILLIC, "Udmurt", "udm", "udm", nullptr}, - {LANG_KPV, SCRIPT_CYRILLIC, "Komi-Zyrian", "kpv", "kv", "Komi, kom"}, - {LANG_MHR, SCRIPT_CYRILLIC, "Meadow Mari", "mhr", "mhr", "EasternMari, Mari, chm"}, - {LANG_SJN, SCRIPT_LATIN, "Sindarin", "sjn", "sjn", nullptr}, - {LANG_MRJ, SCRIPT_CYRILLIC, "Hill Mari", "mrj", "mrj", "WesternMari"}, - {LANG_KOI, SCRIPT_CYRILLIC, "Komi-Permyak", "koi", "koi", nullptr}, - {LANG_LTZ, SCRIPT_LATIN, "Luxembourgish", "ltz", "lb", "Luxemburgish"}, - {LANG_GLA, SCRIPT_LATIN, "Scottish Gaelic", "gla", "gd", "Gaelic"}, - {LANG_CEB, SCRIPT_LATIN, "Cebuano", "ceb", "ceb", "Bisaya, Binisaya, Visayan"}, - {LANG_PUS, SCRIPT_ARABIC, "Pashto", "pus", "ps", nullptr}, - {LANG_KMR, SCRIPT_LATIN, "Kurmanji", "kmr", "ku", "Kurdish"}, - {LANG_AMH, SCRIPT_ETHIOPIC, "Amharic", "amh", "am", nullptr}, - {LANG_ZUL, SCRIPT_LATIN, "Zulu", "zul", "zu", nullptr}, - {LANG_IBO, SCRIPT_LATIN, "Igbo", "ibo", "ig", "Ibo"}, - {LANG_YOR, SCRIPT_LATIN, "Yoruba", "yor", "yo", nullptr}, - {LANG_COS, SCRIPT_LATIN, "Corsican", "cos", "co", nullptr}, - {LANG_XHO, SCRIPT_LATIN, "Xhosa", "xho", "xh", nullptr}, - {LANG_JAV, SCRIPT_LATIN, "Javanese", "jav", "jv", nullptr}, // Also SCRIPT_JAVANESE and SCRIPT_ARABIC - 
{LANG_NEP, SCRIPT_DEVANAGARI, "Nepali", "nep", "ne", nullptr}, - {LANG_SND, SCRIPT_DEVANAGARI, "Sindhi", "snd", "sd", nullptr}, // Also SCRIPT_ARABIC and SCRIPT_GUJARATI - {LANG_SOM, SCRIPT_LATIN, "Somali", "som", "so", nullptr}, - {LANG_EPO, SCRIPT_LATIN, "Esperanto", "epo", "eo", nullptr}, - {LANG_TEL, SCRIPT_TELUGU, "Telugu", "tel", "te", nullptr}, - {LANG_MAR, SCRIPT_DEVANAGARI, "Marathi", "mar", "mr", nullptr}, - {LANG_HAU, SCRIPT_LATIN, "Hausa", "hau", "ha", nullptr}, - {LANG_YID, SCRIPT_HEBREW, "Yiddish", "yid", "yi", nullptr}, - {LANG_MAL, SCRIPT_MALAYALAM, "Malayalam", "mal", "ml", nullptr}, - {LANG_MAO, SCRIPT_LATIN, "Maori", "mao", "mi", "mri"}, - {LANG_SUN, SCRIPT_LATIN, "Sundanese", "sun", "su", nullptr}, - {LANG_PAP, SCRIPT_LATIN, "Papiamento", "pap", "pap", nullptr}, - {LANG_UZB_CYR, SCRIPT_CYRILLIC, "Cyrillic Uzbek", "uzbcyr", "uz-Cyrl", nullptr}, // https://tools.ietf.org/html/rfc5646 - {LANG_TRANSCR_IPA, SCRIPT_LATIN, "International Phonetic Alphabet Transcription", "ipa", "tr-ipa", nullptr}, - {LANG_EMJ, SCRIPT_LATIN, "Emoji", "emj", "emj", nullptr}, - {LANG_UYG, SCRIPT_ARABIC, "Uyghur", "uig", "ug", nullptr}, - {LANG_BRE, SCRIPT_LATIN, "Breton", "bre", "br", nullptr}, - {LANG_SAH, SCRIPT_CYRILLIC, "Yakut", "sah", "sah", nullptr}, - {LANG_KAZ_LAT, SCRIPT_LATIN, "Latin Kazakh", "kazlat", "kk-Latn", nullptr}, - }; - - static_assert(static_cast<size_t>(LANG_MAX) == Y_ARRAY_SIZE(LanguageNameAndEnum), "Size doesn't match"); - - class TLanguagesMap { - private: - static const char* const EMPTY_NAME; - - using TNamesHash = THashMap<TStringBuf, ELanguage, TCIOps, TCIOps>; - TNamesHash Hash; - - using TNamesArray = std::array<const char*, static_cast<size_t>(LANG_MAX)>; - TNamesArray BiblioNames; - TNamesArray IsoNames; - TNamesArray FullNames; - - using TScripts = std::array<EScript, static_cast<size_t>(LANG_MAX)>; - TScripts Scripts; - - private: - void AddNameToHash(const TStringBuf& name, ELanguage language) { - if (Hash.find(name) != Hash.end()) { - 
Y_ASSERT(Hash.find(name)->second == language); - return; - } - - Hash[name] = language; - } - - void AddName(const char* name, ELanguage language, TNamesArray& names) { - if (name == nullptr || strlen(name) == 0) - return; - - Y_ASSERT(names[language] == EMPTY_NAME); - names[language] = name; - - AddNameToHash(name, language); - } - - void AddSynonyms(const char* syn, ELanguage language) { - static const char* del = " ,;"; - if (!syn) - return; - while (*syn) { - size_t len = strcspn(syn, del); - AddNameToHash(TStringBuf(syn, len), language); - syn += len; - while (*syn && strchr(del, *syn)) - ++syn; - } - } - - public: - TLanguagesMap() { - BiblioNames.fill(EMPTY_NAME); - IsoNames.fill(EMPTY_NAME); - FullNames.fill(EMPTY_NAME); - Scripts.fill(SCRIPT_OTHER); - - for (size_t i = 0; i != Y_ARRAY_SIZE(LanguageNameAndEnum); ++i) { - const TLanguageNameAndEnum& val = LanguageNameAndEnum[i]; - - ELanguage language = val.Language; - - AddName(val.BiblioName, language, BiblioNames); - AddName(val.IsoName, language, IsoNames); - AddName(val.EnglishName, language, FullNames); - AddSynonyms(val.Synonyms, language); - - if (Scripts[language] == SCRIPT_OTHER) { - Scripts[language] = val.Script; - } - } - } - - public: - inline ELanguage LanguageByName(const TStringBuf& name, ELanguage def) const { - if (!name) - return def; - - TNamesHash::const_iterator i = Hash.find(name); - if (i == Hash.end()) { - // Try to extract the primary language code from constructions like "en-cockney" or "zh_Hant" - size_t dash_pos = name.find_first_of("_-"); - if (dash_pos != TStringBuf::npos) - i = Hash.find(name.substr(0, dash_pos)); - if (i == Hash.end()) - return def; - } - - return i->second; - } - - inline const char* FullNameByLanguage(ELanguage language) const { - if (language < 0 || static_cast<size_t>(language) >= FullNames.size()) - return nullptr; - - return FullNames[language]; - } - inline const char* BiblioNameByLanguage(ELanguage language) const { - if (language < 0 || 
static_cast<size_t>(language) >= BiblioNames.size()) - return nullptr; - - return BiblioNames[language]; - } - inline const char* IsoNameByLanguage(ELanguage language) const { - if (language < 0 || static_cast<size_t>(language) >= IsoNames.size()) - return nullptr; - - return IsoNames[language]; - } - - inline EScript Script(ELanguage language) const { - return Scripts[language]; - } - }; -} - -const char* const TLanguagesMap::EMPTY_NAME = ""; - -const char* FullNameByLanguage(ELanguage language) { - return Singleton<TLanguagesMap>()->FullNameByLanguage(language); -} -const char* NameByLanguage(ELanguage language) { - return Singleton<TLanguagesMap>()->BiblioNameByLanguage(language); -} -const char* IsoNameByLanguage(ELanguage language) { - return Singleton<TLanguagesMap>()->IsoNameByLanguage(language); -} - -ELanguage LanguageByNameStrict(const TStringBuf& name) { - return Singleton<TLanguagesMap>()->LanguageByName(name, LANG_MAX); -} - -ELanguage LanguageByNameOrDie(const TStringBuf& name) { - ELanguage result = LanguageByNameStrict(name); - if (result == LANG_MAX) { - ythrow yexception() << "LanguageByNameOrDie: invalid language '" << name << "'"; - } - return result; -} - -ELanguage LanguageByName(const TStringBuf& name) { - return Singleton<TLanguagesMap>()->LanguageByName(name, LANG_UNK); -} - -EScript ScriptByLanguage(ELanguage language) { - return Singleton<TLanguagesMap>()->Script(language); -} - -namespace { - const size_t MAX_GLYPH = 0x10000; - class TScriptGlyphIndex { - public: - TScriptGlyphIndex() { - NCharsetInternal::InitScriptData(Data, MAX_GLYPH); - } - - EScript GetGlyphScript(wchar32 glyph) const { - if (glyph >= MAX_GLYPH) - return SCRIPT_UNKNOWN; - return (EScript)Data[glyph]; - } - - private: - ui8 Data[MAX_GLYPH]; - }; -} - -EScript ScriptByGlyph(wchar32 glyph) { - return HugeSingleton<TScriptGlyphIndex>()->GetGlyphScript(glyph); -} - -template <> -void Out<ELanguage>(IOutputStream& o, ELanguage lang) { - o << NameByLanguage(lang); -} diff 
--git a/library/cpp/langs/langs.h b/library/cpp/langs/langs.h deleted file mode 100644 index 360ab6a8321..00000000000 --- a/library/cpp/langs/langs.h +++ /dev/null @@ -1,229 +0,0 @@ -#pragma once - -#include "scripts.h" - -#include <util/generic/strbuf.h> -#include <util/system/defaults.h> - -#if defined(_win_) -// LANG_LAO is #defined in WinNT.h -#undef LANG_LAO -#endif - -// Language names are given according to ISO 639-2/B -// Languages that are not present in ISO 639-2/B use ISO 639-3 instead. -// http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes -enum ELanguage { - LANG_UNK = 0, // Unknown - LANG_RUS = 1, // Russian - LANG_ENG = 2, // English - LANG_POL = 3, // Polish - LANG_HUN = 4, // Hungarian - LANG_UKR = 5, // Ukrainian - LANG_GER = 6, // German - LANG_FRE = 7, // French - LANG_TAT = 8, // Tatar - LANG_BEL = 9, // Belarusian - LANG_KAZ = 10, // Kazakh - LANG_ALB = 11, // Albanian - LANG_SPA = 12, // Spanish - LANG_ITA = 13, // Italian - LANG_ARM = 14, // Armenian - LANG_DAN = 15, // Danish - LANG_POR = 16, // Portuguese - LANG_ICE = 17, // Icelandic - LANG_SLO = 18, // Slovak - LANG_SLV = 19, // Slovene - LANG_DUT = 20, // Dutch (Netherlandish language) - LANG_BUL = 21, // Bulgarian - LANG_CAT = 22, // Catalan - LANG_HRV = 23, // Croatian - LANG_CZE = 24, // Czech - LANG_GRE = 25, // Greek - LANG_HEB = 26, // Hebrew - LANG_NOR = 27, // Norwegian - LANG_MAC = 28, // Macedonian - LANG_SWE = 29, // Swedish - LANG_KOR = 30, // Korean - LANG_LAT = 31, // Latin - LANG_BASIC_RUS = 32, // Simplified version of Russian (used at lemmer only) - LANG_BOS = 33, // Bosnian - LANG_MLT = 34, // Maltese - LANG_EMPTY = 35, // Indicate that document is empty - LANG_UNK_LAT = 36, // Any unrecognized latin language - LANG_UNK_CYR = 37, // Any unrecognized cyrillic language - LANG_UNK_ALPHA = 38, // Any unrecognized alphabetic language not fit into previous categories - LANG_FIN = 39, // Finnish - LANG_EST = 40, // Estonian - LANG_LAV = 41, // Latvian - LANG_LIT = 42, // 
Lithuanian - LANG_BAK = 43, // Bashkir - LANG_TUR = 44, // Turkish - LANG_RUM = 45, // Romanian (also Moldavian) - LANG_MON = 46, // Mongolian - LANG_UZB = 47, // Uzbek - LANG_KIR = 48, // Kirghiz - LANG_TGK = 49, // Tajik - LANG_TUK = 50, // Turkmen - LANG_SRP = 51, // Serbian - LANG_AZE = 52, // Azerbaijani - LANG_BASIC_ENG = 53, // Simplified version of English (used at lemmer only) - LANG_GEO = 54, // Georgian - LANG_ARA = 55, // Arabic - LANG_PER = 56, // Persian - LANG_CHU = 57, // Church Slavonic - LANG_CHI = 58, // Chinese - LANG_JPN = 59, // Japanese - LANG_IND = 60, // Indonesian - LANG_MAY = 61, // Malay - LANG_THA = 62, // Thai - LANG_VIE = 63, // Vietnamese - LANG_GLE = 64, // Irish (Gaelic) - LANG_TGL = 65, // Tagalog (Filipino) - LANG_HIN = 66, // Hindi - LANG_AFR = 67, // Afrikaans - LANG_URD = 68, // Urdu - LANG_MYA = 69, // Burmese - LANG_KHM = 70, // Khmer - LANG_LAO = 71, // Lao - LANG_TAM = 72, // Tamil - LANG_BEN = 73, // Bengali - LANG_GUJ = 74, // Gujarati - LANG_KAN = 75, // Kannada - LANG_PAN = 76, // Punjabi - LANG_SIN = 77, // Sinhalese - LANG_SWA = 78, // Swahili - LANG_BAQ = 79, // Basque - LANG_WEL = 80, // Welsh - LANG_GLG = 81, // Galician - LANG_HAT = 82, // Haitian Creole - LANG_MLG = 83, // Malagasy - LANG_CHV = 84, // Chuvash - LANG_UDM = 85, // Udmurt - LANG_KPV = 86, // Komi-Zyrian - LANG_MHR = 87, // Meadow Mari (Eastern Mari) - LANG_SJN = 88, // Sindarin - LANG_MRJ = 89, // Hill Mari (Western Mari) - LANG_KOI = 90, // Komi-Permyak - LANG_LTZ = 91, // Luxembourgish - LANG_GLA = 92, // Scottish Gaelic - LANG_CEB = 93, // Cebuano - LANG_PUS = 94, // Pashto - LANG_KMR = 95, // Kurmanji - LANG_AMH = 96, // Amharic - LANG_ZUL = 97, // Zulu - LANG_IBO = 98, // Igbo - LANG_YOR = 99, // Yoruba - LANG_COS = 100, // Corsican - LANG_XHO = 101, // Xhosa - LANG_JAV = 102, // Javanese - LANG_NEP = 103, // Nepali - LANG_SND = 104, // Sindhi - LANG_SOM = 105, // Somali - LANG_EPO = 106, // Esperanto - LANG_TEL = 107, // Telugu - LANG_MAR = 
108, // Marathi - LANG_HAU = 109, // Hausa - LANG_YID = 110, // Yiddish - LANG_MAL = 111, // Malayalam - LANG_MAO = 112, // Maori - LANG_SUN = 113, // Sundanese - LANG_PAP = 114, // Papiamento - LANG_UZB_CYR = 115, // Cyrillic Uzbek - LANG_TRANSCR_IPA = 116, // International Phonetic Alphabet Transcription - LANG_EMJ = 117, // Emoji - LANG_UYG = 118, // Uyghur - LANG_BRE = 119, // Breton - LANG_SAH = 120, // Yakut - LANG_KAZ_LAT = 121, // Latin Kazakh - LANG_MAX -}; - -/** - * Converts string to corresponding enum. Will try to extract the primary language code from - * constructions like "en-cockney" or "zh_Hant". In case of failure will return `LANG_UNK`. - * - * @param name Language name - * @return Language enum - */ -ELanguage LanguageByName(const TStringBuf& name); - -/** - * Same as `LanguageByName`, but in case of failure will return `LANG_MAX`. - * - * @see LanguageByName - */ -ELanguage LanguageByNameStrict(const TStringBuf& name); - -/** - * Converts language enum to corresponding ISO 639-2/B alpha-3 code. For languages missing in ISO - * standard conversions are: - * - LANG_UNK: "unk" - * - LANG_BASIC_RUS: "basic-rus" - * - LANG_EMPTY: "empty" - * - LANG_UNK_LAT: "unklat" - * - LANG_UNK_CYR: "unkcyr" - * - LANG_UNK_ALPHA: "unkalpha" - * - LANG_BASIC_ENG: "basic-eng" - * - LANG_TRANSCR_IPA: "transcr-ipa" - * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be - * `nullptr`. - * - * @param language Language enum - * @return Language ISO 639-2/B alpha-3 code - */ -const char* NameByLanguage(ELanguage language); - -/** - * Converts language enum to corresponding ISO 639-1 alpha-2 code. 
For languages missing in ISO - * standard conversions are: - * - LANG_UNK: "mis" - * - LANG_BASIC_RUS: "bas-ru" - * - LANG_EMPTY: "" - * - LANG_UNK_LAT: "" - * - LANG_UNK_CYR: "" - * - LANG_UNK_ALPHA: "" - * - LANG_BASIC_ENG: "bas-en" - * - LANG_TRANSCR_IPA: "tr-ipa" - * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be - * `nullptr`. - * - * @param language Language enum - * @return Language ISO 639-1 alpha-2 code - */ -const char* IsoNameByLanguage(ELanguage language); - -/** - * Converts language enum to corresponding human-readable language name. E.g. "Russian" for - * `LANG_RUS` or "Basic Russian" for `LANG_BASIC_RUS`. If language is missing in `ELanguage` or if - * it is a `LANG_MAX` then return value will be `nullptr`. - * - * @param language Language enum - */ -const char* FullNameByLanguage(ELanguage language); - -/** - * Same as `LanguageByNameStrict` but in case of failure will throw `yexception`. - * - * @see LanguageByNameStrict - */ -ELanguage LanguageByNameOrDie(const TStringBuf& name); - -constexpr bool UnknownLanguage(const ELanguage language) noexcept { - return language == LANG_UNK || language == LANG_UNK_LAT || language == LANG_UNK_CYR || language == LANG_UNK_ALPHA || language == LANG_EMPTY; -} - -EScript ScriptByLanguage(ELanguage language); -EScript ScriptByGlyph(wchar32 glyph); - -namespace NCharsetInternal { - void InitScriptData(ui8 data[], size_t len); -} - -inline bool LatinScript(ELanguage language) { - return ScriptByLanguage(language) == SCRIPT_LATIN; -} - -inline bool CyrillicScript(ELanguage language) { - return ScriptByLanguage(language) == SCRIPT_CYRILLIC; -} diff --git a/library/cpp/langs/scripts.cpp b/library/cpp/langs/scripts.cpp deleted file mode 100644 index 41cc91d3ce6..00000000000 --- a/library/cpp/langs/scripts.cpp +++ /dev/null @@ -1,158 +0,0 @@ -#include "scripts.h" - -#include <library/cpp/digest/lower_case/hash_ops.h> - -#include <util/generic/hash.h> -#include 
<util/generic/singleton.h> -#include <util/generic/strbuf.h> -#include <util/generic/yexception.h> -#include <util/system/defaults.h> - -#include <array> - -namespace { - struct TScriptNameAndEnum { - EScript Script; - const char* EnglishName; - const char* IsoName; - }; - - const TScriptNameAndEnum ScriptNameAndEnum[] = { - {SCRIPT_UNKNOWN, "Unknown", "Zzzz"}, - {SCRIPT_LATIN, "Latin", "Latn"}, - {SCRIPT_CYRILLIC, "Cyrillic", "Cyrl"}, - - {SCRIPT_GREEK, "Greek", "Grek"}, - {SCRIPT_ARABIC, "Arabic", "Arab"}, - {SCRIPT_HEBREW, "Hebrew", "Hebr"}, - {SCRIPT_ARMENIAN, "Armenian", "Armn"}, - {SCRIPT_GEORGIAN, "Georgian", "Geor"}, - - {SCRIPT_HAN, "Han", "Hans"}, // We use the more common Simplified variant (as opposed to Traditional 'Hant') - {SCRIPT_KATAKANA, "Katakana", "Kana"}, - {SCRIPT_HIRAGANA, "Hiragana", "Hira"}, - {SCRIPT_HANGUL, "Hangul", "Hang"}, - - {SCRIPT_DEVANAGARI, "Devanagari", "Deva"}, - {SCRIPT_BENGALI, "Bengali", "Beng"}, - {SCRIPT_GUJARATI, "Gujarati", "Gujr"}, - {SCRIPT_GURMUKHI, "Gurmukhi", "Guru"}, - {SCRIPT_KANNADA, "Kannada", "Knda"}, - {SCRIPT_MALAYALAM, "Malayalam", "Mlym"}, - {SCRIPT_ORIYA, "Oriya", "Orya"}, - {SCRIPT_TAMIL, "Tamil", "Taml"}, - {SCRIPT_TELUGU, "Telugu", "Telu"}, - {SCRIPT_THAANA, "Thaana", "Thaa"}, - {SCRIPT_SINHALA, "Sinhala", "Sinh"}, - - {SCRIPT_MYANMAR, "Myanmar", "Mymr"}, - {SCRIPT_THAI, "Thai", "Thai"}, - {SCRIPT_LAO, "Lao", "Laoo"}, - {SCRIPT_KHMER, "Khmer", "Khmr"}, - {SCRIPT_TIBETAN, "Tibetan", "Tibt"}, - {SCRIPT_MONGOLIAN, "Mongolian", "Mong"}, - - {SCRIPT_ETHIOPIC, "Ethiopic", "Ethi"}, - {SCRIPT_RUNIC, "Runic", "Runr"}, - {SCRIPT_COPTIC, "Coptic", "Copt"}, - {SCRIPT_SYRIAC, "Syriac", "Syrc"}, - - {SCRIPT_OTHER, "Other", "Zyyy"}, - }; - - /* Checks that the name table above has exactly SCRIPT_MAX rows. */ static_assert(static_cast<size_t>(SCRIPT_MAX) == Y_ARRAY_SIZE(ScriptNameAndEnum), "Size doesn't match"); - - /* Lookup tables built from ScriptNameAndEnum: per-script ISO and English names, plus a name-to-script hash keyed with TCIOps (presumably case-insensitive; confirm against hash_ops.h). */ class TScriptsMap { - private: - static const char* const EMPTY_NAME; - - using TNamesHash = THashMap<TStringBuf, EScript, TCIOps, TCIOps>; - TNamesHash Hash; - - using 
TNamesArray = std::array<const char*, static_cast<size_t>(SCRIPT_MAX)>; - TNamesArray IsoNames; - TNamesArray FullNames; - - private: - /* Registers name -> script. A name that is already present is left untouched and only asserted to map to the same script; this path is reached for SCRIPT_THAI, whose ISO and English names are identical. */ void AddNameToHash(const TStringBuf& name, EScript script) { - if (Hash.find(name) != Hash.end()) { - Y_ASSERT(Hash.find(name)->second == script); - return; - } - - Hash[name] = script; - } - - /* Stores `name` in names[script] and in the lookup hash; null or empty names are skipped. */ void AddName(const char* name, EScript script, TNamesArray& names) { - if (name == nullptr || strlen(name) == 0) - return; - - Y_ASSERT(names[script] == EMPTY_NAME); - names[script] = name; - - AddNameToHash(name, script); - } - - public: - TScriptsMap() { - IsoNames.fill(EMPTY_NAME); - FullNames.fill(EMPTY_NAME); - - for (const auto& val : ScriptNameAndEnum) { - EScript script = val.Script; - - AddName(val.IsoName, script, IsoNames); - AddName(val.EnglishName, script, FullNames); - } - } - - public: - /* Returns the script registered under `name`, or `def` when the name is empty or not registered. */ inline EScript ScriptByName(const TStringBuf& name, EScript def) const { - if (!name) - return def; - - TNamesHash::const_iterator i = Hash.find(name); - if (i == Hash.end()) { - return def; - } - - return i->second; - } - - inline const char* FullNameByScript(EScript script) const { - if (script < 0 || static_cast<size_t>(script) >= FullNames.size()) - return nullptr; - - return FullNames[script]; - } - - inline const char* IsoNameByScript(EScript script) const { - if (script < 0 || static_cast<size_t>(script) >= IsoNames.size()) - return nullptr; - - return IsoNames[script]; - } - }; -} - -const char* const TScriptsMap::EMPTY_NAME = ""; - -const char* FullNameByScript(EScript script) { - return Singleton<TScriptsMap>()->FullNameByScript(script); -} - -const char* IsoNameByScript(EScript script) { - return Singleton<TScriptsMap>()->IsoNameByScript(script); -} - -EScript ScriptByName(const TStringBuf& name) { - return Singleton<TScriptsMap>()->ScriptByName(name, SCRIPT_UNKNOWN); -} - -/* NOTE(review): SCRIPT_UNKNOWN doubles as the failure value of ScriptByName, so the names registered for SCRIPT_UNKNOWN itself (Unknown, Zzzz) also take the throwing path. */ EScript ScriptByNameOrDie(const TStringBuf& name) { - EScript result = ScriptByName(name); - if (result == SCRIPT_UNKNOWN) { - ythrow yexception() << 
"ScriptByNameOrDie: invalid script '" << name << "'"; - } - return result; -} diff --git a/library/cpp/langs/scripts.h b/library/cpp/langs/scripts.h deleted file mode 100644 index 4c47a33d2cb..00000000000 --- a/library/cpp/langs/scripts.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include <util/generic/strbuf.h> - -// Writing systems, a.k.a. scripts -// -enum EScript { - SCRIPT_UNKNOWN = 0, - SCRIPT_LATIN, - SCRIPT_CYRILLIC, - - SCRIPT_GREEK, - SCRIPT_ARABIC, - SCRIPT_HEBREW, - SCRIPT_ARMENIAN, - SCRIPT_GEORGIAN, - - SCRIPT_HAN, - SCRIPT_KATAKANA, - SCRIPT_HIRAGANA, - SCRIPT_HANGUL, - - SCRIPT_DEVANAGARI, - SCRIPT_BENGALI, - SCRIPT_GUJARATI, - SCRIPT_GURMUKHI, - SCRIPT_KANNADA, - SCRIPT_MALAYALAM, - SCRIPT_ORIYA, - SCRIPT_TAMIL, - SCRIPT_TELUGU, - SCRIPT_THAANA, - SCRIPT_SINHALA, - - SCRIPT_MYANMAR, - SCRIPT_THAI, - SCRIPT_LAO, - SCRIPT_KHMER, - SCRIPT_TIBETAN, - SCRIPT_MONGOLIAN, - - SCRIPT_ETHIOPIC, - SCRIPT_RUNIC, - SCRIPT_COPTIC, - SCRIPT_SYRIAC, - - SCRIPT_OTHER, - SCRIPT_MAX -}; - -// According to ISO 15924 codes. See https://en.wikipedia.org/wiki/ISO_15924 -// -EScript ScriptByName(const TStringBuf& name); -EScript ScriptByNameOrDie(const TStringBuf& name); -const char* IsoNameByScript(EScript script); -const char* FullNameByScript(EScript script); |