diff options
author | innokentii <innokentii@yandex-team.com> | 2023-03-24 18:21:43 +0300 |
---|---|---|
committer | innokentii <innokentii@yandex-team.com> | 2023-03-24 18:21:43 +0300 |
commit | a1dece0299855c161ae585b31c50155acf577294 (patch) | |
tree | 3b04be14df5ee3093cc1c49423efbfcd64cba22d /library/cpp | |
parent | 833d012d60f469736200a888fc73dda823a9be4b (diff) | |
download | ydb-a1dece0299855c161ae585b31c50155acf577294.tar.gz |
Add yaml config utils:
add yaml config utils
Diffstat (limited to 'library/cpp')
-rw-r--r-- | library/cpp/langs/langs.h | 229 | ||||
-rw-r--r-- | library/cpp/langs/scripts.h | 56 |
2 files changed, 285 insertions, 0 deletions
diff --git a/library/cpp/langs/langs.h b/library/cpp/langs/langs.h new file mode 100644 index 0000000000..360ab6a832 --- /dev/null +++ b/library/cpp/langs/langs.h @@ -0,0 +1,229 @@ +#pragma once + +#include "scripts.h" + +#include <util/generic/strbuf.h> +#include <util/system/defaults.h> + +#if defined(_win_) +// LANG_LAO is #define in WinNT.h +#undef LANG_LAO +#endif + +// Language names are given according to ISO 639-2/B +// Some languages are not present in ISO 639-2/B. Then ISO 639-3 is used. +// http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes +enum ELanguage { + LANG_UNK = 0, // Unknown + LANG_RUS = 1, // Russian + LANG_ENG = 2, // English + LANG_POL = 3, // Polish + LANG_HUN = 4, // Hungarian + LANG_UKR = 5, // Ukrainian + LANG_GER = 6, // German + LANG_FRE = 7, // French + LANG_TAT = 8, // Tatar + LANG_BEL = 9, // Belarusian + LANG_KAZ = 10, // Kazakh + LANG_ALB = 11, // Albanian + LANG_SPA = 12, // Spanish + LANG_ITA = 13, // Italian + LANG_ARM = 14, // Armenian + LANG_DAN = 15, // Danish + LANG_POR = 16, // Portuguese + LANG_ICE = 17, // Icelandic + LANG_SLO = 18, // Slovak + LANG_SLV = 19, // Slovene + LANG_DUT = 20, // Dutch (Netherlandish language) + LANG_BUL = 21, // Bulgarian + LANG_CAT = 22, // Catalan + LANG_HRV = 23, // Croatian + LANG_CZE = 24, // Czech + LANG_GRE = 25, // Greek + LANG_HEB = 26, // Hebrew + LANG_NOR = 27, // Norwegian + LANG_MAC = 28, // Macedonian + LANG_SWE = 29, // Swedish + LANG_KOR = 30, // Korean + LANG_LAT = 31, // Latin + LANG_BASIC_RUS = 32, // Simplified version of Russian (used at lemmer only) + LANG_BOS = 33, // Bosnian + LANG_MLT = 34, // Maltese + LANG_EMPTY = 35, // Indicate that document is empty + LANG_UNK_LAT = 36, // Any unrecognized latin language + LANG_UNK_CYR = 37, // Any unrecognized cyrillic language + LANG_UNK_ALPHA = 38, // Any unrecognized alphabetic language not fit into previous categories + LANG_FIN = 39, // Finnish + LANG_EST = 40, // Estonian + LANG_LAV = 41, // Latvian + LANG_LIT = 42, // Lithuanian + LANG_BAK = 43, // Bashkir + LANG_TUR = 44, // Turkish + LANG_RUM = 45, // Romanian (also Moldavian) + LANG_MON = 46, // Mongolian + LANG_UZB = 47, // Uzbek + LANG_KIR = 48, // Kirghiz + LANG_TGK = 49, // Tajik + LANG_TUK = 50, // Turkmen + LANG_SRP = 51, // Serbian + LANG_AZE = 52, // Azerbaijani + LANG_BASIC_ENG = 53, // Simplified version of English (used at lemmer only) + LANG_GEO = 54, // Georgian + LANG_ARA = 55, // Arabic + LANG_PER = 56, // Persian + LANG_CHU = 57, // Church Slavonic + LANG_CHI = 58, // Chinese + LANG_JPN = 59, // Japanese + LANG_IND = 60, // Indonesian + LANG_MAY = 61, // Malay + LANG_THA = 62, // Thai + LANG_VIE = 63, // Vietnamese + LANG_GLE = 64, // Irish (Gaelic) + LANG_TGL = 65, // Tagalog (Filipino) + LANG_HIN = 66, // Hindi + LANG_AFR = 67, // Afrikaans + LANG_URD = 68, // Urdu + LANG_MYA = 69, // Burmese + LANG_KHM = 70, // Khmer + LANG_LAO = 71, // Lao + LANG_TAM = 72, // Tamil + LANG_BEN = 73, // Bengali + LANG_GUJ = 74, // Gujarati + LANG_KAN = 75, // Kannada + LANG_PAN = 76, // Punjabi + LANG_SIN = 77, // Sinhalese + LANG_SWA = 78, // Swahili + LANG_BAQ = 79, // Basque + LANG_WEL = 80, // Welsh + LANG_GLG = 81, // Galician + LANG_HAT = 82, // Haitian Creole + LANG_MLG = 83, // Malagasy + LANG_CHV = 84, // Chuvash + LANG_UDM = 85, // Udmurt + LANG_KPV = 86, // Komi-Zyrian + LANG_MHR = 87, // Meadow Mari (Eastern Mari) + LANG_SJN = 88, // Sindarin + LANG_MRJ = 89, // Hill Mari (Western Mari) + LANG_KOI = 90, // Komi-Permyak + LANG_LTZ = 91, // Luxembourgish + LANG_GLA = 92, // Scottish Gaelic + LANG_CEB = 93, // Cebuano + LANG_PUS = 94, // Pashto + LANG_KMR = 95, // Kurmanji + LANG_AMH = 96, // Amharic + LANG_ZUL = 97, // Zulu + LANG_IBO = 98, // Igbo + LANG_YOR = 99, // Yoruba + LANG_COS = 100, // Corsican + LANG_XHO = 101, // Xhosa + LANG_JAV = 102, // Javanese + LANG_NEP = 103, // Nepali + LANG_SND = 104, // Sindhi + LANG_SOM = 105, // Somali + LANG_EPO = 106, // Esperanto + LANG_TEL = 107, // Telugu + LANG_MAR = 108, // Marathi + LANG_HAU = 109, // Hausa + LANG_YID = 110, // Yiddish + LANG_MAL = 111, // Malayalam + LANG_MAO = 112, // Maori + LANG_SUN = 113, // Sundanese + LANG_PAP = 114, // Papiamento + LANG_UZB_CYR = 115, // Cyrillic Uzbek + LANG_TRANSCR_IPA = 116, // International Phonetic Alphabet Transcription + LANG_EMJ = 117, // Emoji + LANG_UYG = 118, // Uyghur + LANG_BRE = 119, // Breton + LANG_SAH = 120, // Yakut + LANG_KAZ_LAT = 121, // Latin Kazakh + LANG_MAX +}; + +/** + * Converts string to corresponding enum. Will try to extract the primary language code from + * constructions like "en-cockney" or "zh_Hant". In case of failure will return `LANG_UNK`. + * + * @param name Language name + * @return Language enum + */ +ELanguage LanguageByName(const TStringBuf& name); + +/** + * Same as `LanguageByName`, but in case of failure will return `LANG_MAX`. + * + * @see LanguageByName + */ +ELanguage LanguageByNameStrict(const TStringBuf& name); + +/** + * Converts language enum to corresponding ISO 639-2/B alpha-3 code. For languages missing in ISO + * standard convertions are: + * - LANG_UNK: "unk" + * - LANG_BASIC_RUS: "basic-rus" + * - LANG_EMPTY: "empty" + * - LANG_UNK_LAT: "unklat" + * - LANG_UNK_CYR: "unkcyr" + * - LANG_UNK_ALPHA: "unkalpha" + * - LANG_BASIC_ENG: "basic-eng" + * - LANG_TRANSCR_IPA "transcr-ipa" + * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be + * `nullptr`. + * + * @param language Language enum + * @return Language ISO 639-2/B alpha-3 code + */ +const char* NameByLanguage(ELanguage language); + +/** + * Converts language enum to corresponding ISO 639-1 alpha-2 code. For languages missing in ISO + * standard convertions are: + * - LANG_UNK: "mis" + * - LANG_BASIC_RUS: "bas-ru" + * - LANG_EMPTY: "" + * - LANG_UNK_LAT: "" + * - LANG_UNK_CYR: "" + * - LANG_UNK_ALPHA: "" + * - LANG_BASIC_ENG: "bas-en" + * - LANG_TRANSCR_IPA "tr-ipa" + * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be + * `nullptr`. + * + * @param language Language enum + * @return Language ISO 639-1 alpha-2 code + */ +const char* IsoNameByLanguage(ELanguage language); + +/** + * Converts language enum to corresponding human-readable language name. E.g. "Russian" for + * `LANG_RUS` or "Basic Russian" for `LANG_BASIC_RUS`. If language is missing in `ELanguage` or if + * it is a `LANG_MAX` then return value will be `nullptr`. + * + * @param language Language enum + */ +const char* FullNameByLanguage(ELanguage language); + +/** + * Same as `LanguageByNameStrict` but in case of failure will throw `yexception`. + * + * @see LanguageByNameStrict + */ +ELanguage LanguageByNameOrDie(const TStringBuf& name); + +constexpr bool UnknownLanguage(const ELanguage language) noexcept { + return language == LANG_UNK || language == LANG_UNK_LAT || language == LANG_UNK_CYR || language == LANG_UNK_ALPHA || language == LANG_EMPTY; +} + +EScript ScriptByLanguage(ELanguage language); +EScript ScriptByGlyph(wchar32 glyph); + +namespace NCharsetInternal { + void InitScriptData(ui8 data[], size_t len); +} + +inline bool LatinScript(ELanguage language) { + return ScriptByLanguage(language) == SCRIPT_LATIN; +} + +inline bool CyrillicScript(ELanguage language) { + return ScriptByLanguage(language) == SCRIPT_CYRILLIC; +} diff --git a/library/cpp/langs/scripts.h b/library/cpp/langs/scripts.h new file mode 100644 index 0000000000..4c47a33d2c --- /dev/null +++ b/library/cpp/langs/scripts.h @@ -0,0 +1,56 @@ +#pragma once + +#include <util/generic/strbuf.h> + +// Writing systems, a.k.a. scripts +// +enum EScript { + SCRIPT_UNKNOWN = 0, + SCRIPT_LATIN, + SCRIPT_CYRILLIC, + + SCRIPT_GREEK, + SCRIPT_ARABIC, + SCRIPT_HEBREW, + SCRIPT_ARMENIAN, + SCRIPT_GEORGIAN, + + SCRIPT_HAN, + SCRIPT_KATAKANA, + SCRIPT_HIRAGANA, + SCRIPT_HANGUL, + + SCRIPT_DEVANAGARI, + SCRIPT_BENGALI, + SCRIPT_GUJARATI, + SCRIPT_GURMUKHI, + SCRIPT_KANNADA, + SCRIPT_MALAYALAM, + SCRIPT_ORIYA, + SCRIPT_TAMIL, + SCRIPT_TELUGU, + SCRIPT_THAANA, + SCRIPT_SINHALA, + + SCRIPT_MYANMAR, + SCRIPT_THAI, + SCRIPT_LAO, + SCRIPT_KHMER, + SCRIPT_TIBETAN, + SCRIPT_MONGOLIAN, + + SCRIPT_ETHIOPIC, + SCRIPT_RUNIC, + SCRIPT_COPTIC, + SCRIPT_SYRIAC, + + SCRIPT_OTHER, + SCRIPT_MAX +}; + +// According to ISO 15924 codes. See https://en.wikipedia.org/wiki/ISO_15924 +// +EScript ScriptByName(const TStringBuf& name); +EScript ScriptByNameOrDie(const TStringBuf& name); +const char* IsoNameByScript(EScript script); +const char* FullNameByScript(EScript script); |