diff options
author | innokentii <innokentii@yandex-team.com> | 2023-03-24 18:21:43 +0300 |
---|---|---|
committer | innokentii <innokentii@yandex-team.com> | 2023-03-24 18:21:43 +0300 |
commit | a1dece0299855c161ae585b31c50155acf577294 (patch) | |
tree | 3b04be14df5ee3093cc1c49423efbfcd64cba22d | |
parent | 833d012d60f469736200a888fc73dda823a9be4b (diff) | |
download | ydb-a1dece0299855c161ae585b31c50155acf577294.tar.gz |
Add yaml config utils:
add yaml config utils
-rw-r--r-- | library/cpp/langs/langs.h | 229 | ||||
-rw-r--r-- | library/cpp/langs/scripts.h | 56 | ||||
-rw-r--r-- | ydb/core/cms/console/yaml_config/yaml_config.cpp | 53 | ||||
-rw-r--r-- | ydb/core/cms/console/yaml_config/yaml_config.h | 12 | ||||
-rw-r--r-- | ydb/core/cms/console/yaml_config/yaml_config_ut.cpp | 10 |
5 files changed, 348 insertions, 12 deletions
diff --git a/library/cpp/langs/langs.h b/library/cpp/langs/langs.h new file mode 100644 index 0000000000..360ab6a832 --- /dev/null +++ b/library/cpp/langs/langs.h @@ -0,0 +1,229 @@ +#pragma once + +#include "scripts.h" + +#include <util/generic/strbuf.h> +#include <util/system/defaults.h> + +#if defined(_win_) +// LANG_LAO is #define in WinNT.h +#undef LANG_LAO +#endif + +// Language names are given according to ISO 639-2/B +// Some languages are not present in ISO 639-2/B. Then ISO 639-3 is used. +// http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes +enum ELanguage { + LANG_UNK = 0, // Unknown + LANG_RUS = 1, // Russian + LANG_ENG = 2, // English + LANG_POL = 3, // Polish + LANG_HUN = 4, // Hungarian + LANG_UKR = 5, // Ukrainian + LANG_GER = 6, // German + LANG_FRE = 7, // French + LANG_TAT = 8, // Tatar + LANG_BEL = 9, // Belarusian + LANG_KAZ = 10, // Kazakh + LANG_ALB = 11, // Albanian + LANG_SPA = 12, // Spanish + LANG_ITA = 13, // Italian + LANG_ARM = 14, // Armenian + LANG_DAN = 15, // Danish + LANG_POR = 16, // Portuguese + LANG_ICE = 17, // Icelandic + LANG_SLO = 18, // Slovak + LANG_SLV = 19, // Slovene + LANG_DUT = 20, // Dutch (Netherlandish language) + LANG_BUL = 21, // Bulgarian + LANG_CAT = 22, // Catalan + LANG_HRV = 23, // Croatian + LANG_CZE = 24, // Czech + LANG_GRE = 25, // Greek + LANG_HEB = 26, // Hebrew + LANG_NOR = 27, // Norwegian + LANG_MAC = 28, // Macedonian + LANG_SWE = 29, // Swedish + LANG_KOR = 30, // Korean + LANG_LAT = 31, // Latin + LANG_BASIC_RUS = 32, // Simplified version of Russian (used at lemmer only) + LANG_BOS = 33, // Bosnian + LANG_MLT = 34, // Maltese + LANG_EMPTY = 35, // Indicate that document is empty + LANG_UNK_LAT = 36, // Any unrecognized latin language + LANG_UNK_CYR = 37, // Any unrecognized cyrillic language + LANG_UNK_ALPHA = 38, // Any unrecognized alphabetic language not fit into previous categories + LANG_FIN = 39, // Finnish + LANG_EST = 40, // Estonian + LANG_LAV = 41, // Latvian + LANG_LIT = 42, // Lithuanian + LANG_BAK = 43, // Bashkir + LANG_TUR = 44, // Turkish + LANG_RUM = 45, // Romanian (also Moldavian) + LANG_MON = 46, // Mongolian + LANG_UZB = 47, // Uzbek + LANG_KIR = 48, // Kirghiz + LANG_TGK = 49, // Tajik + LANG_TUK = 50, // Turkmen + LANG_SRP = 51, // Serbian + LANG_AZE = 52, // Azerbaijani + LANG_BASIC_ENG = 53, // Simplified version of English (used at lemmer only) + LANG_GEO = 54, // Georgian + LANG_ARA = 55, // Arabic + LANG_PER = 56, // Persian + LANG_CHU = 57, // Church Slavonic + LANG_CHI = 58, // Chinese + LANG_JPN = 59, // Japanese + LANG_IND = 60, // Indonesian + LANG_MAY = 61, // Malay + LANG_THA = 62, // Thai + LANG_VIE = 63, // Vietnamese + LANG_GLE = 64, // Irish (Gaelic) + LANG_TGL = 65, // Tagalog (Filipino) + LANG_HIN = 66, // Hindi + LANG_AFR = 67, // Afrikaans + LANG_URD = 68, // Urdu + LANG_MYA = 69, // Burmese + LANG_KHM = 70, // Khmer + LANG_LAO = 71, // Lao + LANG_TAM = 72, // Tamil + LANG_BEN = 73, // Bengali + LANG_GUJ = 74, // Gujarati + LANG_KAN = 75, // Kannada + LANG_PAN = 76, // Punjabi + LANG_SIN = 77, // Sinhalese + LANG_SWA = 78, // Swahili + LANG_BAQ = 79, // Basque + LANG_WEL = 80, // Welsh + LANG_GLG = 81, // Galician + LANG_HAT = 82, // Haitian Creole + LANG_MLG = 83, // Malagasy + LANG_CHV = 84, // Chuvash + LANG_UDM = 85, // Udmurt + LANG_KPV = 86, // Komi-Zyrian + LANG_MHR = 87, // Meadow Mari (Eastern Mari) + LANG_SJN = 88, // Sindarin + LANG_MRJ = 89, // Hill Mari (Western Mari) + LANG_KOI = 90, // Komi-Permyak + LANG_LTZ = 91, // Luxembourgish + LANG_GLA = 92, // Scottish Gaelic + LANG_CEB = 93, // Cebuano + LANG_PUS = 94, // Pashto + LANG_KMR = 95, // Kurmanji + LANG_AMH = 96, // Amharic + LANG_ZUL = 97, // Zulu + LANG_IBO = 98, // Igbo + LANG_YOR = 99, // Yoruba + LANG_COS = 100, // Corsican + LANG_XHO = 101, // Xhosa + LANG_JAV = 102, // Javanese + LANG_NEP = 103, // Nepali + LANG_SND = 104, // Sindhi + LANG_SOM = 105, // Somali + LANG_EPO = 106, // Esperanto + LANG_TEL = 107, // Telugu + LANG_MAR = 108, // Marathi + LANG_HAU = 109, // Hausa + LANG_YID = 110, // Yiddish + LANG_MAL = 111, // Malayalam + LANG_MAO = 112, // Maori + LANG_SUN = 113, // Sundanese + LANG_PAP = 114, // Papiamento + LANG_UZB_CYR = 115, // Cyrillic Uzbek + LANG_TRANSCR_IPA = 116, // International Phonetic Alphabet Transcription + LANG_EMJ = 117, // Emoji + LANG_UYG = 118, // Uyghur + LANG_BRE = 119, // Breton + LANG_SAH = 120, // Yakut + LANG_KAZ_LAT = 121, // Latin Kazakh + LANG_MAX +}; + +/** + * Converts string to corresponding enum. Will try to extract the primary language code from + * constructions like "en-cockney" or "zh_Hant". In case of failure will return `LANG_UNK`. + * + * @param name Language name + * @return Language enum + */ +ELanguage LanguageByName(const TStringBuf& name); + +/** + * Same as `LanguageByName`, but in case of failure will return `LANG_MAX`. + * + * @see LanguageByName + */ +ELanguage LanguageByNameStrict(const TStringBuf& name); + +/** + * Converts language enum to corresponding ISO 639-2/B alpha-3 code. For languages missing in ISO + * standard convertions are: + * - LANG_UNK: "unk" + * - LANG_BASIC_RUS: "basic-rus" + * - LANG_EMPTY: "empty" + * - LANG_UNK_LAT: "unklat" + * - LANG_UNK_CYR: "unkcyr" + * - LANG_UNK_ALPHA: "unkalpha" + * - LANG_BASIC_ENG: "basic-eng" + * - LANG_TRANSCR_IPA "transcr-ipa" + * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be + * `nullptr`. + * + * @param language Language enum + * @return Language ISO 639-2/B alpha-3 code + */ +const char* NameByLanguage(ELanguage language); + +/** + * Converts language enum to corresponding ISO 639-1 alpha-2 code. For languages missing in ISO + * standard convertions are: + * - LANG_UNK: "mis" + * - LANG_BASIC_RUS: "bas-ru" + * - LANG_EMPTY: "" + * - LANG_UNK_LAT: "" + * - LANG_UNK_CYR: "" + * - LANG_UNK_ALPHA: "" + * - LANG_BASIC_ENG: "bas-en" + * - LANG_TRANSCR_IPA "tr-ipa" + * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be + * `nullptr`. + * + * @param language Language enum + * @return Language ISO 639-1 alpha-2 code + */ +const char* IsoNameByLanguage(ELanguage language); + +/** + * Converts language enum to corresponding human-readable language name. E.g. "Russian" for + * `LANG_RUS` or "Basic Russian" for `LANG_BASIC_RUS`. If language is missing in `ELanguage` or if + * it is a `LANG_MAX` then return value will be `nullptr`. + * + * @param language Language enum + */ +const char* FullNameByLanguage(ELanguage language); + +/** + * Same as `LanguageByNameStrict` but in case of failure will throw `yexception`. + * + * @see LanguageByNameStrict + */ +ELanguage LanguageByNameOrDie(const TStringBuf& name); + +constexpr bool UnknownLanguage(const ELanguage language) noexcept { + return language == LANG_UNK || language == LANG_UNK_LAT || language == LANG_UNK_CYR || language == LANG_UNK_ALPHA || language == LANG_EMPTY; +} + +EScript ScriptByLanguage(ELanguage language); +EScript ScriptByGlyph(wchar32 glyph); + +namespace NCharsetInternal { + void InitScriptData(ui8 data[], size_t len); +} + +inline bool LatinScript(ELanguage language) { + return ScriptByLanguage(language) == SCRIPT_LATIN; +} + +inline bool CyrillicScript(ELanguage language) { + return ScriptByLanguage(language) == SCRIPT_CYRILLIC; +} diff --git a/library/cpp/langs/scripts.h b/library/cpp/langs/scripts.h new file mode 100644 index 0000000000..4c47a33d2c --- /dev/null +++ b/library/cpp/langs/scripts.h @@ -0,0 +1,56 @@ +#pragma once + +#include <util/generic/strbuf.h> + +// Writing systems, a.k.a. scripts +// +enum EScript { + SCRIPT_UNKNOWN = 0, + SCRIPT_LATIN, + SCRIPT_CYRILLIC, + + SCRIPT_GREEK, + SCRIPT_ARABIC, + SCRIPT_HEBREW, + SCRIPT_ARMENIAN, + SCRIPT_GEORGIAN, + + SCRIPT_HAN, + SCRIPT_KATAKANA, + SCRIPT_HIRAGANA, + SCRIPT_HANGUL, + + SCRIPT_DEVANAGARI, + SCRIPT_BENGALI, + SCRIPT_GUJARATI, + SCRIPT_GURMUKHI, + SCRIPT_KANNADA, + SCRIPT_MALAYALAM, + SCRIPT_ORIYA, + SCRIPT_TAMIL, + SCRIPT_TELUGU, + SCRIPT_THAANA, + SCRIPT_SINHALA, + + SCRIPT_MYANMAR, + SCRIPT_THAI, + SCRIPT_LAO, + SCRIPT_KHMER, + SCRIPT_TIBETAN, + SCRIPT_MONGOLIAN, + + SCRIPT_ETHIOPIC, + SCRIPT_RUNIC, + SCRIPT_COPTIC, + SCRIPT_SYRIAC, + + SCRIPT_OTHER, + SCRIPT_MAX +}; + +// According to ISO 15924 codes. See https://en.wikipedia.org/wiki/ISO_15924 +// +EScript ScriptByName(const TStringBuf& name); +EScript ScriptByNameOrDie(const TStringBuf& name); +const char* IsoNameByScript(EScript script); +const char* FullNameByScript(EScript script); diff --git a/ydb/core/cms/console/yaml_config/yaml_config.cpp b/ydb/core/cms/console/yaml_config/yaml_config.cpp index ae4a461526..4582aaa444 100644 --- a/ydb/core/cms/console/yaml_config/yaml_config.cpp +++ b/ydb/core/cms/console/yaml_config/yaml_config.cpp @@ -3,6 +3,17 @@ #include <library/cpp/protobuf/json/json2proto.h> #include <ydb/core/base/appdata.h> +#include <dict/dictutil/hash.h> + +template <> +struct THash<NYamlConfig::TLabel> { + inline size_t operator()(const NYamlConfig::TLabel& value) const { + return CombineHashes(THash<TString>{}(value.Value), (size_t)value.Type); + } +}; + +template <> +struct THash<TVector<NYamlConfig::TLabel>> : public TSimpleRangeHash {}; namespace NYamlConfig { @@ -22,18 +33,6 @@ TString GetKey(const NFyaml::TNodeRef& node, TString key) { return k; } -TString CalcHash(const NFyaml::TDocument& resolved) { - TStringStream ss; - ss << resolved; - TString s = ss.Str(); - SHA256_CTX sha; - SHA256_Init(&sha); - SHA256_Update(&sha, s.data(), s.size()); - unsigned char hash[SHA256_DIGEST_LENGTH]; - SHA256_Final(hash, &sha); - return TString(reinterpret_cast<char*>(hash), sizeof(hash)); -} - bool Fit(const TSelector& selector, const TSet<TNamedLabel>& labels) { bool result = true; size_t matched = 0; @@ -520,6 +519,27 @@ TResolvedConfig ResolveAll(NFyaml::TDocument& doc) return {labelNames, std::move(configs)}; } +size_t Hash(const NFyaml::TNodeRef& resolved) { + TStringStream ss; + ss << resolved; + TString s = ss.Str(); + return THash<TString>{}(s); +} + +size_t Hash(const TResolvedConfig& config) +{ + size_t configsHash = 0; + for (auto& [labelSet, docConfig] : config.Configs) { + for (auto labels : labelSet) { + auto labelsHash = THash<TVector<TLabel>>{}(labels); + configsHash = CombineHashes(labelsHash, configsHash); + } + configsHash = CombineHashes(Hash(docConfig.second), configsHash); + } + + return CombineHashes(THash<TVector<TString>>{}(config.Labels), configsHash); +} + void ValidateVolatileConfig(NFyaml::TDocument& doc) { auto root = doc.Root(); auto seq = root.Sequence(); @@ -558,6 +578,15 @@ void AppendVolatileConfigs(NFyaml::TDocument& config, NFyaml::TDocument& volatil } } +ui64 GetVersion(const TString& config) { + auto parser = NFyaml::TParser::Create(config); + auto header = parser.NextDocument(); + auto str = header->Root().Map().at("version").Scalar(); + ui64 version = 0; + TryFromString<ui64>(str, version); + return version; +} + } // namespace NYamlConfig template <> diff --git a/ydb/core/cms/console/yaml_config/yaml_config.h b/ydb/core/cms/console/yaml_config/yaml_config.h index 85f8007170..08f0c63fad 100644 --- a/ydb/core/cms/console/yaml_config/yaml_config.h +++ b/ydb/core/cms/console/yaml_config/yaml_config.h @@ -4,6 +4,7 @@ #include <library/cpp/actors/core/actor.h> #include <ydb/core/protos/config.pb.h> +#include <ydb/core/protos/console_config.pb.h> #include <openssl/sha.h> @@ -151,6 +152,12 @@ struct TResolvedConfig { TResolvedConfig ResolveAll(NFyaml::TDocument& doc); /** + * Calculates hash of resolved config + * Used to ensure that cli resolves config the same as a server + */ +size_t Hash(const TResolvedConfig& config); + +/** * Validates single YAML volatile config schema */ void ValidateVolatileConfig(NFyaml::TDocument& doc); @@ -160,4 +167,9 @@ void ValidateVolatileConfig(NFyaml::TDocument& doc); */ void AppendVolatileConfigs(NFyaml::TDocument& config, NFyaml::TDocument& volatileConfig); +/** + * Parses config version + */ +ui64 GetVersion(const TString& config); + } // namespace NYamlConfig diff --git a/ydb/core/cms/console/yaml_config/yaml_config_ut.cpp b/ydb/core/cms/console/yaml_config/yaml_config_ut.cpp index 1738cb08a2..fd558e60eb 100644 --- a/ydb/core/cms/console/yaml_config/yaml_config_ut.cpp +++ b/ydb/core/cms/console/yaml_config/yaml_config_ut.cpp @@ -1405,4 +1405,14 @@ Y_UNIT_TEST_SUITE(YamlConfig) { stream << cfg; UNIT_ASSERT_VALUES_EQUAL(stream.Str(), TString(Concatenated)); } + + Y_UNIT_TEST(AppendAndResolve) { + auto cfg = NFyaml::TDocument::Parse(SimpleConfig); + for (int i = 0; i < 4; ++i) { + auto volatilePart = NFyaml::TDocument::Parse(VolatilePart); + NYamlConfig::AppendVolatileConfigs(cfg, volatilePart); + } + TStringStream stream; + stream << cfg; + } } |