aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorinnokentii <innokentii@yandex-team.com>2023-03-24 18:21:43 +0300
committerinnokentii <innokentii@yandex-team.com>2023-03-24 18:21:43 +0300
commita1dece0299855c161ae585b31c50155acf577294 (patch)
tree3b04be14df5ee3093cc1c49423efbfcd64cba22d
parent833d012d60f469736200a888fc73dda823a9be4b (diff)
downloadydb-a1dece0299855c161ae585b31c50155acf577294.tar.gz
Add yaml config utils:
add yaml config utils
-rw-r--r--library/cpp/langs/langs.h229
-rw-r--r--library/cpp/langs/scripts.h56
-rw-r--r--ydb/core/cms/console/yaml_config/yaml_config.cpp53
-rw-r--r--ydb/core/cms/console/yaml_config/yaml_config.h12
-rw-r--r--ydb/core/cms/console/yaml_config/yaml_config_ut.cpp10
5 files changed, 348 insertions, 12 deletions
diff --git a/library/cpp/langs/langs.h b/library/cpp/langs/langs.h
new file mode 100644
index 0000000000..360ab6a832
--- /dev/null
+++ b/library/cpp/langs/langs.h
@@ -0,0 +1,229 @@
+#pragma once
+
+#include "scripts.h"
+
+#include <util/generic/strbuf.h>
+#include <util/system/defaults.h>
+
+#if defined(_win_)
+// LANG_LAO is #defined in WinNT.h; undefine it so it can be used as an enumerator below
+#undef LANG_LAO
+#endif
+
+// Language names are given according to ISO 639-2/B
+// Some languages are not present in ISO 639-2/B. Then ISO 639-3 is used.
+// http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+//
+// Every enumerator except LANG_MAX is given an explicit numeric value.
+// NOTE(review): presumably these values are stored or exchanged elsewhere, so existing
+// entries should not be renumbered -- confirm before changing any of them.
+enum ELanguage {
+ LANG_UNK = 0, // Unknown
+ LANG_RUS = 1, // Russian
+ LANG_ENG = 2, // English
+ LANG_POL = 3, // Polish
+ LANG_HUN = 4, // Hungarian
+ LANG_UKR = 5, // Ukrainian
+ LANG_GER = 6, // German
+ LANG_FRE = 7, // French
+ LANG_TAT = 8, // Tatar
+ LANG_BEL = 9, // Belarusian
+ LANG_KAZ = 10, // Kazakh
+ LANG_ALB = 11, // Albanian
+ LANG_SPA = 12, // Spanish
+ LANG_ITA = 13, // Italian
+ LANG_ARM = 14, // Armenian
+ LANG_DAN = 15, // Danish
+ LANG_POR = 16, // Portuguese
+ LANG_ICE = 17, // Icelandic
+ LANG_SLO = 18, // Slovak
+ LANG_SLV = 19, // Slovene
+ LANG_DUT = 20, // Dutch (Netherlandish language)
+ LANG_BUL = 21, // Bulgarian
+ LANG_CAT = 22, // Catalan
+ LANG_HRV = 23, // Croatian
+ LANG_CZE = 24, // Czech
+ LANG_GRE = 25, // Greek
+ LANG_HEB = 26, // Hebrew
+ LANG_NOR = 27, // Norwegian
+ LANG_MAC = 28, // Macedonian
+ LANG_SWE = 29, // Swedish
+ LANG_KOR = 30, // Korean
+ LANG_LAT = 31, // Latin
+ LANG_BASIC_RUS = 32, // Simplified version of Russian (used at lemmer only)
+ LANG_BOS = 33, // Bosnian
+ LANG_MLT = 34, // Maltese
+ LANG_EMPTY = 35, // Indicate that document is empty
+ LANG_UNK_LAT = 36, // Any unrecognized latin language
+ LANG_UNK_CYR = 37, // Any unrecognized cyrillic language
+ LANG_UNK_ALPHA = 38, // Any unrecognized alphabetic language not fit into previous categories
+ LANG_FIN = 39, // Finnish
+ LANG_EST = 40, // Estonian
+ LANG_LAV = 41, // Latvian
+ LANG_LIT = 42, // Lithuanian
+ LANG_BAK = 43, // Bashkir
+ LANG_TUR = 44, // Turkish
+ LANG_RUM = 45, // Romanian (also Moldavian)
+ LANG_MON = 46, // Mongolian
+ LANG_UZB = 47, // Uzbek
+ LANG_KIR = 48, // Kirghiz
+ LANG_TGK = 49, // Tajik
+ LANG_TUK = 50, // Turkmen
+ LANG_SRP = 51, // Serbian
+ LANG_AZE = 52, // Azerbaijani
+ LANG_BASIC_ENG = 53, // Simplified version of English (used at lemmer only)
+ LANG_GEO = 54, // Georgian
+ LANG_ARA = 55, // Arabic
+ LANG_PER = 56, // Persian
+ LANG_CHU = 57, // Church Slavonic
+ LANG_CHI = 58, // Chinese
+ LANG_JPN = 59, // Japanese
+ LANG_IND = 60, // Indonesian
+ LANG_MAY = 61, // Malay
+ LANG_THA = 62, // Thai
+ LANG_VIE = 63, // Vietnamese
+ LANG_GLE = 64, // Irish (Gaelic)
+ LANG_TGL = 65, // Tagalog (Filipino)
+ LANG_HIN = 66, // Hindi
+ LANG_AFR = 67, // Afrikaans
+ LANG_URD = 68, // Urdu
+ LANG_MYA = 69, // Burmese
+ LANG_KHM = 70, // Khmer
+ LANG_LAO = 71, // Lao
+ LANG_TAM = 72, // Tamil
+ LANG_BEN = 73, // Bengali
+ LANG_GUJ = 74, // Gujarati
+ LANG_KAN = 75, // Kannada
+ LANG_PAN = 76, // Punjabi
+ LANG_SIN = 77, // Sinhalese
+ LANG_SWA = 78, // Swahili
+ LANG_BAQ = 79, // Basque
+ LANG_WEL = 80, // Welsh
+ LANG_GLG = 81, // Galician
+ LANG_HAT = 82, // Haitian Creole
+ LANG_MLG = 83, // Malagasy
+ LANG_CHV = 84, // Chuvash
+ LANG_UDM = 85, // Udmurt
+ LANG_KPV = 86, // Komi-Zyrian
+ LANG_MHR = 87, // Meadow Mari (Eastern Mari)
+ LANG_SJN = 88, // Sindarin
+ LANG_MRJ = 89, // Hill Mari (Western Mari)
+ LANG_KOI = 90, // Komi-Permyak
+ LANG_LTZ = 91, // Luxembourgish
+ LANG_GLA = 92, // Scottish Gaelic
+ LANG_CEB = 93, // Cebuano
+ LANG_PUS = 94, // Pashto
+ LANG_KMR = 95, // Kurmanji
+ LANG_AMH = 96, // Amharic
+ LANG_ZUL = 97, // Zulu
+ LANG_IBO = 98, // Igbo
+ LANG_YOR = 99, // Yoruba
+ LANG_COS = 100, // Corsican
+ LANG_XHO = 101, // Xhosa
+ LANG_JAV = 102, // Javanese
+ LANG_NEP = 103, // Nepali
+ LANG_SND = 104, // Sindhi
+ LANG_SOM = 105, // Somali
+ LANG_EPO = 106, // Esperanto
+ LANG_TEL = 107, // Telugu
+ LANG_MAR = 108, // Marathi
+ LANG_HAU = 109, // Hausa
+ LANG_YID = 110, // Yiddish
+ LANG_MAL = 111, // Malayalam
+ LANG_MAO = 112, // Maori
+ LANG_SUN = 113, // Sundanese
+ LANG_PAP = 114, // Papiamento
+ LANG_UZB_CYR = 115, // Cyrillic Uzbek
+ LANG_TRANSCR_IPA = 116, // International Phonetic Alphabet Transcription
+ LANG_EMJ = 117, // Emoji
+ LANG_UYG = 118, // Uyghur
+ LANG_BRE = 119, // Breton
+ LANG_SAH = 120, // Yakut
+ LANG_KAZ_LAT = 121, // Latin Kazakh
+ // Sentinel, not a language: implicitly one greater than the last explicit value above.
+ // LanguageByNameStrict() returns it to signal failure.
+ LANG_MAX
+};
+
+/**
+ * Converts string to corresponding enum. Will try to extract the primary language code from
+ * constructions like "en-cockney" or "zh_Hant". In case of failure will return `LANG_UNK`.
+ *
+ * @param name Language name
+ * @return Language enum
+ */
+ELanguage LanguageByName(const TStringBuf& name);
+
+/**
+ * Same as `LanguageByName`, but in case of failure will return `LANG_MAX`.
+ *
+ * @see LanguageByName
+ */
+ELanguage LanguageByNameStrict(const TStringBuf& name);
+
+/**
+ * Converts language enum to corresponding ISO 639-2/B alpha-3 code. For languages missing in the
+ * ISO standard the conversions are:
+ * - LANG_UNK: "unk"
+ * - LANG_BASIC_RUS: "basic-rus"
+ * - LANG_EMPTY: "empty"
+ * - LANG_UNK_LAT: "unklat"
+ * - LANG_UNK_CYR: "unkcyr"
+ * - LANG_UNK_ALPHA: "unkalpha"
+ * - LANG_BASIC_ENG: "basic-eng"
+ * - LANG_TRANSCR_IPA: "transcr-ipa"
+ * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be
+ * `nullptr`.
+ *
+ * @param language Language enum
+ * @return Language ISO 639-2/B alpha-3 code
+ */
+const char* NameByLanguage(ELanguage language);
+
+/**
+ * Converts language enum to corresponding ISO 639-1 alpha-2 code. For languages missing in the
+ * ISO standard the conversions are:
+ * - LANG_UNK: "mis"
+ * - LANG_BASIC_RUS: "bas-ru"
+ * - LANG_EMPTY: ""
+ * - LANG_UNK_LAT: ""
+ * - LANG_UNK_CYR: ""
+ * - LANG_UNK_ALPHA: ""
+ * - LANG_BASIC_ENG: "bas-en"
+ * - LANG_TRANSCR_IPA: "tr-ipa"
+ * If language is missing in `ELanguage` or if it is a `LANG_MAX` then return value will be
+ * `nullptr`.
+ *
+ * @param language Language enum
+ * @return Language ISO 639-1 alpha-2 code
+ */
+const char* IsoNameByLanguage(ELanguage language);
+
+/**
+ * Converts language enum to corresponding human-readable language name. E.g. "Russian" for
+ * `LANG_RUS` or "Basic Russian" for `LANG_BASIC_RUS`. If language is missing in `ELanguage` or if
+ * it is a `LANG_MAX` then return value will be `nullptr`.
+ *
+ * @param language Language enum
+ */
+const char* FullNameByLanguage(ELanguage language);
+
+/**
+ * Same as `LanguageByNameStrict` but in case of failure will throw `yexception`.
+ *
+ * @see LanguageByNameStrict
+ */
+ELanguage LanguageByNameOrDie(const TStringBuf& name);
+
+/**
+ * Tells whether `language` is one of the placeholder values that do not name a concrete
+ * language: `LANG_UNK`, `LANG_UNK_LAT`, `LANG_UNK_CYR`, `LANG_UNK_ALPHA` or `LANG_EMPTY`.
+ *
+ * @param language Language enum
+ * @return `true` for the placeholder values listed above, `false` otherwise
+ */
+constexpr bool UnknownLanguage(const ELanguage language) noexcept {
+    switch (language) {
+        case LANG_UNK:
+        case LANG_UNK_LAT:
+        case LANG_UNK_CYR:
+        case LANG_UNK_ALPHA:
+        case LANG_EMPTY:
+            return true;
+        default:
+            return false;
+    }
+}
+
+EScript ScriptByLanguage(ELanguage language);
+EScript ScriptByGlyph(wchar32 glyph);
+
+namespace NCharsetInternal {
+ void InitScriptData(ui8 data[], size_t len);
+}
+
+/**
+ * @param language Language enum
+ * @return `true` when `ScriptByLanguage` maps `language` to `SCRIPT_LATIN`
+ */
+inline bool LatinScript(ELanguage language) {
+    const EScript script = ScriptByLanguage(language);
+    return script == SCRIPT_LATIN;
+}
+
+/**
+ * @param language Language enum
+ * @return `true` when `ScriptByLanguage` maps `language` to `SCRIPT_CYRILLIC`
+ */
+inline bool CyrillicScript(ELanguage language) {
+    const EScript script = ScriptByLanguage(language);
+    return script == SCRIPT_CYRILLIC;
+}
diff --git a/library/cpp/langs/scripts.h b/library/cpp/langs/scripts.h
new file mode 100644
index 0000000000..4c47a33d2c
--- /dev/null
+++ b/library/cpp/langs/scripts.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <util/generic/strbuf.h>
+
+// Writing systems, a.k.a. scripts
+//
+// Only SCRIPT_UNKNOWN has an explicit value; every later enumerator takes the next
+// consecutive value, so inserting or reordering entries renumbers everything after them.
+enum EScript {
+ SCRIPT_UNKNOWN = 0,
+ SCRIPT_LATIN,
+ SCRIPT_CYRILLIC,
+
+ SCRIPT_GREEK,
+ SCRIPT_ARABIC,
+ SCRIPT_HEBREW,
+ SCRIPT_ARMENIAN,
+ SCRIPT_GEORGIAN,
+
+ SCRIPT_HAN,
+ SCRIPT_KATAKANA,
+ SCRIPT_HIRAGANA,
+ SCRIPT_HANGUL,
+
+ SCRIPT_DEVANAGARI,
+ SCRIPT_BENGALI,
+ SCRIPT_GUJARATI,
+ SCRIPT_GURMUKHI,
+ SCRIPT_KANNADA,
+ SCRIPT_MALAYALAM,
+ SCRIPT_ORIYA,
+ SCRIPT_TAMIL,
+ SCRIPT_TELUGU,
+ SCRIPT_THAANA,
+ SCRIPT_SINHALA,
+
+ SCRIPT_MYANMAR,
+ SCRIPT_THAI,
+ SCRIPT_LAO,
+ SCRIPT_KHMER,
+ SCRIPT_TIBETAN,
+ SCRIPT_MONGOLIAN,
+
+ SCRIPT_ETHIOPIC,
+ SCRIPT_RUNIC,
+ SCRIPT_COPTIC,
+ SCRIPT_SYRIAC,
+
+ SCRIPT_OTHER,
+ // Sentinel: its implicit value equals the number of enumerators declared before it.
+ SCRIPT_MAX
+};
+
+// According to ISO 15924 codes. See https://en.wikipedia.org/wiki/ISO_15924
+//
+EScript ScriptByName(const TStringBuf& name);
+EScript ScriptByNameOrDie(const TStringBuf& name);
+const char* IsoNameByScript(EScript script);
+const char* FullNameByScript(EScript script);
diff --git a/ydb/core/cms/console/yaml_config/yaml_config.cpp b/ydb/core/cms/console/yaml_config/yaml_config.cpp
index ae4a461526..4582aaa444 100644
--- a/ydb/core/cms/console/yaml_config/yaml_config.cpp
+++ b/ydb/core/cms/console/yaml_config/yaml_config.cpp
@@ -3,6 +3,17 @@
#include <library/cpp/protobuf/json/json2proto.h>
#include <ydb/core/base/appdata.h>
+#include <dict/dictutil/hash.h>
+
+// Hash support for NYamlConfig::TLabel: combines the hash of the label's Value string
+// with its Type converted to size_t.
+template <>
+struct THash<NYamlConfig::TLabel> {
+    size_t operator()(const NYamlConfig::TLabel& value) const {
+        const size_t valueHash = THash<TString>{}(value.Value);
+        const size_t typeHash = static_cast<size_t>(value.Type);
+        return CombineHashes(valueHash, typeHash);
+    }
+};
+
+// Hash support for a vector of labels; the implementation is inherited from TSimpleRangeHash.
+template <>
+struct THash<TVector<NYamlConfig::TLabel>> : TSimpleRangeHash {};
namespace NYamlConfig {
@@ -22,18 +33,6 @@ TString GetKey(const NFyaml::TNodeRef& node, TString key) {
return k;
}
-TString CalcHash(const NFyaml::TDocument& resolved) {
- TStringStream ss;
- ss << resolved;
- TString s = ss.Str();
- SHA256_CTX sha;
- SHA256_Init(&sha);
- SHA256_Update(&sha, s.data(), s.size());
- unsigned char hash[SHA256_DIGEST_LENGTH];
- SHA256_Final(hash, &sha);
- return TString(reinterpret_cast<char*>(hash), sizeof(hash));
-}
-
bool Fit(const TSelector& selector, const TSet<TNamedLabel>& labels) {
bool result = true;
size_t matched = 0;
@@ -520,6 +519,27 @@ TResolvedConfig ResolveAll(NFyaml::TDocument& doc)
return {labelNames, std::move(configs)};
}
+// Hashes a YAML node by streaming it into a string and hashing that text with THash<TString>.
+size_t Hash(const NFyaml::TNodeRef& resolved) {
+    TStringStream serialized;
+    serialized << resolved;
+    return THash<TString>{}(serialized.Str());
+}
+
+// Computes one hash value for a whole resolved config.
+//
+// For every entry of config.Configs the hash of each label vector in the entry's label set
+// and then the hash of the entry's node (docConfig.second) are folded into an accumulator
+// in iteration order. The accumulator is finally combined with the hash of config.Labels.
+// NOTE(review): the result therefore follows the iteration order of config.Configs and of
+// each label set -- confirm those containers iterate deterministically.
+size_t Hash(const TResolvedConfig& config)
+{
+    size_t configsHash = 0;
+    for (const auto& [labelSet, docConfig] : config.Configs) {
+        // Hashing only reads the vector, so bind by reference instead of copying it.
+        for (const auto& labels : labelSet) {
+            const auto labelsHash = THash<TVector<TLabel>>{}(labels);
+            configsHash = CombineHashes(labelsHash, configsHash);
+        }
+        configsHash = CombineHashes(Hash(docConfig.second), configsHash);
+    }
+
+    return CombineHashes(THash<TVector<TString>>{}(config.Labels), configsHash);
+}
+
void ValidateVolatileConfig(NFyaml::TDocument& doc) {
auto root = doc.Root();
auto seq = root.Sequence();
@@ -558,6 +578,15 @@ void AppendVolatileConfigs(NFyaml::TDocument& config, NFyaml::TDocument& volatil
}
}
+// Extracts the numeric "version" field from the root mapping of the first YAML document
+// found in `config`.
+//
+// Returns 0 when the text contains no document or when the "version" scalar is not a
+// valid ui64.
+// NOTE(review): a missing "version" key is handled by Map().at(), which presumably
+// throws -- confirm that callers expect this.
+ui64 GetVersion(const TString& config) {
+    auto parser = NFyaml::TParser::Create(config);
+    auto header = parser.NextDocument();
+    if (!header) {
+        // Nothing was parsed; dereferencing `header` below would be invalid.
+        return 0;
+    }
+    auto str = header->Root().Map().at("version").Scalar();
+    ui64 version = 0;
+    // An unparsable version deliberately falls back to 0 rather than failing.
+    if (!TryFromString<ui64>(str, version)) {
+        return 0;
+    }
+    return version;
+}
+
} // namespace NYamlConfig
template <>
diff --git a/ydb/core/cms/console/yaml_config/yaml_config.h b/ydb/core/cms/console/yaml_config/yaml_config.h
index 85f8007170..08f0c63fad 100644
--- a/ydb/core/cms/console/yaml_config/yaml_config.h
+++ b/ydb/core/cms/console/yaml_config/yaml_config.h
@@ -4,6 +4,7 @@
#include <library/cpp/actors/core/actor.h>
#include <ydb/core/protos/config.pb.h>
+#include <ydb/core/protos/console_config.pb.h>
#include <openssl/sha.h>
@@ -151,6 +152,12 @@ struct TResolvedConfig {
TResolvedConfig ResolveAll(NFyaml::TDocument& doc);
/**
+ * Calculates the hash of a resolved config.
+ * Used to ensure that the CLI resolves a config the same way as the server does.
+ */
+size_t Hash(const TResolvedConfig& config);
+
+/**
* Validates single YAML volatile config schema
*/
void ValidateVolatileConfig(NFyaml::TDocument& doc);
@@ -160,4 +167,9 @@ void ValidateVolatileConfig(NFyaml::TDocument& doc);
*/
void AppendVolatileConfigs(NFyaml::TDocument& config, NFyaml::TDocument& volatileConfig);
+/**
+ * Parses config version
+ */
+ui64 GetVersion(const TString& config);
+
} // namespace NYamlConfig
diff --git a/ydb/core/cms/console/yaml_config/yaml_config_ut.cpp b/ydb/core/cms/console/yaml_config/yaml_config_ut.cpp
index 1738cb08a2..fd558e60eb 100644
--- a/ydb/core/cms/console/yaml_config/yaml_config_ut.cpp
+++ b/ydb/core/cms/console/yaml_config/yaml_config_ut.cpp
@@ -1405,4 +1405,14 @@ Y_UNIT_TEST_SUITE(YamlConfig) {
stream << cfg;
UNIT_ASSERT_VALUES_EQUAL(stream.Str(), TString(Concatenated));
}
+
+ // Smoke test: appends a freshly parsed copy of VolatilePart to SimpleConfig four times and
+ // then serializes the result. Nothing is asserted, so the test can only fail if one of
+ // these steps throws or crashes.
+ // NOTE(review): despite the name, no resolve step is invoked here -- confirm whether a
+ // resolve call and an assertion on the serialized output were intended.
+ Y_UNIT_TEST(AppendAndResolve) {
+ auto cfg = NFyaml::TDocument::Parse(SimpleConfig);
+ for (int i = 0; i < 4; ++i) {
+ // A new document is parsed on every iteration before being appended to cfg.
+ auto volatilePart = NFyaml::TDocument::Parse(VolatilePart);
+ NYamlConfig::AppendVolatileConfigs(cfg, volatilePart);
+ }
+ // The serialized text is produced but never inspected.
+ TStringStream stream;
+ stream << cfg;
+ }
}