diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/langs/scripts.cpp | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/langs/scripts.cpp')
-rw-r--r-- | library/cpp/langs/scripts.cpp | 158 |
1 files changed, 158 insertions, 0 deletions
diff --git a/library/cpp/langs/scripts.cpp b/library/cpp/langs/scripts.cpp new file mode 100644 index 0000000000..41cc91d3ce --- /dev/null +++ b/library/cpp/langs/scripts.cpp @@ -0,0 +1,158 @@ +#include "scripts.h" + +#include <library/cpp/digest/lower_case/hash_ops.h> + +#include <util/generic/hash.h> +#include <util/generic/singleton.h> +#include <util/generic/strbuf.h> +#include <util/generic/yexception.h> +#include <util/system/defaults.h> + +#include <array> + +namespace { + struct TScriptNameAndEnum { + EScript Script; + const char* EnglishName; + const char* IsoName; + }; + + const TScriptNameAndEnum ScriptNameAndEnum[] = { + {SCRIPT_UNKNOWN, "Unknown", "Zzzz"}, + {SCRIPT_LATIN, "Latin", "Latn"}, + {SCRIPT_CYRILLIC, "Cyrillic", "Cyrl"}, + + {SCRIPT_GREEK, "Greek", "Grek"}, + {SCRIPT_ARABIC, "Arabic", "Arab"}, + {SCRIPT_HEBREW, "Hebrew", "Hebr"}, + {SCRIPT_ARMENIAN, "Armenian", "Armn"}, + {SCRIPT_GEORGIAN, "Georgian", "Geor"}, + + {SCRIPT_HAN, "Han", "Hans"}, // We use more common Simpliied variant (as opposed to Traditional 'Hant') + {SCRIPT_KATAKANA, "Katakana", "Kana"}, + {SCRIPT_HIRAGANA, "Hiragana", "Hira"}, + {SCRIPT_HANGUL, "Hangul", "Hang"}, + + {SCRIPT_DEVANAGARI, "Devanagari", "Deva"}, + {SCRIPT_BENGALI, "Bengali", "Beng"}, + {SCRIPT_GUJARATI, "Gujarati", "Gujr"}, + {SCRIPT_GURMUKHI, "Gurmukhi", "Guru"}, + {SCRIPT_KANNADA, "Kannada", "Knda"}, + {SCRIPT_MALAYALAM, "Malayalam", "Mlym"}, + {SCRIPT_ORIYA, "Oriya", "Orya"}, + {SCRIPT_TAMIL, "Tamil", "Taml"}, + {SCRIPT_TELUGU, "Telugu", "Telu"}, + {SCRIPT_THAANA, "Thaana", "Thaa"}, + {SCRIPT_SINHALA, "Sinhala", "Sinh"}, + + {SCRIPT_MYANMAR, "Myanmar", "Mymr"}, + {SCRIPT_THAI, "Thai", "Thai"}, + {SCRIPT_LAO, "Lao", "Laoo"}, + {SCRIPT_KHMER, "Khmer", "Khmr"}, + {SCRIPT_TIBETAN, "Tibetan", "Tibt"}, + {SCRIPT_MONGOLIAN, "Mongolian", "Mong"}, + + {SCRIPT_ETHIOPIC, "Ethiopic", "Ethi"}, + {SCRIPT_RUNIC, "Runic", "Runr"}, + {SCRIPT_COPTIC, "Coptic", "Copt"}, + {SCRIPT_SYRIAC, "Syriac", "Syrc"}, + + {SCRIPT_OTHER, "Other", "Zyyy"}, + }; + + static_assert(static_cast<size_t>(SCRIPT_MAX) == Y_ARRAY_SIZE(ScriptNameAndEnum), "Size doesn't match"); + + class TScriptsMap { + private: + static const char* const EMPTY_NAME; + + using TNamesHash = THashMap<TStringBuf, EScript, TCIOps, TCIOps>; + TNamesHash Hash; + + using TNamesArray = std::array<const char*, static_cast<size_t>(SCRIPT_MAX)>; + TNamesArray IsoNames; + TNamesArray FullNames; + + private: + void AddNameToHash(const TStringBuf& name, EScript script) { + if (Hash.find(name) != Hash.end()) { + Y_ASSERT(Hash.find(name)->second == script); + return; + } + + Hash[name] = script; + } + + void AddName(const char* name, EScript script, TNamesArray& names) { + if (name == nullptr || strlen(name) == 0) + return; + + Y_ASSERT(names[script] == EMPTY_NAME); + names[script] = name; + + AddNameToHash(name, script); + } + + public: + TScriptsMap() { + IsoNames.fill(EMPTY_NAME); + FullNames.fill(EMPTY_NAME); + + for (const auto& val : ScriptNameAndEnum) { + EScript script = val.Script; + + AddName(val.IsoName, script, IsoNames); + AddName(val.EnglishName, script, FullNames); + } + } + + public: + inline EScript ScriptByName(const TStringBuf& name, EScript def) const { + if (!name) + return def; + + TNamesHash::const_iterator i = Hash.find(name); + if (i == Hash.end()) { + return def; + } + + return i->second; + } + + inline const char* FullNameByScript(EScript script) const { + if (script < 0 || static_cast<size_t>(script) >= FullNames.size()) + return nullptr; + + return FullNames[script]; + } + + inline const char* IsoNameByScript(EScript script) const { + if (script < 0 || static_cast<size_t>(script) >= IsoNames.size()) + return nullptr; + + return IsoNames[script]; + } + }; +} + +const char* const TScriptsMap::EMPTY_NAME = ""; + +const char* FullNameByScript(EScript script) { + return Singleton<TScriptsMap>()->FullNameByScript(script); +} + +const char* IsoNameByScript(EScript script) { + return Singleton<TScriptsMap>()->IsoNameByScript(script); +} + +EScript ScriptByName(const TStringBuf& name) { + return Singleton<TScriptsMap>()->ScriptByName(name, SCRIPT_UNKNOWN); +} + +EScript ScriptByNameOrDie(const TStringBuf& name) { + EScript result = ScriptByName(name); + if (result == SCRIPT_UNKNOWN) { + ythrow yexception() << "ScriptByNameOrDie: invalid script '" << name << "'"; + } + return result; +} |