aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/langmask
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
committerqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
commit22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
treebffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/langmask
parent332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
downloadydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/langmask')
-rw-r--r--library/cpp/langmask/README.md4
-rw-r--r--library/cpp/langmask/langmask.cpp149
-rw-r--r--library/cpp/langmask/langmask.h120
-rw-r--r--library/cpp/langmask/proto/langmask.proto6
4 files changed, 279 insertions, 0 deletions
diff --git a/library/cpp/langmask/README.md b/library/cpp/langmask/README.md
new file mode 100644
index 0000000000..efb6cb9313
--- /dev/null
+++ b/library/cpp/langmask/README.md
@@ -0,0 +1,4 @@
+Здесь представлен класс [`TLangMask`](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langmask/langmask.h) для битовой маски [enum'ов с языками](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs).
+
+Определено несколько [стандартных языковых масок](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langmask/langmask.h?rev=r6913473#L64-69), в основном используемых в индексаторе.
+Имеются [функции](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langmask/serialization/langmask.h) для сериализации/десериализации языковых масок.
diff --git a/library/cpp/langmask/langmask.cpp b/library/cpp/langmask/langmask.cpp
new file mode 100644
index 0000000000..19c3426864
--- /dev/null
+++ b/library/cpp/langmask/langmask.cpp
@@ -0,0 +1,149 @@
+#include "langmask.h"
+
+#include <util/generic/hash.h>
+#include <util/generic/singleton.h>
+#include <util/string/split.h>
+#include <util/string/strip.h>
+#include <util/system/compiler.h>
+
+#include <array>
+
+namespace NLanguageMasks {
+ namespace {
+     // Script table that populates itself on construction: every ELanguage
+     // value below LANG_MAX for which UnknownLanguage() is false is added to
+     // the mask stored under that language's script.
+     struct TScriptMapX: public TScriptMap {
+         TScriptMapX() {
+             for (size_t index = 0; index != LANG_MAX; ++index) {
+                 const ELanguage lang = static_cast<ELanguage>(index);
+                 if (UnknownLanguage(lang)) {
+                     continue;
+                 }
+                 // operator[] creates the per-script mask on first use.
+                 TLangMask& sameScript = (*this)[ScriptByLanguage(lang)];
+                 sameScript.SafeSet(lang);
+             }
+         }
+     };
+ }
+
+ // Returns the script table instance held by Singleton<TScriptMapX>.
+ const TScriptMap& ScriptMap() {
+     const TScriptMapX* table = Singleton<TScriptMapX>();
+     return *table;
+ }
+
+ // Returns the mask of all languages whose script is SCRIPT_CYRILLIC.
+ // Goes through SameScriptLanguages(EScript) so that a script missing from
+ // the table yields an empty mask instead of dereferencing end().
+ const TLangMask& CyrillicLanguagesExt() {
+     return SameScriptLanguages(SCRIPT_CYRILLIC);
+ }
+
+ // Returns the mask of all languages whose script is SCRIPT_LATIN.
+ // Goes through SameScriptLanguages(EScript) so that a script missing from
+ // the table yields an empty mask instead of dereferencing end().
+ const TLangMask& LatinLanguages() {
+     return SameScriptLanguages(SCRIPT_LATIN);
+ }
+
+ // Returns the mask stored for `scr` in the script table, or a shared empty
+ // mask when the table has no entry for that script.
+ const TLangMask& SameScriptLanguages(EScript scr) {
+     static const TLangMask noLanguages;
+     const TScriptMap& scripts = ScriptMap();
+     const auto found = scripts.find(scr);
+     if (found == scripts.end()) {
+         return noLanguages;
+     }
+     return found->second;
+ }
+
+ // Returns the union of the script-table masks of every script used by a
+ // language in `src`. `src` is taken by value and is consumed while iterating.
+ TLangMask SameScriptLanguages(TLangMask src) {
+     TLangMask result;
+     for (auto lang : src) {
+         const TScriptMap& scripts = ScriptMap();
+         const auto entry = scripts.find(ScriptByLanguage(lang));
+         if (entry == scripts.end()) {
+             continue;
+         }
+         result |= entry->second;
+         // Languages of this script are already covered by `result`, so they
+         // are cleared from the local copy being iterated.
+         src &= ~entry->second; // don't need others using the same script
+     }
+     return result;
+ }
+
+ // Builds a mask from a comma-separated list of language names.
+ // Empty items are skipped and each name is stripped before being passed to
+ // `langGetter`. Throws yexception when `langGetter` returns LANG_MAX.
+ template <typename T>
+ TLangMask CreateFromListImpl(const TString& list, T langGetter) {
+     TVector<TString> names;
+     StringSplitter(list).Split(',').SkipEmpty().Collect(&names);
+
+     TLangMask mask;
+     for (const TString& name : names) {
+         const ELanguage lang = langGetter(Strip(name).data());
+         if (lang == LANG_MAX) {
+             ythrow yexception() << "Unknown language: " << name;
+         }
+         mask.SafeSet(lang);
+     }
+     return mask;
+ }
+
+ // Parses `list` resolving each name with LanguageByNameStrict.
+ TLangMask CreateFromList(const TString& list) {
+     TLangMask parsed = CreateFromListImpl(list, LanguageByNameStrict);
+     return parsed;
+ }
+
+ // Parses `list` resolving each name with LanguageByName.
+ TLangMask SafeCreateFromList(const TString& list) {
+     TLangMask parsed = CreateFromListImpl(list, LanguageByName);
+     return parsed;
+ }
+
+ // Renders the mask as language names joined with ','.
+ // An empty mask is rendered as the name of LANG_UNK.
+ TString ToString(const TLangMask& langMask) {
+     if (langMask.Empty()) {
+         return NameByLanguage(LANG_UNK);
+     }
+
+     TString joined;
+     for (auto lang : langMask) {
+         // Separator goes before every name once something has been written.
+         if (!joined.empty()) {
+             joined += ",";
+         }
+         joined += NameByLanguage(lang);
+     }
+     return joined;
+ }
+}
+
+namespace {
+    // Lookup tables between ELanguage values and the legacy bit-flag language
+    // ids used by TOldLanguageEncoder:
+    //   Direct  - indexed by ELanguage, yields the legacy id;
+    //   Reverse - keyed by legacy id, yields the ELanguage.
+    struct TNewLanguageEnumToOldLanguageHelper {
+        TNewLanguageEnumToOldLanguageHelper() {
+            static const TOldLanguageEncoder::TLanguageId LI_UNKNOWN = 0x00000000; // special code - shall be zero
+            static const TOldLanguageEncoder::TLanguageId LI_ENGLISH = 0x00000001;
+            static const TOldLanguageEncoder::TLanguageId LI_RUSSIAN = 0x00000002;
+            static const TOldLanguageEncoder::TLanguageId LI_POLISH = 0x00000004;
+            static const TOldLanguageEncoder::TLanguageId LI_UKRAINIAN = 0x00000008;
+            static const TOldLanguageEncoder::TLanguageId LI_GERMAN = 0x00000010;
+            static const TOldLanguageEncoder::TLanguageId LI_FRENCH = 0x00000020;
+            // Beware: a hole should be left at 0x40 - 0x80,
+            // to prevent overlap with CC_UPPERCASE / CC_TITLECASE
+            static const TOldLanguageEncoder::TLanguageId LI_HUNGARIAN = 0x00000100;
+            // static const TOldLanguageEncoder::TLanguageId LI_UKRAINIAN_ABBYY = 0x00000200;
+            static const TOldLanguageEncoder::TLanguageId LI_ITALIAN = 0x00000400;
+            static const TOldLanguageEncoder::TLanguageId LI_BELORUSSIAN = 0x00000800;
+            static const TOldLanguageEncoder::TLanguageId LI_KAZAKH = 0x00008000;
+
+            // Languages without a legacy id map to LI_UNKNOWN. The table must
+            // be filled explicitly: Direct is not initialized anywhere else,
+            // and the loop below reads every element of it.
+            Direct.fill(LI_UNKNOWN);
+
+            Direct[LANG_UNK] = LI_UNKNOWN;
+            Direct[LANG_ENG] = LI_ENGLISH;
+            Direct[LANG_RUS] = LI_RUSSIAN;
+            Direct[LANG_POL] = LI_POLISH;
+            Direct[LANG_UKR] = LI_UKRAINIAN;
+            Direct[LANG_GER] = LI_GERMAN;
+            Direct[LANG_FRE] = LI_FRENCH;
+            Direct[LANG_HUN] = LI_HUNGARIAN;
+            // Direct[] = LI_UKRAINIAN_ABBYY;
+            Direct[LANG_ITA] = LI_ITALIAN;
+            Direct[LANG_BEL] = LI_BELORUSSIAN;
+            Direct[LANG_KAZ] = LI_KAZAKH;
+
+            // Walk from the highest index down: when several languages share
+            // an id (all the LI_UNKNOWN ones), the lowest index is written
+            // last and is the one that stays in Reverse.
+            for (auto i = Direct.size(); i > 0; --i) {
+                Reverse[Direct[i - 1]] = static_cast<ELanguage>(i - 1);
+            }
+
+            Y_ENSURE(LANG_UNK == Reverse.find(LI_UNKNOWN)->second, "Must be equal");
+        }
+
+        THashMap< ::TOldLanguageEncoder::TLanguageId, ELanguage> Reverse;
+        std::array< ::TOldLanguageEncoder::TLanguageId, static_cast<size_t>(LANG_MAX)> Direct;
+    };
+}
+
+// Returns the legacy id stored in the Direct table for `l`.
+// A value outside the table is looked up as LANG_UNK.
+TOldLanguageEncoder::TLanguageId TOldLanguageEncoder::ToOld(ELanguage l) {
+    const auto& tables = Default<TNewLanguageEnumToOldLanguageHelper>();
+    const bool outOfRange = static_cast<size_t>(l) >= tables.Direct.size();
+    const ELanguage key = Y_UNLIKELY(outOfRange) ? LANG_UNK : l;
+    return tables.Direct[key];
+}
+
+// Returns the language registered for the legacy id `l` in the Reverse table,
+// or LANG_UNK when the id is not present there.
+ELanguage TOldLanguageEncoder::FromOld1(TOldLanguageEncoder::TLanguageId l) {
+    const auto& reverse = Default<TNewLanguageEnumToOldLanguageHelper>().Reverse;
+    const auto found = reverse.find(l);
+    return Y_UNLIKELY(found == reverse.end()) ? LANG_UNK : found->second;
+}
diff --git a/library/cpp/langmask/langmask.h b/library/cpp/langmask/langmask.h
new file mode 100644
index 0000000000..96608bbe21
--- /dev/null
+++ b/library/cpp/langmask/langmask.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <library/cpp/enumbitset/enumbitset.h>
+#include <library/cpp/langs/langs.h>
+
+#include <util/generic/fwd.h>
+
+// Bit set of ELanguage values; TSfEnumBitSet is parameterized with
+// LANG_UNK + 1 and LANG_MAX as its bounds.
+using TLangMask = TSfEnumBitSet<ELanguage, static_cast<ELanguage>(LANG_UNK + 1), LANG_MAX>;
+
+// Useful language sets
+namespace NLanguageMasks {
+    // Table from a script to the mask of languages using it.
+    using TScriptMap = THashMap<EScript, TLangMask>;
+
+    const TScriptMap& ScriptMap();
+
+    // LANG_ENG, LANG_RUS and LANG_UKR.
+    inline const TLangMask& BasicLanguages() {
+        static const TLangMask mask(LANG_ENG, LANG_RUS, LANG_UKR);
+        return mask;
+    }
+
+    // BasicLanguages() plus LANG_KAZ, LANG_BEL and LANG_TAT.
+    inline const TLangMask& DefaultRequestLanguages() {
+        static const TLangMask mask = BasicLanguages() | TLangMask(LANG_KAZ, LANG_BEL, LANG_TAT);
+        return mask;
+    }
+
+    // Every bit of the mask except LANG_BASIC_ENG and LANG_BASIC_RUS.
+    inline const TLangMask& AllLanguages() {
+        static const TLangMask mask = ~TLangMask() & ~TLangMask(LANG_BASIC_ENG, LANG_BASIC_RUS);
+        return mask;
+    }
+
+    // Fixed short list: LANG_RUS, LANG_UKR and LANG_BEL.
+    inline const TLangMask& CyrillicLanguages() {
+        static const TLangMask mask = TLangMask(LANG_RUS, LANG_UKR, LANG_BEL);
+        return mask;
+    }
+
+    const TLangMask& CyrillicLanguagesExt();
+    const TLangMask& LatinLanguages();
+
+    // LANG_RUS, LANG_ENG, LANG_UKR, LANG_TUR, LANG_BASIC_RUS and LANG_BASIC_ENG.
+    inline const TLangMask& LemmasInIndex() {
+        static const TLangMask mask = TLangMask(LANG_RUS, LANG_ENG, LANG_UKR, LANG_TUR) |
+                                      TLangMask(LANG_BASIC_RUS, LANG_BASIC_ENG);
+        return mask;
+    }
+
+    // Complement of LemmasInIndex().
+    inline const TLangMask& NoBastardsInSearch() {
+        static const TLangMask mask = ~LemmasInIndex();
+        return mask;
+    }
+
+    TLangMask SameScriptLanguages(TLangMask mask);
+
+    // Removes from `mask` every language that shares a script with a language in `by`.
+    inline TLangMask RestrictLangMaskWithSameScripts(const TLangMask& mask, const TLangMask& by) {
+        const TLangMask excluded = SameScriptLanguages(by);
+        return mask & ~excluded;
+    }
+
+    const TLangMask& SameScriptLanguages(EScript scr);
+
+    // Languages that share a script with `mask` but are not in `mask` themselves.
+    inline TLangMask OtherSameScriptLanguages(const TLangMask& mask) {
+        const TLangMask sameScript = SameScriptLanguages(mask);
+        return ~mask & sameScript;
+    }
+
+    // `list` is a string of language names separated by ','.
+    TLangMask CreateFromList(const TString& list); // throws exception on unknown name
+    TLangMask SafeCreateFromList(const TString& list); // ignore unknown names
+
+    TString ToString(const TLangMask& langMask);
+
+}
+
+#define LI_BASIC_LANGUAGES NLanguageMasks::BasicLanguages() // LI_* macros are shorthands that expand to calls of the NLanguageMasks accessors
+#define LI_DEFAULT_REQUEST_LANGUAGES NLanguageMasks::DefaultRequestLanguages()
+#define LI_ALL_LANGUAGES NLanguageMasks::AllLanguages()
+#define LI_CYRILLIC_LANGUAGES NLanguageMasks::CyrillicLanguages() // fixed list: LANG_RUS, LANG_UKR, LANG_BEL
+#define LI_CYRILLIC_LANGUAGES_EXT NLanguageMasks::CyrillicLanguagesExt() // taken from the script table entry for SCRIPT_CYRILLIC
+#define LI_LATIN_LANGUAGES NLanguageMasks::LatinLanguages() // taken from the script table entry for SCRIPT_LATIN
+
+// Casing and composition of a word. Used in bitwise unions.
+using TCharCategory = long;
+
+// Each category is a distinct single bit so that values can be OR-ed together.
+constexpr TCharCategory CC_EMPTY               = 0x0000;
+constexpr TCharCategory CC_ALPHA               = 0x0001;
+constexpr TCharCategory CC_NMTOKEN             = 0x0002;
+constexpr TCharCategory CC_NUMBER              = 0x0004;
+constexpr TCharCategory CC_NUTOKEN             = 0x0008;
+// Beware: CC_ASCII .. CC_TITLECASE shall occupy bits 4 to 6. Don't move them.
+constexpr TCharCategory CC_ASCII               = 0x0010;
+constexpr TCharCategory CC_NONASCII            = 0x0020;
+constexpr TCharCategory CC_TITLECASE           = 0x0040;
+constexpr TCharCategory CC_UPPERCASE           = 0x0080;
+constexpr TCharCategory CC_LOWERCASE           = 0x0100;
+constexpr TCharCategory CC_MIXEDCASE           = 0x0200;
+constexpr TCharCategory CC_COMPOUND            = 0x0400;
+constexpr TCharCategory CC_HAS_DIACRITIC       = 0x0800;
+constexpr TCharCategory CC_DIFFERENT_ALPHABET  = 0x1000;
+
+// Union of all category bits above.
+constexpr TCharCategory CC_WHOLEMASK           = 0x1FFF;
+
+// Converts between ELanguage / TLangMask and the legacy representation in
+// which languages are bit flags packed into a single integer.
+struct TOldLanguageEncoder {
+    using TLanguageId = long;
+
+    // Legacy id of a single language.
+    static TLanguageId ToOld(ELanguage l);
+
+    // Language for a single legacy id.
+    static ELanguage FromOld1(TLanguageId l);
+
+    // OR-combination of the legacy ids of every language in `lm`.
+    static TLanguageId ToOld(const TLangMask& lm) {
+        TLanguageId packed = 0;
+        for (ELanguage lang : lm) {
+            packed |= ToOld(lang);
+        }
+        return packed;
+    }
+
+    // Rebuilds a mask from packed legacy flags. Bits 0x40 and 0x80 are
+    // ignored, and the highest bit of TLanguageId is never examined.
+    static TLangMask FromOld(TLanguageId lm) {
+        static const TLanguageId allLangMask = TLanguageId(-1) & ~(0x40 | 0x80);
+        static const size_t numBits = sizeof(TLanguageId) * CHAR_BIT;
+
+        lm &= allLangMask;
+
+        TLangMask mask;
+        for (size_t bit = 0; bit + 1 < numBits; ++bit) {
+            const TLanguageId id = TLanguageId(1) << bit;
+            if (lm & id) {
+                mask.SafeSet(FromOld1(id));
+            }
+        }
+        return mask;
+    }
+};
diff --git a/library/cpp/langmask/proto/langmask.proto b/library/cpp/langmask/proto/langmask.proto
new file mode 100644
index 0000000000..be23ecfbba
--- /dev/null
+++ b/library/cpp/langmask/proto/langmask.proto
@@ -0,0 +1,6 @@
+package NProto;
+
+message TLangMask {
+ repeated uint32 Bits = 1; // binary form of the mask; NOTE(review): presumably the mask packed into 32-bit words - confirm against the serialization code
+ optional string Names = 2; // human readable form; NOTE(review): presumably a list of language names - confirm the exact format
+}