diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/langmask | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/langmask')
-rw-r--r-- | library/cpp/langmask/README.md | 4 | ||||
-rw-r--r-- | library/cpp/langmask/langmask.cpp | 149 | ||||
-rw-r--r-- | library/cpp/langmask/langmask.h | 120 | ||||
-rw-r--r-- | library/cpp/langmask/proto/langmask.proto | 6 |
4 files changed, 279 insertions, 0 deletions
diff --git a/library/cpp/langmask/README.md b/library/cpp/langmask/README.md
new file mode 100644
index 0000000000..efb6cb9313
--- /dev/null
+++ b/library/cpp/langmask/README.md
@@ -0,0 +1,4 @@
+Здесь представлен класс [`TLangMask`](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langmask/langmask.h) для битовой маски [enum'ов с языками](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langs).
+
+Определено несколько [стандартных языковых масок](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langmask/langmask.h?rev=r6913473#L64-69), в основном используемых в индексаторе.
+Имеются [функции](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/langmask/serialization/langmask.h) для сериализации/десериализации языковых масок.
diff --git a/library/cpp/langmask/langmask.cpp b/library/cpp/langmask/langmask.cpp
new file mode 100644
index 0000000000..19c3426864
--- /dev/null
+++ b/library/cpp/langmask/langmask.cpp
@@ -0,0 +1,149 @@
+#include "langmask.h"
+
+#include <util/generic/hash.h>
+#include <util/generic/singleton.h>
+#include <util/string/split.h>
+#include <util/string/strip.h>
+#include <util/system/compiler.h>
+
+#include <array>
+
+namespace NLanguageMasks {
+    namespace {
+        struct TScriptMapX: public TScriptMap { // script -> language-mask map, filled once by the constructor
+            TScriptMapX() {
+                for (size_t i = 0; i != LANG_MAX; ++i) {
+                    ELanguage language = static_cast<ELanguage>(i);
+                    if (!UnknownLanguage(language))
+                        (*this)[ScriptByLanguage(language)].SafeSet(language);
+                }
+            }
+        };
+    }
+
+    const TScriptMap& ScriptMap() { // shared TScriptMapX instance obtained through Singleton<>
+        return *Singleton<TScriptMapX>();
+    }
+
+    const TLangMask& CyrillicLanguagesExt() { // every language mapped to SCRIPT_CYRILLIC
+        return SameScriptLanguages(SCRIPT_CYRILLIC); // checked lookup: empty mask instead of dereferencing end()
+    }
+
+    const TLangMask& LatinLanguages() { // every language mapped to SCRIPT_LATIN
+        return SameScriptLanguages(SCRIPT_LATIN); // checked lookup: empty mask instead of dereferencing end()
+    }
+
+    const TLangMask& SameScriptLanguages(EScript scr) { // mask stored for scr, or an empty mask when scr is not in the map
+        static const TLangMask empty;
+        TScriptMap::const_iterator it = ScriptMap().find(scr);
+        return ScriptMap().end() == it ? empty : it->second;
+    }
+
+    TLangMask SameScriptLanguages(TLangMask src) { // union of the per-script masks of the languages in src
+        TLangMask dst;
+        for (auto lg : src) {
+            TScriptMap::const_iterator it = ScriptMap().find(ScriptByLanguage(lg));
+            if (ScriptMap().end() != it) {
+                dst |= it->second;
+                src &= ~it->second; // don't need others using the same script
+            }
+        }
+        return dst;
+    }
+
+    template <typename T>
+    TLangMask CreateFromListImpl(const TString& list, T langGetter) { // langGetter maps one stripped name to ELanguage
+        TLangMask result;
+        TVector<TString> langVector;
+        StringSplitter(list).Split(',').SkipEmpty().Collect(&langVector);
+        for (const auto& i : langVector) {
+            ELanguage lang = langGetter(Strip(i).data());
+            if (lang == LANG_MAX) // LANG_MAX from the getter is treated as an unknown name
+                ythrow yexception() << "Unknown language: " << i;
+            result.SafeSet(lang);
+        }
+        return result;
+    }
+
+    TLangMask CreateFromList(const TString& list) { // name lookup via LanguageByNameStrict
+        return CreateFromListImpl(list, LanguageByNameStrict);
+    }
+
+    TLangMask SafeCreateFromList(const TString& list) { // name lookup via LanguageByName
+        return CreateFromListImpl(list, LanguageByName);
+    }
+
+    TString ToString(const TLangMask& langMask) { // comma-joined names; an empty mask gives the name of LANG_UNK
+        if (langMask.Empty())
+            return NameByLanguage(LANG_UNK);
+        TString result;
+        for (auto lang : langMask) {
+            if (!!result)
+                result += ",";
+            result += NameByLanguage(lang);
+        }
+        return result;
+    }
+}
+
+namespace {
+    struct TNewLanguageEnumToOldLanguageHelper { // lookup tables between ELanguage and the old bit-flag language ids
+        TNewLanguageEnumToOldLanguageHelper() {
+            static const TOldLanguageEncoder::TLanguageId LI_UNKNOWN = 0x00000000; // special code - shall be zero
+            static const TOldLanguageEncoder::TLanguageId LI_ENGLISH = 0x00000001;
+            static const TOldLanguageEncoder::TLanguageId LI_RUSSIAN = 0x00000002;
+            static const TOldLanguageEncoder::TLanguageId LI_POLISH = 0x00000004;
+            static const TOldLanguageEncoder::TLanguageId LI_UKRAINIAN = 0x00000008;
+            static const TOldLanguageEncoder::TLanguageId LI_GERMAN = 0x00000010;
+            static const TOldLanguageEncoder::TLanguageId LI_FRENCH = 0x00000020;
+            // Beware: a hole should be left at 0x40 - 0x80,
+            // to prevent overlap with CC_UPPERCASE / CC_TITLECASE
+            static const TOldLanguageEncoder::TLanguageId LI_HUNGARIAN = 0x00000100;
+            // static const TOldLanguageEncoder::TLanguageId LI_UKRAINIAN_ABBYY = 0x00000200;
+            static const TOldLanguageEncoder::TLanguageId LI_ITALIAN = 0x00000400;
+            static const TOldLanguageEncoder::TLanguageId LI_BELORUSSIAN = 0x00000800;
+            static const TOldLanguageEncoder::TLanguageId LI_KAZAKH = 0x00008000;
+
+            Direct[LANG_UNK] = LI_UNKNOWN;
+            Direct[LANG_ENG] = LI_ENGLISH;
+            Direct[LANG_RUS] = LI_RUSSIAN;
+            Direct[LANG_POL] = LI_POLISH;
+            Direct[LANG_UKR] = LI_UKRAINIAN;
+            Direct[LANG_GER] = LI_GERMAN;
+            Direct[LANG_FRE] = LI_FRENCH;
+            Direct[LANG_HUN] = LI_HUNGARIAN;
+            // Direct[] = LI_UKRAINIAN_ABBYY;
+            Direct[LANG_ITA] = LI_ITALIAN;
+            Direct[LANG_BEL] = LI_BELORUSSIAN;
+            Direct[LANG_KAZ] = LI_KAZAKH;
+
+            for (auto i = Direct.size(); i > 0; --i) { // descending: for a repeated id the lowest index is written last and wins
+                Reverse[Direct[i - 1]] = static_cast<ELanguage>(i - 1);
+            }
+
+            Y_ENSURE(LANG_UNK == Reverse.find(LI_UNKNOWN)->second, "Must be equal");
+        }
+
+        THashMap< ::TOldLanguageEncoder::TLanguageId, ELanguage> Reverse; // old id -> language
+        std::array< ::TOldLanguageEncoder::TLanguageId, static_cast<size_t>(LANG_MAX)> Direct{}; // language -> old id; value-initialized so unassigned slots are 0 (LI_UNKNOWN)
+    };
+}
+
+TOldLanguageEncoder::TLanguageId TOldLanguageEncoder::ToOld(ELanguage l) { // values outside the table are treated as LANG_UNK
+    const auto& helper = Default<TNewLanguageEnumToOldLanguageHelper>();
+    if (Y_UNLIKELY(static_cast<size_t>(l) >= helper.Direct.size())) {
+        l = LANG_UNK;
+    }
+
+    return helper.Direct[l];
+}
+
+ELanguage TOldLanguageEncoder::FromOld1(TOldLanguageEncoder::TLanguageId l) { // ids absent from the reverse table give LANG_UNK
+    const auto& helper = Default<TNewLanguageEnumToOldLanguageHelper>();
+    const auto it = helper.Reverse.find(l);
+    if (Y_UNLIKELY(it == helper.Reverse.end())) {
+        return LANG_UNK;
+    }
+
+    return it->second;
+}
diff --git a/library/cpp/langmask/langmask.h b/library/cpp/langmask/langmask.h
new file mode 100644
index 0000000000..96608bbe21
--- /dev/null
+++ b/library/cpp/langmask/langmask.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <library/cpp/enumbitset/enumbitset.h>
+#include <library/cpp/langs/langs.h>
+
+#include <util/generic/fwd.h>
+
+typedef TSfEnumBitSet<ELanguage, static_cast<ELanguage>(LANG_UNK + 1), LANG_MAX> TLangMask;
+
+// Useful language sets
+namespace NLanguageMasks {
+    using TScriptMap = THashMap<EScript, TLangMask>;
+
+    const TScriptMap& ScriptMap();
+
+    inline const TLangMask& BasicLanguages() {
+        const static TLangMask ret(LANG_ENG, LANG_RUS, LANG_UKR);
+        return ret;
+    }
+    inline const TLangMask& DefaultRequestLanguages() {
+        const static TLangMask ret = BasicLanguages() | TLangMask(LANG_KAZ, LANG_BEL, LANG_TAT);
+        return ret;
+    }
+    inline const TLangMask& AllLanguages() {
+        const static TLangMask ret = ~TLangMask() & ~TLangMask(LANG_BASIC_ENG, LANG_BASIC_RUS);
+        return ret;
+    }
+    inline const TLangMask& CyrillicLanguages() {
+        const static TLangMask ret = TLangMask(LANG_RUS, LANG_UKR, LANG_BEL);
+        return ret;
+    }
+    const TLangMask& CyrillicLanguagesExt();
+    const TLangMask& LatinLanguages();
+    inline const TLangMask& LemmasInIndex() {
+        const static TLangMask ret = TLangMask(LANG_RUS, LANG_ENG, LANG_UKR, LANG_TUR) |
+                                     TLangMask(LANG_BASIC_RUS, LANG_BASIC_ENG);
+        return ret;
+    }
+    inline const TLangMask& NoBastardsInSearch() { // complement of LemmasInIndex()
+        const static TLangMask ret = ~LemmasInIndex();
+        return ret;
+    }
+
+    TLangMask SameScriptLanguages(TLangMask mask);
+
+    inline TLangMask RestrictLangMaskWithSameScripts(const TLangMask& mask, const TLangMask& by) { // drops from mask every language sharing a script with a language in by
+        return mask & ~SameScriptLanguages(by);
+    }
+
+    const TLangMask& SameScriptLanguages(EScript scr);
+
+    inline TLangMask OtherSameScriptLanguages(const TLangMask& mask) { // same-script languages that are not already in mask
+        return ~mask & SameScriptLanguages(mask);
+    }
+
+    // The list is a string of language names separated by ','.
+    TLangMask CreateFromList(const TString& list);     // throws exception on unknown name
+    TLangMask SafeCreateFromList(const TString& list); // ignore unknown names
+
+    TString ToString(const TLangMask& langMask);
+
+}
+
+#define LI_BASIC_LANGUAGES NLanguageMasks::BasicLanguages()
+#define LI_DEFAULT_REQUEST_LANGUAGES NLanguageMasks::DefaultRequestLanguages()
+#define LI_ALL_LANGUAGES NLanguageMasks::AllLanguages()
+#define LI_CYRILLIC_LANGUAGES NLanguageMasks::CyrillicLanguages()
+#define LI_CYRILLIC_LANGUAGES_EXT NLanguageMasks::CyrillicLanguagesExt()
+#define LI_LATIN_LANGUAGES NLanguageMasks::LatinLanguages()
+
+// Casing and composition of a word. Used in bitwise unions.
+using TCharCategory = long;
+const TCharCategory CC_EMPTY = 0x0000;
+const TCharCategory CC_ALPHA = 0x0001;
+const TCharCategory CC_NMTOKEN = 0x0002;
+const TCharCategory CC_NUMBER = 0x0004;
+const TCharCategory CC_NUTOKEN = 0x0008;
+// Beware: CC_ASCII .. CC_TITLECASE shall occupy bits 4 to 6. Don't move them.
+const TCharCategory CC_ASCII = 0x0010;
+const TCharCategory CC_NONASCII = 0x0020;
+const TCharCategory CC_TITLECASE = 0x0040;
+const TCharCategory CC_UPPERCASE = 0x0080;
+const TCharCategory CC_LOWERCASE = 0x0100;
+const TCharCategory CC_MIXEDCASE = 0x0200;
+const TCharCategory CC_COMPOUND = 0x0400;
+const TCharCategory CC_HAS_DIACRITIC = 0x0800;
+const TCharCategory CC_DIFFERENT_ALPHABET = 0x1000;
+
+const TCharCategory CC_WHOLEMASK = 0x1FFF;
+
+struct TOldLanguageEncoder {
+    typedef long TLanguageId;
+
+public:
+    static TLanguageId ToOld(ELanguage l);
+
+    static ELanguage FromOld1(TLanguageId l);
+
+    static TLanguageId ToOld(const TLangMask& lm) { // ORs together the old ids of every language in lm
+        TLanguageId ret = 0;
+        for (ELanguage lg : lm) {
+            TLanguageId id = ToOld(lg);
+            ret |= id;
+        }
+        return ret;
+    }
+
+    static TLangMask FromOld(TLanguageId lm) { // one FromOld1() lookup per set bit of lm
+        static const TLanguageId allLangMask = TLanguageId(-1) & ~(0x40 | 0x80); // bits 0x40 and 0x80 are masked out before decoding
+        static const size_t numBits = sizeof(TLanguageId) * CHAR_BIT;
+        TLangMask ret;
+        lm &= allLangMask;
+        for (size_t i = 1; i < numBits; ++i) { // i - 1 is the bit index; the topmost bit is never tested
+            TLanguageId id = TLanguageId(1) << (i - 1);
+            if (lm & id)
+                ret.SafeSet(FromOld1(id));
+        }
+        return ret;
+    }
+};
diff --git a/library/cpp/langmask/proto/langmask.proto b/library/cpp/langmask/proto/langmask.proto
new file mode 100644
index 0000000000..be23ecfbba
--- /dev/null
+++ b/library/cpp/langmask/proto/langmask.proto
@@ -0,0 +1,6 @@
+package NProto;
+
+message TLangMask {
+    repeated uint32 Bits = 1; // binary
+    optional string Names = 2; // human readable
+} |