diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/abbreviations.cpp | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/abbreviations.cpp')
-rw-r--r-- | library/cpp/tokenizer/abbreviations.cpp | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/abbreviations.cpp b/library/cpp/tokenizer/abbreviations.cpp new file mode 100644 index 0000000000..60108d908d --- /dev/null +++ b/library/cpp/tokenizer/abbreviations.cpp @@ -0,0 +1,207 @@ +#include "sentbreakfilter.h" + +static const char* COMMON_ABBREVIATIONS_NEVER_BREAK[] = { + "агр", + "акад", + "ал", + "алл", + "арх", + "асс", + "б-р", + "бол", + "бул", + "бульв", + "вл", + "верхн", + "вып", + "гг", + "ген", + "гр", + "деп", + "дер", + "дир", + "дор", + "доц", + "зав", + "зам", + "им", + "канд", + "каб", + "кв", + "кв-л", + "км", + "кн", + "корп", + "корр", + "кр", + "лит", + "маг", + "м-н", + "мех", + "мин", + "мкр", + "наб", + "напр", + "нов", + "нс", + "пав", + "пер", + "пер-к", + "пл", + "пос", + "пп", + "пр", + "пр-д", + "пр-зд", + "пр-т", + "пр-кт", + "просп", + "проф", + "ред", + "св", + "см", + "сов", + "спец", + "ср", + "ст", + "твц", + "тоц", + "трк", + "тц", + "тeх", + "техн", + "тов", + "тт", + "туп", + "укр", + "ул", + "чл", + "эт", + "co", + "corp", + "dr", + "inc", + "ltd", + "mr", + "mrs", + "ms", + "st", + "vs"}; + +static const char* DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_NEVER_BREAK[] = { + "б-р", + "кв-л", + "м-н", + "пер-к", + "пр-д", + "пр-зд", + "пр-т", + "пр-кт", + "т.е", +}; + +static const char* COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT[] = { + "авг", + "апр", + "влад", + "гл", + "дек", + "доб", + "ил", + "июл", + "июн", + "кг", + "кл", + "ком", + "кор", + "мар", + "мб", + "млн", + "млрд", + "моб", + "нояб", + "окт", + "оф", + "рис", + "род", + "руб", + "сен", + "сент", + "сот", + "стр", + "табл", + "тел", + "тыс", + "фев", + "шк", + "янв", + "no", + "pp", + "vol"}; + +static const char* DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT[] = { + "т/ф", + "тел/ф", +}; + +static const char* UKR_ABBREVIATIONS_NEVER_BREAK[] = { + "вул", + "ім", + "торг", + "тур"}; + +static const char* TUR_ABBREVIATIONS_NEVER_BREAK[] = { + "bld", + "blv", + "blvd", + "bul", + "cad", + "dk", + "doç", + "hz", + "inc", + "jr", + "kg", + "mah", + "mh", + "prof", + "sok", + "tel", + "tic", + "vb", + "yard", + "yrd"}; + +void TAbbreviationsDictionary::AddElements(THashSet<TUtf16String>& hashSet, + const char* elements[], + size_t size) { + size_t length = size / sizeof(char*); + for (size_t i = 0; i != length; ++i) { + TUtf16String str(UTF8ToWide(elements[i])); + Y_ASSERT(hashSet.find(str) == hashSet.end()); + hashSet.insert(str); + } +} + +TAbbreviationsDictionary::TAbbreviationsDictionary() { + AddElements(NeverBreakSets[LANG_UNK], + COMMON_ABBREVIATIONS_NEVER_BREAK, + sizeof(COMMON_ABBREVIATIONS_NEVER_BREAK)); + AddElements(DoubleSubtokenNeverBreakSets[LANG_UNK], + DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_NEVER_BREAK, + sizeof(DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_NEVER_BREAK)); + AddElements(DontBreakIfBeforeDigitSets[LANG_UNK], + COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT, + sizeof(COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT)); + AddElements(DoubleSubtokenDontBreakIfBeforeDigitSets[LANG_UNK], + DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT, + sizeof(DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT)); + + AddElements(NeverBreakSets[LANG_UKR], + UKR_ABBREVIATIONS_NEVER_BREAK, + sizeof(UKR_ABBREVIATIONS_NEVER_BREAK)); + + AddElements(NeverBreakSets[LANG_TUR], + TUR_ABBREVIATIONS_NEVER_BREAK, + sizeof(TUR_ABBREVIATIONS_NEVER_BREAK)); +} |