aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/abbreviations.cpp
diff options
context:
space:
mode:
author qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
committer qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
commit 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/abbreviations.cpp
parent 332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/abbreviations.cpp')
-rw-r--r-- library/cpp/tokenizer/abbreviations.cpp 207
1 files changed, 207 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/abbreviations.cpp b/library/cpp/tokenizer/abbreviations.cpp
new file mode 100644
index 0000000000..60108d908d
--- /dev/null
+++ b/library/cpp/tokenizer/abbreviations.cpp
@@ -0,0 +1,207 @@
+#include "sentbreakfilter.h"
+
+// Abbreviations loaded into NeverBreakSets[LANG_UNK] by the
+// TAbbreviationsDictionary constructor.
+//
+// Entries are UTF-8, lowercase, and written without the trailing period.
+// Every Cyrillic entry must be spelled with Cyrillic letters only: a Latin
+// look-alike (e.g. 'e' U+0065 in place of 'е' U+0435) produces a string that
+// compares unequal to the token found in real text.
+// The hyphenated entries are also listed in
+// DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_NEVER_BREAK.
+// Entries must be unique: AddElements asserts on a repeated string.
+static const char* COMMON_ABBREVIATIONS_NEVER_BREAK[] = {
+    "агр",
+    "акад",
+    "ал",
+    "алл",
+    "арх",
+    "асс",
+    "б-р",
+    "бол",
+    "бул",
+    "бульв",
+    "вл",
+    "верхн",
+    "вып",
+    "гг",
+    "ген",
+    "гр",
+    "деп",
+    "дер",
+    "дир",
+    "дор",
+    "доц",
+    "зав",
+    "зам",
+    "им",
+    "канд",
+    "каб",
+    "кв",
+    "кв-л",
+    "км",
+    "кн",
+    "корп",
+    "корр",
+    "кр",
+    "лит",
+    "маг",
+    "м-н",
+    "мех",
+    "мин",
+    "мкр",
+    "наб",
+    "напр",
+    "нов",
+    "нс",
+    "пав",
+    "пер",
+    "пер-к",
+    "пл",
+    "пос",
+    "пп",
+    "пр",
+    "пр-д",
+    "пр-зд",
+    "пр-т",
+    "пр-кт",
+    "просп",
+    "проф",
+    "ред",
+    "св",
+    "см",
+    "сов",
+    "спец",
+    "ср",
+    "ст",
+    "твц",
+    "тоц",
+    "трк",
+    "тц",
+    // All-Cyrillic spelling; the previous entry had a Latin 'e' in the middle.
+    "тех",
+    "техн",
+    "тов",
+    "тт",
+    "туп",
+    "укр",
+    "ул",
+    "чл",
+    "эт",
+    // Latin-script abbreviations.
+    "co",
+    "corp",
+    "dr",
+    "inc",
+    "ltd",
+    "mr",
+    "mrs",
+    "ms",
+    "st",
+    "vs"};
+
+// Abbreviations loaded into DoubleSubtokenNeverBreakSets[LANG_UNK] by the
+// TAbbreviationsDictionary constructor. Every entry contains an inner
+// separator ('-' or '.'); entries are UTF-8 without the trailing period.
+static const char* DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_NEVER_BREAK[] = {
+    "б-р", "кв-л", "м-н", "пер-к",
+    "пр-д", "пр-зд", "пр-т", "пр-кт",
+    "т.е",
+};
+
+// Abbreviations loaded into DontBreakIfBeforeDigitSets[LANG_UNK] by the
+// TAbbreviationsDictionary constructor. Entries are UTF-8, lowercase, without
+// the trailing period; Cyrillic entries come first, Latin-script ones last.
+static const char* COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT[] = {
+    "авг", "апр", "влад", "гл", "дек", "доб",
+    "ил", "июл", "июн",
+    "кг", "кл", "ком", "кор",
+    "мар", "мб", "млн", "млрд", "моб",
+    "нояб", "окт", "оф",
+    "рис", "род", "руб",
+    "сен", "сент", "сот", "стр",
+    "табл", "тел", "тыс",
+    "фев", "шк", "янв",
+    "no", "pp", "vol"};
+
+// Abbreviations loaded into DoubleSubtokenDontBreakIfBeforeDigitSets[LANG_UNK]
+// by the TAbbreviationsDictionary constructor. Both entries contain a '/'.
+static const char* DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT[] = {
+    "т/ф", "тел/ф",
+};
+
+// Abbreviations loaded into NeverBreakSets[LANG_UKR] by the
+// TAbbreviationsDictionary constructor. Entries are UTF-8, lowercase, without
+// the trailing period.
+static const char* UKR_ABBREVIATIONS_NEVER_BREAK[] = {
+    "вул", "ім", "торг", "тур"};
+
+// Abbreviations loaded into NeverBreakSets[LANG_TUR] by the
+// TAbbreviationsDictionary constructor. Entries are UTF-8, lowercase, without
+// the trailing period.
+static const char* TUR_ABBREVIATIONS_NEVER_BREAK[] = {
+    "bld", "blv", "blvd", "bul",
+    "cad", "dk", "doç",
+    "hz", "inc", "jr", "kg",
+    "mah", "mh",
+    "prof", "sok",
+    "tel", "tic",
+    "vb", "yard", "yrd"};
+
+// Converts every UTF-8 entry of `elements` to UTF-16 and inserts it into
+// `hashSet`.
+//
+// `size` is the size of the table in BYTES (callers pass sizeof(table)), not
+// the number of entries; the entry count is derived from it here.
+// Entries are expected to be unique within one set: Y_ASSERT checks that the
+// string is not present before it is inserted.
+void TAbbreviationsDictionary::AddElements(THashSet<TUtf16String>& hashSet,
+                                           const char* elements[],
+                                           size_t size) {
+    const char* const* const stop = elements + size / sizeof(char*);
+    for (const char* const* cursor = elements; cursor != stop; ++cursor) {
+        TUtf16String wide(UTF8ToWide(*cursor));
+        Y_ASSERT(hashSet.end() == hashSet.find(wide));
+        hashSet.insert(wide);
+    }
+}
+
+// Fills the abbreviation sets from the static tables defined in this file.
+TAbbreviationsDictionary::TAbbreviationsDictionary() {
+    // AddElements expects the table size in bytes. Taking the table by
+    // reference keeps its array type, so sizeof yields the whole table rather
+    // than the size of a pointer.
+    auto load = [&](THashSet<TUtf16String>& target, auto& table) {
+        AddElements(target, table, sizeof(table));
+    };
+
+    // Tables registered under LANG_UNK.
+    load(NeverBreakSets[LANG_UNK], COMMON_ABBREVIATIONS_NEVER_BREAK);
+    load(DoubleSubtokenNeverBreakSets[LANG_UNK], DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_NEVER_BREAK);
+    load(DontBreakIfBeforeDigitSets[LANG_UNK], COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT);
+    load(DoubleSubtokenDontBreakIfBeforeDigitSets[LANG_UNK], DOUBLE_SUBTOKEN_COMMON_ABBREVIATIONS_DONT_BREAK_IF_BEFORE_DIGIT);
+
+    // Tables registered under a specific language.
+    load(NeverBreakSets[LANG_UKR], UKR_ABBREVIATIONS_NEVER_BREAK);
+    load(NeverBreakSets[LANG_TUR], TUR_ABBREVIATIONS_NEVER_BREAK);
+}