aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/wordlistreader
diff options
context:
space:
mode:
author qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
committer qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
commit 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/wordlistreader
parent 332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/wordlistreader')
-rw-r--r-- library/cpp/wordlistreader/README.md | 3
-rw-r--r-- library/cpp/wordlistreader/wordlistreader.cpp | 62
-rw-r--r-- library/cpp/wordlistreader/wordlistreader.h | 43
3 files changed, 108 insertions, 0 deletions
diff --git a/library/cpp/wordlistreader/README.md b/library/cpp/wordlistreader/README.md
new file mode 100644
index 0000000000..7521d88662
--- /dev/null
+++ b/library/cpp/wordlistreader/README.md
@@ -0,0 +1,3 @@
+Вспомогательная библиотека для работы с текстовыми файлами, разбитыми на разделы по языку.
+
+Используется в [библиотеке](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/stopwords) для работы со стоп-словами ([пример файла](https://a.yandex-team.ru/arc/trunk/arcadia/search/wizard/data/wizard/language/stopword.lst)) и в морфологическом [дециматоре](https://a.yandex-team.ru/arc/trunk/arcadia/kernel/lemmer/core/decimator.h) ([пример файла](https://a.yandex-team.ru/arc/trunk/arcadia/kernel/lemmer/context/default_decimator/default_decimator.lst)).
diff --git a/library/cpp/wordlistreader/wordlistreader.cpp b/library/cpp/wordlistreader/wordlistreader.cpp
new file mode 100644
index 0000000000..9cf6d86c00
--- /dev/null
+++ b/library/cpp/wordlistreader/wordlistreader.cpp
@@ -0,0 +1,62 @@
+#include "wordlistreader.h"
+
+#include <library/cpp/charset/wide.h>
+#include <library/cpp/langs/langs.h>
+
+#include <library/cpp/charset/recyr.hh>
+#include <util/string/cast.h>
+#include <util/generic/yexception.h>
+#include <util/string/vector.h>
+#include <util/string/split.h>
+
+// Handles one line of the file body.
+//
+// A line that begins with '[' and contains ']' is treated as a section header: the text
+// between the opening bracket and the first ']' is looked up with LanguageByName().
+// A recognized name becomes the current language and turns skipping off; an unrecognized
+// one is reported on Cerr and turns skipping on. Any other line is recoded from the
+// current Encoding to UTF-16 and passed to ParseLine() with the current language and
+// version, unless skipping is on.
+void TWordListReader::ProcessLine(const TString& line) {
+    const size_t closingBracket = line.find(']');
+    const bool isSectionHeader = !line.empty() && line[0] == '[' && closingBracket != TString::npos;
+
+    if (!isSectionHeader) {
+        if (SkippingByError) {
+            return; // still inside a section whose language was not recognized
+        }
+        ParseLine(CharToWide(line, Encoding), LangCode, Version);
+        return;
+    }
+
+    // closingBracket >= 1 here because position 0 holds '['
+    const TString sectionName = line.substr(1, closingBracket - 1);
+    LangCode = LanguageByName(sectionName);
+    if (LangCode == LANG_UNK) {
+        Cerr << "Unknown language name: " << sectionName.c_str() << Endl;
+        SkippingByError = true;
+        return;
+    }
+    SkippingByError = false;
+}
+
+// Reads a complete sectioned word-list file from src.
+//
+// The reader state is reset first, so one object can load several files in turn.
+// The file may start with a header made of '#' comment lines and two-token lines
+// "version <int>" / "encoding <name>" (keywords are case-insensitive; tokens are separated
+// by spaces, tabs, CR, LF, ':' or ','). The first line that is neither of those ends the
+// header and is already the first body line. Every body line that does not start with '#'
+// is passed to ProcessLine().
+//
+// Throws yexception when the encoding name is not recognized by CharsetByName().
+void TWordListReader::ReadDataFile(IInputStream& src) {
+    // Same defaults as the constructor sets
+    LangCode = LANG_UNK;
+    Encoding = CODES_YANDEX;
+    Version = 0;
+    SkippingByError = false;
+
+    TString line;
+
+    // Read header for version and encoding
+    bool haveBodyLine = false;
+    while (src.ReadLine(line)) {
+        if (!line.empty() && line[0] == '#')
+            continue; // comment
+        TVector<TString> tokens = StringSplitter(line).SplitBySet(" \t\r\n:,").SkipEmpty();
+        if (tokens.size() == 2) {
+            if (stricmp(tokens[0].c_str(), "version") == 0) {
+                Version = FromString<int>(tokens[1]);
+                continue;
+            } else if (stricmp(tokens[0].c_str(), "encoding") == 0) {
+                Encoding = CharsetByName(tokens[1].c_str());
+                if (Encoding == CODES_UNKNOWN)
+                    ythrow yexception() << "Invalid encoding name";
+                continue;
+            }
+        }
+        // Not a header line: it is the first line of the body
+        haveBodyLine = true;
+        break;
+    }
+
+    // The stream ended while still in the header (empty file, or only comments and
+    // header lines). `line` does not hold a body line in that case, so it must not be
+    // handed to ProcessLine().
+    if (!haveBodyLine)
+        return;
+
+    // Read the body
+    ProcessLine(line);
+    while (src.ReadLine(line)) {
+        if (!line.empty() && line[0] == '#')
+            continue; // skip comments
+        ProcessLine(line);
+    }
+}
diff --git a/library/cpp/wordlistreader/wordlistreader.h b/library/cpp/wordlistreader/wordlistreader.h
new file mode 100644
index 0000000000..03abe78fe7
--- /dev/null
+++ b/library/cpp/wordlistreader/wordlistreader.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <library/cpp/charset/codepage.h>
+#include <util/stream/output.h>
+#include <util/stream/file.h>
+
+#include <library/cpp/langmask/langmask.h>
+
+// Mix-in base for loaders of configuration files that are divided into language sections.
+// The reader takes care of the version and encoding header, '#' comments and switching
+// between language sections; every remaining line is handed to the derived class through
+// the pure virtual ParseLine().
+
+class TWordListReader {
+private:
+    // Parsing state; ReadDataFile(IInputStream&) resets it to these values for each file.
+    ELanguage LangCode = LANG_UNK;    // language of the section being read
+    ECharset Encoding = CODES_YANDEX; // encoding used to recode lines to UTF-16
+    int Version = 0;                  // value of the "version" header line
+    bool SkippingByError = false;     // true while inside a section with an unknown language
+
+public:
+    TWordListReader() = default;
+    virtual ~TWordListReader() = default;
+
+protected:
+    // Called for each content line, already recoded to UTF-16, together with the language
+    // of the enclosing section and the file version.
+    virtual void ParseLine(const TUtf16String& line, ELanguage langcode, int version) = 0;
+
+    // Opens filename as a buffered file input (4096 is passed to TBuffered) and reads it.
+    void ReadDataFile(const char* filename) {
+        TBuffered<TUnbufferedFileInput> input(4096, filename);
+        ReadDataFile(input);
+    }
+
+    // Reads the whole stream: header first, then the sectioned body.
+    void ReadDataFile(IInputStream& src);
+
+private:
+    // Handles a single body line: a section header or a content line.
+    void ProcessLine(const TString& line);
+};