diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/wordlistreader | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/wordlistreader')
-rw-r--r-- | library/cpp/wordlistreader/README.md | 3 | ||||
-rw-r--r-- | library/cpp/wordlistreader/wordlistreader.cpp | 62 | ||||
-rw-r--r-- | library/cpp/wordlistreader/wordlistreader.h | 43 |
3 files changed, 108 insertions, 0 deletions
diff --git a/library/cpp/wordlistreader/README.md b/library/cpp/wordlistreader/README.md new file mode 100644 index 0000000000..7521d88662 --- /dev/null +++ b/library/cpp/wordlistreader/README.md @@ -0,0 +1,3 @@ +Вспомогательная библиотека для работы с текстовыми файлами, разбитыми на разделы по языку. + +Используется в [библиотеке](https://a.yandex-team.ru/arc/trunk/arcadia/library/cpp/stopwords) для работы со стоп-словами ([пример файла](https://a.yandex-team.ru/arc/trunk/arcadia/search/wizard/data/wizard/language/stopword.lst)) и в морфологическом [дециматоре](https://a.yandex-team.ru/arc/trunk/arcadia/kernel/lemmer/core/decimator.h) ([пример файла](https://a.yandex-team.ru/arc/trunk/arcadia/kernel/lemmer/context/default_decimator/default_decimator.lst)). diff --git a/library/cpp/wordlistreader/wordlistreader.cpp b/library/cpp/wordlistreader/wordlistreader.cpp new file mode 100644 index 0000000000..9cf6d86c00 --- /dev/null +++ b/library/cpp/wordlistreader/wordlistreader.cpp @@ -0,0 +1,62 @@ +#include "wordlistreader.h" + +#include <library/cpp/charset/wide.h> +#include <library/cpp/langs/langs.h> + +#include <library/cpp/charset/recyr.hh> +#include <util/string/cast.h> +#include <util/generic/yexception.h> +#include <util/string/vector.h> +#include <util/string/split.h> + +void TWordListReader::ProcessLine(const TString& line) { + if (line.find('[') == 0 && line.find(']') != TString::npos) { + size_t endpos = line.find(']'); + TString langname = line.substr(1, endpos - 1); + LangCode = LanguageByName(langname); + if (LangCode != LANG_UNK) { + SkippingByError = false; + } else { + Cerr << "Unknown language name: " << langname.c_str() << Endl; + SkippingByError = true; + } + } else if (!SkippingByError) { + TUtf16String recodedLine = CharToWide(line, Encoding); + ParseLine(recodedLine, LangCode, Version); + } +} + +void TWordListReader::ReadDataFile(IInputStream& src) { + // Read header for version and encoding + LangCode = LANG_UNK; + Encoding = CODES_YANDEX; + Version = 0; + SkippingByError = false; + + TString line; + while (src.ReadLine(line)) { + if (line[0] == '#') + continue; // comment + TVector<TString> tokens = StringSplitter(line).SplitBySet(" \t\r\n:,").SkipEmpty(); + if (tokens.size() == 2) { + if (stricmp(tokens[0].c_str(), "version") == 0) { + Version = FromString<int>(tokens[1]); + continue; + } else if (stricmp(tokens[0].c_str(), "encoding") == 0) { + Encoding = CharsetByName(tokens[1].c_str()); + if (Encoding == CODES_UNKNOWN) + ythrow yexception() << "Invalid encoding name"; + continue; + } + } + break; + } + + // Read the body + ProcessLine(line); + while (src.ReadLine(line)) { + if (line[0] == '#') + continue; // skip comments + ProcessLine(line); + } +} diff --git a/library/cpp/wordlistreader/wordlistreader.h b/library/cpp/wordlistreader/wordlistreader.h new file mode 100644 index 0000000000..03abe78fe7 --- /dev/null +++ b/library/cpp/wordlistreader/wordlistreader.h @@ -0,0 +1,43 @@ +#pragma once + +#include <util/generic/string.h> +#include <library/cpp/charset/codepage.h> +#include <util/stream/output.h> +#include <util/stream/file.h> + +#include <library/cpp/langmask/langmask.h> + +// Mix-in class for loading configuration files built of language sections. Handles version, encoding, +// comments, and language section switching; delegates actual processing to derived classes +// via ParseLine() function (pure virtual). + +class TWordListReader { +private: + ELanguage LangCode; + ECharset Encoding; + int Version; + bool SkippingByError; + +public: + TWordListReader() + : LangCode(LANG_UNK) + , Encoding(CODES_YANDEX) + , Version(0) + , SkippingByError(false) + { + } + virtual ~TWordListReader() { + } + +protected: + virtual void ParseLine(const TUtf16String& line, ELanguage langcode, int version) = 0; + + void ReadDataFile(const char* filename) { + TBuffered<TUnbufferedFileInput> src(4096, filename); + ReadDataFile(src); + } + void ReadDataFile(IInputStream& src); + +private: + void ProcessLine(const TString& line); +}; |