aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/stopwords
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
committerqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
commit22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
treebffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/stopwords
parent332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
downloadydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/stopwords')
-rw-r--r--library/cpp/stopwords/README.md1
-rw-r--r--library/cpp/stopwords/stopwords.cpp152
-rw-r--r--library/cpp/stopwords/stopwords.h143
3 files changed, 296 insertions, 0 deletions
diff --git a/library/cpp/stopwords/README.md b/library/cpp/stopwords/README.md
new file mode 100644
index 0000000000..cb48a4e345
--- /dev/null
+++ b/library/cpp/stopwords/README.md
@@ -0,0 +1 @@
+Библиотека для работы с файлами стоп-слов, заданными в [таком формате](https://a.yandex-team.ru/arc/trunk/arcadia/search/wizard/data/wizard/language/stopword.lst).
diff --git a/library/cpp/stopwords/stopwords.cpp b/library/cpp/stopwords/stopwords.cpp
new file mode 100644
index 0000000000..54186ca1bb
--- /dev/null
+++ b/library/cpp/stopwords/stopwords.cpp
@@ -0,0 +1,152 @@
+#include <algorithm>
+
+#include <library/cpp/charset/wide.h>
+#include <util/memory/tempbuf.h>
+#include <util/string/vector.h>
+#include <util/generic/yexception.h>
+#include <util/digest/murmur.h>
+#include <util/string/split.h>
+
+#include "stopwords.h"
+
+const EStickySide DefaultStickiness = STICK_RIGHT; // stickiness the list reader starts each language zone with, and the one used by the deprecated array initializer
+const TWordFilter TWordFilter::EmptyFilter; // default-constructed filter; none of the InitStopWordsList() overloads is called on it here
+
+// Functor that lower-cases one UTF-16 code unit with ToLower() and narrows the
+// result back to wchar16.
+struct TToLower {
+    wchar16 operator()(wchar16 c) {
+        const auto lowered = ToLower(c);
+        return static_cast<wchar16>(lowered);
+    }
+};
+
+// Case-insensitive hash of a null-terminated UTF-16 string: the string is
+// lower-cased into a temporary buffer and the buffer bytes are fed to
+// MurmurHash. A null pointer is hashed as an empty string.
+size_t TTCharStrIHashImpl(const wchar16* ptr) {
+    size_t len = 0;
+    if (ptr)
+        len = std::char_traits<wchar16>::length(ptr);
+    TCharTemp buf(len);
+    wchar16* const lowered = buf.Data();
+    for (size_t i = 0; i < len; ++i)
+        lowered[i] = static_cast<wchar16>(ToLower(ptr[i]));
+    return MurmurHash<size_t>(static_cast<void*>(lowered), len * sizeof(wchar16));
+}
+
+// Case-insensitive comparison of two null-terminated UTF-16 strings.
+// Two null pointers compare equal; a null pointer never equals a non-null one.
+bool TTCharStrIEqualToImpl(const wchar16* s1, const wchar16* s2) {
+    if (s1 == nullptr || s2 == nullptr)
+        return s1 == s2;
+    while (*s1 != 0 && *s2 != 0) {
+        if (ToLower(*s1) != ToLower(*s2))
+            return false;
+        ++s1;
+        ++s2;
+    }
+    // At least one string has ended; they match only if both ended here.
+    return *s1 == *s2;
+}
+
+namespace {
+    // Word list reader that stores every parsed word in the given wide stop
+    // word hash, tracking the language zone and stickiness currently in effect.
+    struct TReaderImpl: TWordListReader {
+        TWordFilter::TWideStopWordsHash* Res;
+        ELanguage CurrentLanguage;
+        EStickySide CurrentStickiness;
+
+        TReaderImpl(TWordFilter::TWideStopWordsHash* target)
+            : Res(target)
+            , CurrentLanguage(LANG_UNK)
+            , CurrentStickiness(DefaultStickiness)
+        {
+        }
+
+        void ParseLine(const TUtf16String& line, ELanguage langcode, int version) override;
+
+        // Thin forwarders to the base class loaders.
+        // NOTE(review): presumably they exist to make the loaders callable on
+        // TReaderImpl; the base class declaration is not visible here.
+        void ReadDataFile(IInputStream& in) {
+            TWordListReader::ReadDataFile(in);
+        }
+        void ReadDataFile(const char* path) {
+            TWordListReader::ReadDataFile(path);
+        }
+    };
+}
+
+// Loads the stop word list from the named file through a 4096-byte buffered
+// reader. A null or empty file name is a no-op that reports success and leaves
+// the current tables untouched.
+bool TWordFilter::InitStopWordsList(const char* filename) {
+    const bool hasName = filename != nullptr && *filename != '\0';
+    if (hasName) {
+        TBuffered<TUnbufferedFileInput> src(4096, filename);
+        return InitStopWordsList(src);
+    }
+    return true;
+}
+
+bool TWordFilter::InitStopWordsList(IInputStream& instream) { // loads the stop word list from a stream; the only return statement yields true
+ TermStopWordsList(); // drop any previously loaded tables
+ WordFilter.Reset(new TStopWordsHash);
+ WideWordFilter.Reset(new TWideStopWordsHash);
+ PlainWordFilter.Reset(new HashSet);
+ TReaderImpl reader(WideWordFilter.Get()); // the reader writes parsed words straight into the UTF-16 table
+ reader.ReadDataFile(instream);
+ InitNarrowFilter(); // derive the UTF-8 and single-byte tables from the UTF-16 one
+ return true;
+}
+
+// Deprecated initializer: takes a plain array of n C strings. Every non-null
+// entry is registered for all languages with the default stickiness.
+// Previously loaded tables are dropped first, so a null array leaves the
+// filter empty and returns false.
+bool TWordFilter::InitStopWordsList(const char** s, size_t n) {
+    TermStopWordsList();
+    if (s == nullptr)
+        return false;
+    WordFilter.Reset(new TStopWordsHash);
+    WideWordFilter.Reset(new TWideStopWordsHash);
+    PlainWordFilter.Reset(new HashSet);
+    const TStopWordInfo info(LI_ALL_LANGUAGES, DefaultStickiness);
+    for (const char** cur = s; cur != s + n; ++cur) {
+        if (*cur != nullptr)
+            WordFilter->Add(*cur, info);
+    }
+    // Mirror the UTF-8 table into the UTF-16 and single-byte tables.
+    InitWideFilter();
+    return true;
+}
+
+// Fills the UTF-16 and single-byte (CODES_YANDEX) tables from the entries of
+// the UTF-8 table.
+void TWordFilter::InitWideFilter() {
+    for (const auto& entry : *WordFilter) {
+        const TUtf16String wide = UTF8ToWide(entry.first);
+        const TString yandex = WideToChar(wide.data(), wide.size(), CODES_YANDEX);
+        PlainWordFilter->Add(yandex.c_str());
+        // The key length passed on includes the terminating zero.
+        WideWordFilter->insert_copy(wide.c_str(), wide.size() + 1, entry.second);
+    }
+}
+
+// Fills the single-byte (CODES_YANDEX) and UTF-8 tables from the entries of
+// the UTF-16 table.
+void TWordFilter::InitNarrowFilter() {
+    TString narrow; // reused across iterations
+    for (const auto& entry : *WideWordFilter) {
+        const wchar16* const key = entry.first;
+        const size_t keyLen = std::char_traits<wchar16>::length(key);
+        narrow.resize(keyLen);
+        WideToChar(key, keyLen, narrow.begin(), CODES_YANDEX);
+        PlainWordFilter->Add(narrow.c_str());
+        const TString utf8 = WideToUTF8(key);
+        WordFilter->Add(utf8.c_str(), entry.second);
+    }
+}
+
+// Parses one line of a word list. The line is split on spaces, tabs, line
+// breaks, commas and semicolons. From format version 2 on, the tokens "NONE:",
+// "LEFT:", "RIGHT:" and "BOTH:" switch the stickiness applied to the words that
+// follow. Any other token is stored in the result hash; a word that is already
+// present only gets the current language added to its mask.
+void TReaderImpl::ParseLine(const TUtf16String& line, ELanguage langcode, int version) {
+    static const TUtf16String delimiters = u" \t\r\n,;";
+    static const TUtf16String strNone = u"NONE:";
+    static const TUtf16String strLeft = u"LEFT:";
+    static const TUtf16String strRight = u"RIGHT:";
+    static const TUtf16String strBoth = u"BOTH:";
+
+    // A different language code starts a new zone, which begins with the
+    // default stickiness again.
+    if (CurrentLanguage != langcode) {
+        CurrentLanguage = langcode;
+        CurrentStickiness = DefaultStickiness;
+    }
+    // Words outside any language zone apply to all languages.
+    const TLangMask mask = (langcode == LANG_UNK) ? LI_ALL_LANGUAGES : TLangMask(langcode);
+
+    TVector<TUtf16String> words;
+    StringSplitter(line).SplitBySet(delimiters.c_str()).SkipEmpty().Collect(&words);
+    for (const TUtf16String& word : words) {
+        if (word.empty())
+            continue; // due diligence
+        if (version > 1) { // stickiness markers exist since version 2
+            bool isMarker = true;
+            if (word == strNone)
+                CurrentStickiness = STICK_NONE;
+            else if (word == strLeft)
+                CurrentStickiness = STICK_LEFT;
+            else if (word == strRight)
+                CurrentStickiness = STICK_RIGHT;
+            else if (word == strBoth)
+                CurrentStickiness = STICK_BOTH;
+            else
+                isMarker = false;
+            if (isMarker)
+                continue;
+        }
+        auto found = Res->find(word.c_str());
+        if (found != Res->end())
+            found->second.Language.SafeSet(langcode);
+        else
+            Res->insert_copy(word.c_str(), word.length() + 1, TWordFilter::TStopWordInfo(mask, CurrentStickiness));
+    }
+}
diff --git a/library/cpp/stopwords/stopwords.h b/library/cpp/stopwords/stopwords.h
new file mode 100644
index 0000000000..fab28a3488
--- /dev/null
+++ b/library/cpp/stopwords/stopwords.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include <library/cpp/charset/wide.h>
+#include <library/cpp/containers/str_map/str_map.h>
+#include <library/cpp/containers/str_hash/str_hash.h>
+#include <library/cpp/wordlistreader/wordlistreader.h>
+#include <util/generic/hash.h>
+#include <util/generic/ptr.h>
+#include <util/charset/wide.h>
+#include <util/memory/tempbuf.h>
+
+#include <type_traits>
+
+enum EStickySide { // stickiness attribute stored with each stop word (see TWordFilter::TStopWordInfo)
+ STICK_NONE = 0, // selected by the "NONE:" marker in version 2+ word lists
+ STICK_LEFT = 1, // selected by the "LEFT:" marker
+ STICK_RIGHT = 2, // selected by the "RIGHT:" marker
+ STICK_BOTH = 3, // selected by the "BOTH:" marker; numerically STICK_LEFT | STICK_RIGHT
+};
+
+size_t TTCharStrIHashImpl(const wchar16* ptr); // hash of the lower-cased null-terminated string; a null pointer is hashed as an empty string
+bool TTCharStrIEqualToImpl(const wchar16* s1, const wchar16* s2); // case-insensitive equality; two null pointers are equal, null never equals non-null
+
+// Hash functor for null-terminated UTF-16 keys; delegates to
+// TTCharStrIHashImpl.
+struct TTCharStrIHasher {
+    size_t operator()(const wchar16* s) const {
+        const size_t hash = TTCharStrIHashImpl(s);
+        return hash;
+    }
+};
+
+// Equality functor for null-terminated UTF-16 keys; delegates to
+// TTCharStrIEqualToImpl. Used as the key-equality predicate of
+// TWordFilter::TWideStopWordsHash.
+struct TTCharStrIEqualTo {
+    // const-qualified, like TTCharStrIHasher::operator(), so the predicate can
+    // also be invoked through a const functor object.
+    bool operator()(const wchar16* s1, const wchar16* s2) const {
+        return TTCharStrIEqualToImpl(s1, s2);
+    }
+};
+
+// Hash of stop words, plus facilities to load it from a file.
+// Three parallel tables are kept: UTF-8 (WordFilter), UTF-16 (WideWordFilter)
+// and single-byte csYandex (PlainWordFilter, kept only for GetWordFilter()).
+class TWordFilter {
+public:
+    // Per-word payload: the languages the word applies to and its stickiness.
+    struct TStopWordInfo {
+        ::TLangMask Language;
+        EStickySide Stickiness;
+        TStopWordInfo(::TLangMask lang = LI_ALL_LANGUAGES, EStickySide side = STICK_NONE)
+            : Language(lang)
+            , Stickiness(side)
+        {
+        }
+    };
+    typedef Hash<TStopWordInfo> TStopWordsHash;
+    typedef THashWithSegmentedPoolForKeys<wchar16, TStopWordInfo, TTCharStrIHasher, TTCharStrIEqualTo> TWideStopWordsHash;
+    // Maps a character type to the table type used for lookups
+    // (specialized for char and wchar16 after the class).
+    template <class TTChar>
+    struct THashType;
+
+    inline TWordFilter() {
+    }
+
+    // Recommended initialization - from a config file
+    bool InitStopWordsList(const char* filename);
+    bool InitStopWordsList(IInputStream& instream);
+
+    // Deprecated initialization - a plain array of words, no language data, no i18n.
+    // NOTE(review): the words are converted with UTF8ToWide() when the other
+    // tables are built, so they appear to be treated as UTF-8 - confirm.
+    bool InitStopWordsList(const char** s, size_t n);
+
+    // Drops all three tables.
+    void TermStopWordsList() {
+        WordFilter = nullptr;
+        WideWordFilter = nullptr;
+        PlainWordFilter = nullptr;
+    }
+
+    // Looks up a null-terminated word in the table matching TTChar.
+    // Returns false for a null or empty word, for a filter with no table
+    // loaded, and for unknown words. A known word matches when `lang` is empty
+    // or shares at least one language with the word's mask; on a match `*side`
+    // (if given) receives the word's stickiness.
+    // NOTE(review): an earlier comment here said char input is assumed to be
+    // csYandex (see MORPH-74), but the char table is populated with UTF-8
+    // keys - confirm which encoding callers pass.
+    template <class TTChar>
+    bool IsStopWord(const TTChar* word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
+        if (!word || !*word)
+            return false;
+        typedef typename THashType<TTChar>::Type THash;
+        const TAtomicSharedPtr<THash>& wordFilter = GetHashPtr<TTChar>();
+        if (!wordFilter)
+            return false;
+
+        typename THash::const_iterator it = wordFilter->find(word);
+        if (it == wordFilter->end())
+            return false;
+        if (lang.none() || (it->second.Language & lang).any()) {
+            if (side)
+                *side = it->second.Stickiness;
+            return true;
+        }
+        return false;
+    }
+
+    // assumes word is in UTF8
+    bool IsStopWord(const TString& word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
+        return IsStopWord(word.c_str(), lang, side);
+    }
+
+    bool IsStopWord(const TUtf16String& word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
+        return IsStopWord(word.c_str(), lang, side);
+    }
+
+    // Same lookup for a word given as pointer + length (not necessarily
+    // null-terminated): the characters are copied into a temporary
+    // null-terminated buffer first.
+    template <class TTChar>
+    bool IsStopWord(const TTChar* word, size_t len, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
+        // An empty word is never a stop word. Returning early also avoids
+        // calling memcpy() with a null source pointer, which is undefined
+        // behavior even for a zero length.
+        if (!word || !len)
+            return false;
+        TTempArray<TTChar> str(len + 1);
+        memcpy((void*)str.Data(), word, len * sizeof(TTChar));
+        str.Data()[len] = 0;
+        return IsStopWord(str.Data(), lang, side);
+    }
+
+    // Deprecated interface - get a plain list of single-byte strings
+    const HashSet* GetWordFilter() const {
+        return PlainWordFilter.Get();
+    }
+
+    static const TWordFilter EmptyFilter;
+
+private:
+    //in csYandex
+    TAtomicSharedPtr<HashSet> PlainWordFilter; // compatibility: will be gone when no one uses GetWordFilter()
+    //in UTF8
+    TAtomicSharedPtr<TStopWordsHash> WordFilter;
+    //in UTF16
+    TAtomicSharedPtr<TWideStopWordsHash> WideWordFilter;
+    // Build the UTF-16 and csYandex tables from the UTF-8 one.
+    void InitWideFilter();
+    // Build the UTF-8 and csYandex tables from the UTF-16 one.
+    void InitNarrowFilter();
+
+    // Returns the table used for lookups with the given character type.
+    template <class TTChar>
+    inline const TAtomicSharedPtr<typename THashType<TTChar>::Type>& GetHashPtr() const;
+};
+// Lookups with char strings use the narrow stop word hash.
+template <>
+struct TWordFilter::THashType<char> {
+    using Type = TStopWordsHash;
+};
+// Lookups with wchar16 strings use the wide stop word hash.
+template <>
+struct TWordFilter::THashType<wchar16> {
+    using Type = TWideStopWordsHash;
+};
+template <>
+inline const TAtomicSharedPtr<TWordFilter::TStopWordsHash>& TWordFilter::GetHashPtr<char>() const { // char lookups go to the UTF-8 table
+ return WordFilter;
+}
+template <>
+inline const TAtomicSharedPtr<TWordFilter::TWideStopWordsHash>& TWordFilter::GetHashPtr<wchar16>() const { // wchar16 lookups go to the UTF-16 table
+ return WideWordFilter;
+}