diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/stopwords | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/stopwords')
-rw-r--r-- | library/cpp/stopwords/README.md | 1 | ||||
-rw-r--r-- | library/cpp/stopwords/stopwords.cpp | 152 | ||||
-rw-r--r-- | library/cpp/stopwords/stopwords.h | 143 |
3 files changed, 296 insertions, 0 deletions
diff --git a/library/cpp/stopwords/README.md b/library/cpp/stopwords/README.md new file mode 100644 index 0000000000..cb48a4e345 --- /dev/null +++ b/library/cpp/stopwords/README.md @@ -0,0 +1 @@ +Библиотека для работы с файлами стоп-слов, заданными в [таком формате](https://a.yandex-team.ru/arc/trunk/arcadia/search/wizard/data/wizard/language/stopword.lst). diff --git a/library/cpp/stopwords/stopwords.cpp b/library/cpp/stopwords/stopwords.cpp new file mode 100644 index 0000000000..54186ca1bb --- /dev/null +++ b/library/cpp/stopwords/stopwords.cpp @@ -0,0 +1,152 @@ +#include <algorithm> + +#include <library/cpp/charset/wide.h> +#include <util/memory/tempbuf.h> +#include <util/string/vector.h> +#include <util/generic/yexception.h> +#include <util/digest/murmur.h> +#include <util/string/split.h> + +#include "stopwords.h" + +const EStickySide DefaultStickiness = STICK_RIGHT; +const TWordFilter TWordFilter::EmptyFilter; + +struct TToLower { + wchar16 operator()(wchar16 c) { + return (wchar16)ToLower(c); + } +}; + +size_t TTCharStrIHashImpl(const wchar16* ptr) { + const size_t len = ptr ? std::char_traits<wchar16>::length(ptr) : 0; + TCharTemp buf(len); + std::transform(ptr, ptr + len, buf.Data(), TToLower()); + return MurmurHash<size_t>((void*)buf.Data(), len * sizeof(wchar16)); +} + +bool TTCharStrIEqualToImpl(const wchar16* s1, const wchar16* s2) { + if (!s1 || !s2) + return !s1 == !s2; + for (; *s1 && *s2; ++s1, ++s2) + if (ToLower(*s1) != ToLower(*s2)) + return false; + return *s1 == *s2; +} + +namespace { + struct TReaderImpl: public TWordListReader { + TWordFilter::TWideStopWordsHash* Res; + ELanguage CurrentLanguage; + EStickySide CurrentStickiness; + TReaderImpl(TWordFilter::TWideStopWordsHash* res) + : Res(res) + , CurrentLanguage(LANG_UNK) + , CurrentStickiness(DefaultStickiness) + { + } + void ParseLine(const TUtf16String& line, ELanguage langcode, int version) override; + void ReadDataFile(const char* s) { + TWordListReader::ReadDataFile(s); + } + void ReadDataFile(IInputStream& in) { + TWordListReader::ReadDataFile(in); + } + }; +} + +bool TWordFilter::InitStopWordsList(const char* filename) { + if (!filename || !*filename) + return true; + TBuffered<TUnbufferedFileInput> src(4096, filename); + return InitStopWordsList(src); +} + +bool TWordFilter::InitStopWordsList(IInputStream& instream) { + TermStopWordsList(); + WordFilter.Reset(new TStopWordsHash); + WideWordFilter.Reset(new TWideStopWordsHash); + PlainWordFilter.Reset(new HashSet); + TReaderImpl reader(WideWordFilter.Get()); + reader.ReadDataFile(instream); + InitNarrowFilter(); + return true; +} + +// Deprecated initializer - no language data, default stickiness +bool TWordFilter::InitStopWordsList(const char** s, size_t n) { + TermStopWordsList(); + if (!s) + return false; + WordFilter.Reset(new TStopWordsHash); + WideWordFilter.Reset(new TWideStopWordsHash); + PlainWordFilter.Reset(new HashSet); + for (size_t i = 0; i < n; i++) { + if (s[i]) + WordFilter->Add(s[i], TStopWordInfo(LI_ALL_LANGUAGES, DefaultStickiness)); + } + InitWideFilter(); + return true; +} + +void TWordFilter::InitWideFilter() { + for (TStopWordsHash::const_iterator it = WordFilter->begin(); it != WordFilter->end(); ++it) { + TUtf16String tmp = UTF8ToWide(it->first); + PlainWordFilter->Add(WideToChar(tmp.data(), tmp.size(), CODES_YANDEX).c_str()); + WideWordFilter->insert_copy(tmp.c_str(), tmp.size() + 1, it->second); + } +} + +void TWordFilter::InitNarrowFilter() { + TString tmp; + for (TWideStopWordsHash::const_iterator it = WideWordFilter->begin(); it != WideWordFilter->end(); ++it) { + const wchar16* const str = it->first; + const size_t len = std::char_traits<wchar16>::length(str); + tmp.resize(len); + WideToChar(str, len, tmp.begin(), CODES_YANDEX); + PlainWordFilter->Add(tmp.c_str()); + WordFilter->Add(WideToUTF8(it->first).c_str(), it->second); + } +} + +void TReaderImpl::ParseLine(const TUtf16String& line, ELanguage langcode, int version) { + static const TUtf16String delimiters = u" \t\r\n,;"; + static const TUtf16String strNone = u"NONE:"; + static const TUtf16String strLeft = u"LEFT:"; + static const TUtf16String strRight = u"RIGHT:"; + static const TUtf16String strBoth = u"BOTH:"; + + if (langcode != CurrentLanguage) { + CurrentStickiness = DefaultStickiness; // reset stickiness at the beginning of each zone + CurrentLanguage = langcode; + } + TLangMask langCode = langcode != LANG_UNK ? TLangMask(langcode) : LI_ALL_LANGUAGES; + + TVector<TUtf16String> tokens; + StringSplitter(line).SplitBySet(delimiters.c_str()).SkipEmpty().Collect(&tokens); + TVector<TUtf16String>::const_iterator it; + for (it = tokens.begin(); it != tokens.end(); it++) { + if (it->empty()) + continue; // due diligence + if (version > 1) { // support for stickiness comes from version 2 + if (*it == strNone) { + CurrentStickiness = STICK_NONE; + continue; + } else if (*it == strLeft) { + CurrentStickiness = STICK_LEFT; + continue; + } else if (*it == strRight) { + CurrentStickiness = STICK_RIGHT; + continue; + } else if (*it == strBoth) { + CurrentStickiness = STICK_BOTH; + continue; + } + } + TWordFilter::TWideStopWordsHash::iterator fit = Res->find(it->c_str()); + if (fit == Res->end()) + Res->insert_copy(it->c_str(), it->length() + 1, TWordFilter::TStopWordInfo(langCode, CurrentStickiness)); + else + fit->second.Language.SafeSet(langcode); + } +} diff --git a/library/cpp/stopwords/stopwords.h b/library/cpp/stopwords/stopwords.h new file mode 100644 index 0000000000..fab28a3488 --- /dev/null +++ b/library/cpp/stopwords/stopwords.h @@ -0,0 +1,143 @@ +#pragma once + +#include <library/cpp/charset/wide.h> +#include <library/cpp/containers/str_map/str_map.h> +#include <library/cpp/containers/str_hash/str_hash.h> +#include <library/cpp/wordlistreader/wordlistreader.h> +#include <util/generic/hash.h> +#include <util/generic/ptr.h> +#include <util/charset/wide.h> +#include <util/memory/tempbuf.h> + +#include <type_traits> + +enum EStickySide { + STICK_NONE = 0, + STICK_LEFT = 1, + STICK_RIGHT = 2, + STICK_BOTH = 3, +}; + +size_t TTCharStrIHashImpl(const wchar16* ptr); +bool TTCharStrIEqualToImpl(const wchar16* s1, const wchar16* s2); + +struct TTCharStrIHasher { + size_t operator()(const wchar16* s) const { + return TTCharStrIHashImpl(s); + } +}; + +struct TTCharStrIEqualTo { + bool operator()(const wchar16* s1, const wchar16* s2) { + return TTCharStrIEqualToImpl(s1, s2); + } +}; + +// Hash of stop words, plus facilities to load it from a file +class TWordFilter { +public: + struct TStopWordInfo { + ::TLangMask Language; + EStickySide Stickiness; + TStopWordInfo(::TLangMask lang = LI_ALL_LANGUAGES, EStickySide side = STICK_NONE) + : Language(lang) + , Stickiness(side) + { + } + }; + typedef Hash<TStopWordInfo> TStopWordsHash; + typedef THashWithSegmentedPoolForKeys<wchar16, TStopWordInfo, TTCharStrIHasher, TTCharStrIEqualTo> TWideStopWordsHash; + template <class TTChar> + struct THashType; + + inline TWordFilter() { + } + + // Recommended initialization - from a config file + bool InitStopWordsList(const char* filename); + bool InitStopWordsList(IInputStream& instream); + + // Deprecated initialization - just words in single-byte encoding, no language data, no i18n + bool InitStopWordsList(const char** s, size_t n); + + void TermStopWordsList() { + WordFilter = nullptr; + WideWordFilter = nullptr; + PlainWordFilter = nullptr; + } + + //in case TTChar == char, assumes csYandex + //see MORPH-74 + template <class TTChar> + bool IsStopWord(const TTChar* word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const { + if (!word || !*word) + return false; + typedef typename THashType<TTChar>::Type THash; + const TAtomicSharedPtr<THash>& wordFilter = GetHashPtr<TTChar>(); + if (!wordFilter) + return false; + + typename THash::const_iterator it = wordFilter->find(word); + if (it == wordFilter->end()) + return false; + if (lang.none() || (it->second.Language & lang).any()) { + if (side) + *side = it->second.Stickiness; + return true; + } + return false; + } + + // assumes word is in UTF8 + bool IsStopWord(const TString& word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const { + return IsStopWord(word.c_str(), lang, side); + } + + bool IsStopWord(const TUtf16String& word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const { + return IsStopWord(word.c_str(), lang, side); + } + + template <class TTChar> + bool IsStopWord(const TTChar* word, size_t len, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const { + TTempArray<TTChar> str(len + 1); + memcpy((void*)str.Data(), word, len * sizeof(TTChar)); + str.Data()[len] = 0; + return IsStopWord(str.Data(), lang, side); + } + + // Deprecated interface - get a plain list of single-byte strings + const HashSet* GetWordFilter() const { + return PlainWordFilter.Get(); + } + + static const TWordFilter EmptyFilter; + +private: + //in csYandex + TAtomicSharedPtr<HashSet> PlainWordFilter; // compatibility: will be gone when no one uses GetWordFilter() + //in UTF8 + TAtomicSharedPtr<TStopWordsHash> WordFilter; + //in UTF16 + TAtomicSharedPtr<TWideStopWordsHash> WideWordFilter; + void InitWideFilter(); + void InitNarrowFilter(); + + template <class TTChar> + inline const TAtomicSharedPtr<typename THashType<TTChar>::Type>& GetHashPtr() const; +}; +template <> +struct TWordFilter::THashType<char> { + typedef TStopWordsHash Type; +}; +template <> +struct TWordFilter::THashType<wchar16> { + typedef TWideStopWordsHash Type; +}; +template <> +inline const TAtomicSharedPtr<TWordFilter::TStopWordsHash>& TWordFilter::GetHashPtr<char>() const { + return WordFilter; +} +template <> +inline const TAtomicSharedPtr<TWordFilter::TWideStopWordsHash>& TWordFilter::GetHashPtr<wchar16>() const { + return WideWordFilter; +} |