validate canons without yatest_common

author: qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
committer: qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
commit: 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree: bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/stopwords
parent: 332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download: ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
3 files changed, 296 insertions, 0 deletions
diff --git a/library/cpp/stopwords/README.md b/library/cpp/stopwords/README.md
new file mode 100644
index 0000000000..cb48a4e345
--- /dev/null
+++ b/library/cpp/stopwords/README.md
@@ -0,0 +1 @@
+Библиотека для работы с файлами стоп-слов, заданными в [таком формате](https://a.yandex-team.ru/arc/trunk/arcadia/search/wizard/data/wizard/language/stopword.lst).
diff --git a/library/cpp/stopwords/stopwords.cpp b/library/cpp/stopwords/stopwords.cpp
new file mode 100644
index 0000000000..54186ca1bb
--- /dev/null
+++ b/library/cpp/stopwords/stopwords.cpp
@@ -0,0 +1,152 @@
+#include <algorithm>
+
+#include <library/cpp/charset/wide.h>
+#include <util/memory/tempbuf.h>
+#include <util/string/vector.h>
+#include <util/generic/yexception.h>
+#include <util/digest/murmur.h>
+#include <util/string/split.h>
+
+#include "stopwords.h"
+
+const EStickySide DefaultStickiness = STICK_RIGHT; // stickiness used whenever the input does not select one explicitly
+const TWordFilter TWordFilter::EmptyFilter; // definition of the static member; default-constructed, no word list is loaded here
+
+// Functor that lowercases a single wchar16 code unit via ToLower().
+// It carries no state, so the call operator is const and can be invoked
+// through const objects and const references as well.
+struct TToLower {
+    wchar16 operator()(wchar16 c) const {
+        return static_cast<wchar16>(ToLower(c));
+    }
+};
+
+// Case-insensitive hash of a NUL-terminated wchar16 string.
+// The characters are lowercased into a temporary buffer and the raw bytes of
+// that buffer are passed to MurmurHash. A null pointer is treated as length 0.
+size_t TTCharStrIHashImpl(const wchar16* ptr) {
+    size_t charCount = 0;
+    if (ptr) {
+        charCount = std::char_traits<wchar16>::length(ptr);
+    }
+    TCharTemp lowered(charCount);
+    const wchar16* const last = ptr + charCount;
+    std::transform(ptr, last, lowered.Data(), TToLower());
+    const size_t byteCount = charCount * sizeof(wchar16);
+    return MurmurHash<size_t>(static_cast<void*>(lowered.Data()), byteCount);
+}
+
+// Case-insensitive equality of two NUL-terminated wchar16 strings.
+// Two null pointers compare equal; a null pointer never equals a non-null one.
+bool TTCharStrIEqualToImpl(const wchar16* s1, const wchar16* s2) {
+    if (s1 == nullptr || s2 == nullptr) {
+        // at least one side is null here, so pointer equality means "both null"
+        return s1 == s2;
+    }
+    while (*s1 != 0 && *s2 != 0) {
+        if (ToLower(*s1) != ToLower(*s2)) {
+            return false;
+        }
+        ++s1;
+        ++s2;
+    }
+    // equal only if both strings ended at the same position
+    return *s1 == *s2;
+}
+
+namespace {
+    // Word-list reader that stores every parsed word into a wide stop-word hash.
+    // It remembers the language and stickiness in effect between ParseLine() calls.
+    struct TReaderImpl: public TWordListReader {
+        TWordFilter::TWideStopWordsHash* Res;              // destination hash supplied by the caller
+        ELanguage CurrentLanguage = LANG_UNK;              // language passed to the most recent ParseLine()
+        EStickySide CurrentStickiness = DefaultStickiness; // stickiness given to newly inserted words
+
+        TReaderImpl(TWordFilter::TWideStopWordsHash* res)
+            : Res(res)
+        {
+        }
+
+        void ParseLine(const TUtf16String& line, ELanguage langcode, int version) override;
+
+        // Forwarders to the base-class ReadDataFile() overloads.
+        void ReadDataFile(const char* s) {
+            TWordListReader::ReadDataFile(s);
+        }
+        void ReadDataFile(IInputStream& in) {
+            TWordListReader::ReadDataFile(in);
+        }
+    };
+}
+
+// Loads the stop-word list from a file.
+// A null or empty file name is not an error: nothing is loaded, the current
+// state is left untouched and true is returned.
+bool TWordFilter::InitStopWordsList(const char* filename) {
+    const bool nothingToLoad = (filename == nullptr) || (*filename == 0);
+    if (nothingToLoad) {
+        return true;
+    }
+    TBuffered<TUnbufferedFileInput> input(4096, filename);
+    return InitStopWordsList(input);
+}
+
+// Rebuilds all filters from a word-list stream.
+// The wide hash is filled by the reader first; InitNarrowFilter() then derives
+// the UTF-8 and plain filters from it. Always returns true.
+bool TWordFilter::InitStopWordsList(IInputStream& instream) {
+    // Drop whatever was loaded before and start from fresh, empty containers.
+    TermStopWordsList();
+    WordFilter.Reset(new TStopWordsHash);
+    WideWordFilter.Reset(new TWideStopWordsHash);
+    PlainWordFilter.Reset(new HashSet);
+
+    TReaderImpl wideReader(WideWordFilter.Get());
+    wideReader.ReadDataFile(instream);
+
+    InitNarrowFilter();
+    return true;
+}
+
+// Deprecated initializer - no language data, default stickiness.
+// Any previously loaded list is dropped first, even when s is null
+// (in that case false is returned and the filter stays empty).
+// Null entries inside the array are skipped.
+bool TWordFilter::InitStopWordsList(const char** s, size_t n) {
+    TermStopWordsList();
+    if (s == nullptr) {
+        return false;
+    }
+
+    WordFilter.Reset(new TStopWordsHash);
+    WideWordFilter.Reset(new TWideStopWordsHash);
+    PlainWordFilter.Reset(new HashSet);
+
+    const char* const* const last = s + n;
+    for (const char* const* word = s; word != last; ++word) {
+        if (*word == nullptr) {
+            continue;
+        }
+        WordFilter->Add(*word, TStopWordInfo(LI_ALL_LANGUAGES, DefaultStickiness));
+    }
+
+    // The wide and plain filters are derived from the words just added.
+    InitWideFilter();
+    return true;
+}
+
+// Fills the plain (CODES_YANDEX) and wide (UTF-16) filters from the entries
+// of WordFilter, whose keys are converted with UTF8ToWide().
+void TWordFilter::InitWideFilter() {
+    for (const auto& entry : *WordFilter) {
+        TUtf16String wide = UTF8ToWide(entry.first);
+        PlainWordFilter->Add(WideToChar(wide.data(), wide.size(), CODES_YANDEX).c_str());
+        // size() + 1 so that the terminating NUL is copied together with the key
+        WideWordFilter->insert_copy(wide.c_str(), wide.size() + 1, entry.second);
+    }
+}
+
+// Fills the plain (CODES_YANDEX) and UTF-8 filters from the entries of
+// WideWordFilter, keeping the per-word info of each entry.
+void TWordFilter::InitNarrowFilter() {
+    TString narrow; // conversion buffer, reused for every word
+    for (const auto& entry : *WideWordFilter) {
+        const wchar16* const wide = entry.first;
+        const size_t wideLen = std::char_traits<wchar16>::length(wide);
+        narrow.resize(wideLen);
+        WideToChar(wide, wideLen, narrow.begin(), CODES_YANDEX);
+        PlainWordFilter->Add(narrow.c_str());
+        WordFilter->Add(WideToUTF8(wide).c_str(), entry.second);
+    }
+}
+
+// Parses one line of a stop-word list and stores its words into *Res.
+//   line     - raw line; split into tokens on any of " \t\r\n,;"
+//   langcode - language the line belongs to; for new words LANG_UNK is stored as LI_ALL_LANGUAGES
+//   version  - list format version; the NONE:/LEFT:/RIGHT:/BOTH: directives are honoured only when version > 1
+void TReaderImpl::ParseLine(const TUtf16String& line, ELanguage langcode, int version) {
+    static const TUtf16String delimiters = u" \t\r\n,;";
+    static const TUtf16String strNone = u"NONE:";
+    static const TUtf16String strLeft = u"LEFT:";
+    static const TUtf16String strRight = u"RIGHT:";
+    static const TUtf16String strBoth = u"BOTH:";
+
+    // reset stickiness at the beginning of each zone
+    if (CurrentLanguage != langcode) {
+        CurrentLanguage = langcode;
+        CurrentStickiness = DefaultStickiness;
+    }
+    const TLangMask newWordMask = (langcode == LANG_UNK) ? LI_ALL_LANGUAGES : TLangMask(langcode);
+
+    TVector<TUtf16String> tokens;
+    StringSplitter(line).SplitBySet(delimiters.c_str()).SkipEmpty().Collect(&tokens);
+
+    for (const TUtf16String& token : tokens) {
+        if (token.empty()) {
+            continue; // due diligence
+        }
+
+        // support for stickiness comes from version 2
+        if (version > 1) {
+            bool isDirective = true;
+            if (token == strNone) {
+                CurrentStickiness = STICK_NONE;
+            } else if (token == strLeft) {
+                CurrentStickiness = STICK_LEFT;
+            } else if (token == strRight) {
+                CurrentStickiness = STICK_RIGHT;
+            } else if (token == strBoth) {
+                CurrentStickiness = STICK_BOTH;
+            } else {
+                isDirective = false;
+            }
+            if (isDirective) {
+                continue; // a directive only changes state, it is not a word
+            }
+        }
+
+        auto known = Res->find(token.c_str());
+        if (known != Res->end()) {
+            // NOTE(review): for a word that is already present only `langcode` is
+            // added to its mask and its stickiness stays as first seen - confirm
+            // this is the intended behaviour for LANG_UNK lines.
+            known->second.Language.SafeSet(langcode);
+        } else {
+            // length() + 1 so that the terminating NUL is copied together with the key
+            Res->insert_copy(token.c_str(), token.length() + 1, TWordFilter::TStopWordInfo(newWordMask, CurrentStickiness));
+        }
+    }
+}
diff --git a/library/cpp/stopwords/stopwords.h b/library/cpp/stopwords/stopwords.h
new file mode 100644
index 0000000000..fab28a3488
--- /dev/null
+++ b/library/cpp/stopwords/stopwords.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include <library/cpp/charset/wide.h>
+#include <library/cpp/containers/str_map/str_map.h>
+#include <library/cpp/containers/str_hash/str_hash.h>
+#include <library/cpp/wordlistreader/wordlistreader.h>
+#include <util/generic/hash.h>
+#include <util/generic/ptr.h>
+#include <util/charset/wide.h>
+#include <util/memory/tempbuf.h>
+
+#include <type_traits>
+
+// Stickiness side of a stop word.
+// The values form a two-bit mask: STICK_BOTH equals STICK_LEFT | STICK_RIGHT.
+enum EStickySide {
+    STICK_NONE = 0,
+    STICK_LEFT = 1 << 0,
+    STICK_RIGHT = 1 << 1,
+    STICK_BOTH = STICK_LEFT | STICK_RIGHT,
+};
+
+size_t TTCharStrIHashImpl(const wchar16* ptr); // case-insensitive hash of a NUL-terminated string; a null pointer hashes as an empty string
+bool TTCharStrIEqualToImpl(const wchar16* s1, const wchar16* s2); // case-insensitive equality; two null pointers are equal, null vs non-null is not
+
+struct TTCharStrIHasher { // hash functor for wchar16 C strings; forwards to TTCharStrIHashImpl()
+    size_t operator()(const wchar16* s) const {
+        return TTCharStrIHashImpl(s);
+    }
+};
+
+// Case-insensitive equality functor for NUL-terminated wchar16 strings;
+// forwards to TTCharStrIEqualToImpl().
+// The call operator is const (like the one of TTCharStrIHasher), so the
+// functor can also be invoked through const objects and const references.
+struct TTCharStrIEqualTo {
+    bool operator()(const wchar16* s1, const wchar16* s2) const {
+        return TTCharStrIEqualToImpl(s1, s2);
+    }
+};
+
+// Hash of stop words, plus facilities to load it from a file. The same word set is kept in three forms: csYandex, UTF8 and UTF16.
+class TWordFilter {
+public:
+    struct TStopWordInfo { // per-word payload stored in the hashes
+        ::TLangMask Language; // languages for which the word is a stop word
+        EStickySide Stickiness; // stickiness recorded for the word
+        TStopWordInfo(::TLangMask lang = LI_ALL_LANGUAGES, EStickySide side = STICK_NONE)
+            : Language(lang)
+            , Stickiness(side)
+        {
+        }
+    };
+    typedef Hash<TStopWordInfo> TStopWordsHash; // narrow (char) keys
+    typedef THashWithSegmentedPoolForKeys<wchar16, TStopWordInfo, TTCharStrIHasher, TTCharStrIEqualTo> TWideStopWordsHash; // wchar16 keys, hashed and compared case-insensitively
+    template <class TTChar>
+    struct THashType; // maps a character type to the hash type used for it; only specializations are defined
+
+    inline TWordFilter() {
+    }
+
+    // Recommended initialization - from a config file
+    bool InitStopWordsList(const char* filename);
+    bool InitStopWordsList(IInputStream& instream);
+
+    // Deprecated initialization - just words in single-byte encoding, no language data, no i18n. NOTE(review): the implementation converts the words with UTF8ToWide - confirm the expected encoding.
+    bool InitStopWordsList(const char** s, size_t n);
+
+    void TermStopWordsList() { // releases all three filters
+        WordFilter = nullptr;
+        WideWordFilter = nullptr;
+        PlainWordFilter = nullptr;
+    }
+
+    //in case TTChar == char, assumes csYandex
+    //see MORPH-74
+    template <class TTChar>
+    bool IsStopWord(const TTChar* word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
+        if (!word || !*word) // null or empty word is never a stop word
+            return false;
+        typedef typename THashType<TTChar>::Type THash;
+        const TAtomicSharedPtr<THash>& wordFilter = GetHashPtr<TTChar>();
+        if (!wordFilter) // no list loaded
+            return false;
+
+        typename THash::const_iterator it = wordFilter->find(word);
+        if (it == wordFilter->end())
+            return false;
+        if (lang.none() || (it->second.Language & lang).any()) { // an empty mask matches any language
+            if (side) // *side is written only when the word matches
+                *side = it->second.Stickiness;
+            return true;
+        }
+        return false;
+    }
+
+    // assumes word is in UTF8
+    bool IsStopWord(const TString& word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
+        return IsStopWord(word.c_str(), lang, side);
+    }
+
+    bool IsStopWord(const TUtf16String& word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
+        return IsStopWord(word.c_str(), lang, side);
+    }
+
+    template <class TTChar>
+    bool IsStopWord(const TTChar* word, size_t len, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
+        TTempArray<TTChar> str(len + 1); // room for the word plus a terminating zero
+        memcpy((void*)str.Data(), word, len * sizeof(TTChar));
+        str.Data()[len] = 0; // the pointer overload expects a zero-terminated string
+        return IsStopWord(str.Data(), lang, side);
+    }
+
+    // Deprecated interface - get a plain list of single-byte strings; returns PlainWordFilter.Get()
+    const HashSet* GetWordFilter() const {
+        return PlainWordFilter.Get();
+    }
+
+    static const TWordFilter EmptyFilter; // declared here, defined in stopwords.cpp
+
+private:
+    //in csYandex
+    TAtomicSharedPtr<HashSet> PlainWordFilter; // compatibility: will be gone when no one uses GetWordFilter()
+    //in UTF8
+    TAtomicSharedPtr<TStopWordsHash> WordFilter;
+    //in UTF16
+    TAtomicSharedPtr<TWideStopWordsHash> WideWordFilter;
+    void InitWideFilter(); // fills PlainWordFilter and WideWordFilter from WordFilter
+    void InitNarrowFilter(); // fills PlainWordFilter and WordFilter from WideWordFilter
+
+    template <class TTChar>
+    inline const TAtomicSharedPtr<typename THashType<TTChar>::Type>& GetHashPtr() const; // returns the filter keyed by TTChar; only specializations are defined
+};
+// char keys are looked up in the narrow hash.
+template <>
+struct TWordFilter::THashType<char> {
+    using Type = TStopWordsHash;
+};
+
+// wchar16 keys are looked up in the wide hash.
+template <>
+struct TWordFilter::THashType<wchar16> {
+    using Type = TWideStopWordsHash;
+};
+
+// Accessor for the narrow filter; used by IsStopWord<char>().
+template <>
+inline const TAtomicSharedPtr<TWordFilter::TStopWordsHash>& TWordFilter::GetHashPtr<char>() const {
+    return WordFilter;
+}
+
+// Accessor for the wide filter; used by IsStopWord<wchar16>().
+template <>
+inline const TAtomicSharedPtr<TWordFilter::TWideStopWordsHash>& TWordFilter::GetHashPtr<wchar16>() const {
+    return WideWordFilter;
+}
author	qrort <qrort@yandex-team.com>	2022-11-30 23:47:12 +0300
committer	qrort <qrort@yandex-team.com>	2022-11-30 23:47:12 +0300
commit	22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree	bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/stopwords
parent	332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download	ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz