1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
|
#pragma once
#include <library/cpp/charset/wide.h>
#include <library/cpp/containers/str_map/str_map.h>
#include <library/cpp/containers/str_hash/str_hash.h>
#include <library/cpp/wordlistreader/wordlistreader.h>
#include <util/generic/hash.h>
#include <util/generic/ptr.h>
#include <util/charset/wide.h>
#include <util/memory/tempbuf.h>
#include <type_traits>
// Stickiness attribute stored with every stop word (see TWordFilter::TStopWordInfo).
// The numeric values are laid out so that STICK_BOTH equals STICK_LEFT | STICK_RIGHT.
enum EStickySide {
    STICK_NONE = 0,
    STICK_LEFT = 1,
    STICK_RIGHT = 2,
    STICK_BOTH = STICK_LEFT | STICK_RIGHT, // == 3
};
// Hash function that TTCharStrIHasher forwards to; only declared in this header.
// NOTE(review): the "I" in the name suggests case-insensitive hashing - confirm against the definition.
size_t TTCharStrIHashImpl(const wchar16* ptr);
// Equality predicate that TTCharStrIEqualTo forwards to; only declared in this header.
// NOTE(review): presumably consistent with TTCharStrIHashImpl (strings that compare equal hash equally) - confirm.
bool TTCharStrIEqualToImpl(const wchar16* s1, const wchar16* s2);
// Hash functor for wchar16 string keys given as a bare pointer.
// Used as the hasher of TWordFilter::TWideStopWordsHash; all the work is
// delegated to TTCharStrIHashImpl().
struct TTCharStrIHasher {
    size_t operator()(const wchar16* str) const {
        const size_t hashValue = TTCharStrIHashImpl(str);
        return hashValue;
    }
};
// Equality functor for wchar16 string keys given as bare pointers.
// Used as the key-equality predicate of TWordFilter::TWideStopWordsHash; the
// comparison itself is delegated to TTCharStrIEqualToImpl().
struct TTCharStrIEqualTo {
    // const-qualified (like TTCharStrIHasher::operator()) so that the predicate
    // can be invoked through a const instance, e.g. from const container methods.
    bool operator()(const wchar16* s1, const wchar16* s2) const {
        return TTCharStrIEqualToImpl(s1, s2);
    }
};
// Hash of stop words, plus facilities to load it from a file
class TWordFilter {
public:
    // Payload stored for every stop word.
    struct TStopWordInfo {
        ::TLangMask Language;   // IsStopWord() matches when this mask intersects the requested one
        EStickySide Stickiness; // reported through the `side` out-parameter of IsStopWord()

        TStopWordInfo(::TLangMask lang = LI_ALL_LANGUAGES, EStickySide side = STICK_NONE)
            : Language(lang)
            , Stickiness(side)
        {
        }
    };

    typedef Hash<TStopWordInfo> TStopWordsHash;
    typedef THashWithSegmentedPoolForKeys<wchar16, TStopWordInfo, TTCharStrIHasher, TTCharStrIEqualTo> TWideStopWordsHash;

    // Maps a character type to the hash type used for lookups of such strings
    // (specialized for char and wchar16 after the class definition).
    template <class TTChar>
    struct THashType;

    inline TWordFilter() {
    }

    // Recommended initialization - from a config file
    bool InitStopWordsList(const char* filename);
    bool InitStopWordsList(IInputStream& instream);
    // Deprecated initialization - just words in single-byte encoding, no language data, no i18n
    bool InitStopWordsList(const char** s, size_t n);

    // Resets all three hash pointers held by this filter to null; afterwards
    // every IsStopWord() call returns false and GetWordFilter() returns null.
    void TermStopWordsList() {
        WordFilter = nullptr;
        WideWordFilter = nullptr;
        PlainWordFilter = nullptr;
    }

    //in case TTChar == char, assumes csYandex
    //see MORPH-74
    // NOTE(review): char lookups go through WordFilter, which is annotated "in UTF8"
    // below, while the comment above says csYandex - confirm the expected encoding.
    //
    // Looks up a zero-terminated word in the hash matching TTChar.
    //   word - the word to test; null or empty yields false.
    //   lang - language restriction; an empty mask accepts an entry of any language,
    //          otherwise the entry's Language must intersect it.
    //   side - optional out-parameter; written only when the function returns true.
    // Returns true when the word is found and passes the language check.
    template <class TTChar>
    bool IsStopWord(const TTChar* word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
        if (!word || !*word)
            return false;

        typedef typename THashType<TTChar>::Type THash;
        const TAtomicSharedPtr<THash>& wordFilter = GetHashPtr<TTChar>();
        // The filter has not been initialized (or was terminated): nothing is a stop word.
        if (!wordFilter)
            return false;

        typename THash::const_iterator it = wordFilter->find(word);
        if (it == wordFilter->end())
            return false;

        if (lang.none() || (it->second.Language & lang).any()) {
            if (side)
                *side = it->second.Stickiness;
            return true;
        }
        return false;
    }

    // assumes word is in UTF8
    bool IsStopWord(const TString& word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
        return IsStopWord(word.c_str(), lang, side);
    }

    bool IsStopWord(const TUtf16String& word, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
        return IsStopWord(word.c_str(), lang, side);
    }

    // Same lookup for a word given as pointer + length (not necessarily
    // zero-terminated): the characters are copied into a temporary
    // zero-terminated buffer which is then passed to the pointer overload.
    template <class TTChar>
    bool IsStopWord(const TTChar* word, size_t len, ::TLangMask lang = ::TLangMask(), EStickySide* side = nullptr) const {
        // An empty word is never a stop word (the pointer overload rejects it as well).
        // Returning early also avoids passing a null source to memcpy() and
        // allocating a buffer that would not be used.
        if (!word || len == 0)
            return false;

        TTempArray<TTChar> str(len + 1);
        memcpy(str.Data(), word, len * sizeof(TTChar));
        str.Data()[len] = 0;
        return IsStopWord(str.Data(), lang, side);
    }

    // Deprecated interface - get a plain list of single-byte strings
    const HashSet* GetWordFilter() const {
        return PlainWordFilter.Get();
    }

    static const TWordFilter EmptyFilter;

private:
    //in csYandex
    TAtomicSharedPtr<HashSet> PlainWordFilter; // compatibility: will be gone when no one uses GetWordFilter()
    //in UTF8
    TAtomicSharedPtr<TStopWordsHash> WordFilter;
    //in UTF16
    TAtomicSharedPtr<TWideStopWordsHash> WideWordFilter;

    void InitWideFilter();
    void InitNarrowFilter();

    // Returns the hash pointer used for TTChar strings
    // (specialized for char and wchar16 after the class definition).
    template <class TTChar>
    inline const TAtomicSharedPtr<typename THashType<TTChar>::Type>& GetHashPtr() const;
};
// Narrow (char) strings are looked up in TStopWordsHash.
template <>
struct TWordFilter::THashType<char> {
    using Type = TWordFilter::TStopWordsHash;
};
// Wide (wchar16) strings are looked up in TWideStopWordsHash.
template <>
struct TWordFilter::THashType<wchar16> {
    using Type = TWordFilter::TWideStopWordsHash;
};
// char specialization: hands out the pointer stored in the WordFilter member.
template <>
inline const TAtomicSharedPtr<TWordFilter::THashType<char>::Type>& TWordFilter::GetHashPtr<char>() const {
    return this->WordFilter;
}
// wchar16 specialization: hands out the pointer stored in the WideWordFilter member.
template <>
inline const TAtomicSharedPtr<TWordFilter::THashType<wchar16>::Type>& TWordFilter::GetHashPtr<wchar16>() const {
    return this->WideWordFilter;
}
|