1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
#include <Common/TLDListsHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/logger_useful.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <string_view>
#include <unordered_set>
namespace DB
{
/// Error codes referenced in this file. They are only declared here (`extern`);
/// their definitions live elsewhere in the project.
namespace ErrorCodes
{
/// Thrown by TLDListsHolder::getTldList() when no list with the requested name is registered.
extern const int TLD_LIST_NOT_FOUND;
/// Thrown by TLDListsHolder::parseAndAddTldList() when the input was not read up to EOF.
extern const int LOGICAL_ERROR;
}
/// Number of bytes used as both the size and the alignment of the very first
/// allocation made from the Arena in the TLDList constructor, i.e. the padding
/// placed in front of the keys stored afterwards.
constexpr size_t StringHashTablePadRequirement = 8;
/// TLDList
/// Creates an empty list: the hash table is constructed with `size` and
/// a fresh Arena is allocated to own the key bytes.
TLDList::TLDList(size_t size)
    : tld_container(size)
    , memory_pool(std::make_unique<Arena>())
{
    /// Keys handed to StringHashTable must be padded to 8 bytes.
    /// On the right side this is already guaranteed by Arena (memory_pool),
    /// which keeps 15 bytes of padding after the data.
    ///
    /// On the left side StringHashTable may touch the byte at offset -1 of the key,
    /// so reserve a padding chunk before the first key gets stored:
    constexpr size_t left_pad = StringHashTablePadRequirement;
    memory_pool->alignedAlloc(left_pad, left_pad);
}
void TLDList::insert(const String & host, TLDType type)
{
StringRef owned_host{memory_pool->insert(host.data(), host.size()), host.size()};
tld_container[owned_host] = type;
}
/// Returns the type stored for `host`, or TLDType::TLD_NONE when the host is not in the list.
TLDType TLDList::lookup(StringRef host) const
{
    auto entry = tld_container.find(host);
    return entry != nullptr ? entry->getMapped() : TLDType::TLD_NONE;
}
/// TLDListsHolder
/// Returns the process-wide holder.
/// It is a function-local static, so it is constructed on the first call.
TLDListsHolder & TLDListsHolder::getInstance()
{
    static TLDListsHolder holder;
    return holder;
}
/// Nothing to do at construction time: lists are registered later
/// through parseConfig() / parseAndAddTldList().
TLDListsHolder::TLDListsHolder() = default;
/// Loads every list declared under the `top_level_domains_lists` config section.
///
/// For each key in that section the configured value is appended to
/// `top_level_domains_path` to form the file path, and the file is parsed
/// and registered under the key name via parseAndAddTldList().
void TLDListsHolder::parseConfig(const std::string & top_level_domains_path, const Poco::Util::AbstractConfiguration & config)
{
    static constexpr auto lists_section = "top_level_domains_lists";

    Poco::Util::AbstractConfiguration::Keys list_names;
    config.keys(lists_section, list_names);

    auto * logger = &Poco::Logger::get("TLDListsHolder");

    for (const auto & list_name : list_names)
    {
        const std::string file_name = config.getString(std::string(lists_section) + "." + list_name);
        const std::string file_path = top_level_domains_path + file_name;

        LOG_TRACE(logger, "{} loading from {}", list_name, file_path);
        const size_t hosts_count = parseAndAddTldList(list_name, file_path);
        LOG_INFO(logger, "{} was added ({} hosts)", list_name, hosts_count);
    }
}
/// Parses the TLD list file at `path` and registers the result under `name`.
///
/// The file is processed line by line:
///  - lines starting with "//" are comments and are skipped;
///  - trailing whitespace is stripped, lines that become empty are skipped;
///  - "*.<host>" is stored as <host> with TLDType::TLD_ANY;
///  - "!<host>"  is stored as <host> with TLDType::TLD_EXCLUDE;
///  - anything else is stored unchanged with TLDType::TLD_REGULAR.
/// If the same host occurs more than once, the first occurrence wins (emplace() does not overwrite).
///
/// Returns the size of the resulting list.
/// Throws LOGICAL_ERROR if the input was not consumed up to EOF.
size_t TLDListsHolder::parseAndAddTldList(const std::string & name, const std::string & path)
{
    /// TLD lists contain non-ASCII (UTF-8) hosts. Passing a negative `char` to isspace()
    /// is undefined behaviour, so the value is converted to `unsigned char` first.
    const auto is_space = [](char c) { return ::isspace(static_cast<unsigned char>(c)) != 0; };

    std::unordered_map<std::string, TLDType> tld_list_tmp;

    ReadBufferFromFile in(path);
    String buffer;
    while (!in.eof())
    {
        readEscapedStringUntilEOL(buffer, in);
        /// Step over one more byte, presumably the end-of-line character
        /// that readEscapedStringUntilEOL() stopped at.
        if (!in.eof())
            ++in.position();

        std::string_view line(buffer);

        /// Skip comments
        if (line.starts_with("//"))
            continue;

        /// Strip trailing whitespace: keep everything up to (and including) the last non-space character.
        line = line.substr(0, line.rend() - std::find_if_not(line.rbegin(), line.rend(), is_space));

        /// Skip empty line
        if (line.empty())
            continue;

        /// Handle special prefixes.
        if (line.starts_with("*."))
        {
            line = line.substr(2);
            tld_list_tmp.emplace(line, TLDType::TLD_ANY);
        }
        else if (line[0] == '!')
        {
            line = line.substr(1);
            tld_list_tmp.emplace(line, TLDType::TLD_EXCLUDE);
        }
        else
            tld_list_tmp.emplace(line, TLDType::TLD_REGULAR);
    }

    /// Defensive check: the loop above only finishes at EOF.
    if (!in.eof())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Not all list had been read: {}", name);

    /// The number of hosts is known now, so the final container can be sized up front.
    TLDList tld_list(tld_list_tmp.size());
    for (const auto & [host, type] : tld_list_tmp)
        tld_list.insert(host, type);

    /// Remember the size before the list is moved into the map.
    size_t tld_list_size = tld_list.size();

    std::lock_guard lock(tld_lists_map_mutex);
    tld_lists_map.insert(std::make_pair(name, std::move(tld_list)));
    return tld_list_size;
}
/// Returns the list registered under `name`.
/// The lookup is performed while holding tld_lists_map_mutex.
/// Throws TLD_LIST_NOT_FOUND if there is no such list.
const TLDList & TLDListsHolder::getTldList(const std::string & name)
{
    std::lock_guard guard(tld_lists_map_mutex);

    if (const auto found = tld_lists_map.find(name); found != tld_lists_map.end())
        return found->second;

    throw Exception(ErrorCodes::TLD_LIST_NOT_FOUND, "TLD list {} does not exist", name);
}
}
|