aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Common/TLDListsHolder.cpp
blob: 623b88f83a548c607229bddda30cd386a37d3dfc (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include <Common/TLDListsHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/logger_useful.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <string_view>
#include <unordered_set>

namespace DB
{

/// Error codes used by the exceptions thrown in this file.
/// They are only declared here (extern); their definitions are not part of this file.
namespace ErrorCodes
{
    extern const int TLD_LIST_NOT_FOUND;
    extern const int LOGICAL_ERROR;
}

/// Size and alignment, in bytes, of the dummy allocation that the TLDList constructor
/// makes in its arena so that keys stored afterwards have padding on their left side.
constexpr size_t StringHashTablePadRequirement = 8;

/// TLDList
/// Creates a list whose hash table is constructed with `size` and whose keys
/// are going to be stored in a freshly created Arena.
TLDList::TLDList(size_t size)
    : tld_container(size)
    , memory_pool(std::make_unique<Arena>())
{
    /// Keys handed to StringHashTable have to be padded to 8 bytes.
    /// On the right side this is already satisfied by Arena (memory_pool here),
    /// since it pads its memory with 15 bytes at the right.
    ///
    /// StringHashTable may, however, also reference byte -1 of a key,
    /// hence some padding is needed on the left side too. Reserve it once,
    /// before any key is placed into the pool:
    const size_t left_padding = StringHashTablePadRequirement;
    memory_pool->alignedAlloc(left_padding, left_padding);
}
void TLDList::insert(const String & host, TLDType type)
{
    StringRef owned_host{memory_pool->insert(host.data(), host.size()), host.size()};
    tld_container[owned_host] = type;
}
/// Returns the type stored for `host`, or TLDType::TLD_NONE if `host` is not in the list.
TLDType TLDList::lookup(StringRef host) const
{
    auto entry = tld_container.find(host);
    const bool known = (entry != nullptr);
    return known ? entry->getMapped() : TLDType::TLD_NONE;
}

/// TLDListsHolder
/// Gives access to the single holder object shared by all callers.
TLDListsHolder & TLDListsHolder::getInstance()
{
    /// Function-local static: every call returns a reference to this same object.
    static TLDListsHolder holder;
    return holder;
}
/// Nothing to set up explicitly; the constructor is defaulted here, in the .cpp file.
TLDListsHolder::TLDListsHolder() = default;

/// Loads every TLD list mentioned under the "top_level_domains_lists" section of `config`.
///
/// Each key of that section is used as the list name, and its value is appended to
/// `top_level_domains_path` (plain string concatenation, no separator is inserted)
/// to get the path of the file handed to parseAndAddTldList().
void TLDListsHolder::parseConfig(const std::string & top_level_domains_path, const Poco::Util::AbstractConfiguration & config)
{
    Poco::Util::AbstractConfiguration::Keys list_names;
    config.keys("top_level_domains_lists", list_names);

    Poco::Logger * logger = &Poco::Logger::get("TLDListsHolder");

    for (const auto & list_name : list_names)
    {
        const std::string file_name = config.getString("top_level_domains_lists." + list_name);
        const std::string file_path = top_level_domains_path + file_name;

        LOG_TRACE(logger, "{} loading from {}", list_name, file_path);
        const size_t hosts_count = parseAndAddTldList(list_name, file_path);
        LOG_INFO(logger, "{} was added ({} hosts)", list_name, hosts_count);
    }
}

/// Parses the TLD list file at `path` and registers the result under `name`.
///
/// The file is processed line by line:
///  - a line starting with "//" is a comment and is skipped;
///  - trailing whitespace is removed, and a line that is empty afterwards is skipped;
///  - "*.host" is stored as "host" with TLDType::TLD_ANY;
///  - "!host" is stored as "host" with TLDType::TLD_EXCLUDE;
///  - any other line is stored unchanged with TLDType::TLD_REGULAR.
/// If the same host occurs more than once, the first occurrence wins (emplace does not overwrite).
///
/// Returns the value reported by TLDList::size() for the freshly built list.
/// Throws LOGICAL_ERROR if the input buffer does not report EOF after the read loop.
size_t TLDListsHolder::parseAndAddTldList(const std::string & name, const std::string & path)
{
    /// The <cctype> classification functions have undefined behaviour for negative
    /// arguments other than EOF, and plain `char` may be signed. Bytes >= 0x80
    /// (e.g. UTF-8 encoded entries) therefore have to be converted to unsigned char
    /// before they are classified.
    const auto is_space = [](unsigned char c) { return ::isspace(c) != 0; };

    /// Collect everything into a temporary map first: it removes duplicated hosts
    /// and provides the element count the TLDList is constructed with.
    std::unordered_map<std::string, TLDType> tld_list_tmp;

    ReadBufferFromFile in(path);
    String buffer;
    while (!in.eof())
    {
        readEscapedStringUntilEOL(buffer, in);
        /// Advance by one byte unless the input is exhausted
        /// (presumably this steps over the line terminator -- confirm against readEscapedStringUntilEOL).
        if (!in.eof())
            ++in.position();

        std::string_view line(buffer);
        /// Skip comments
        if (line.starts_with("//"))
            continue;

        /// Strip trailing whitespace: keep everything up to the last non-space character.
        line = line.substr(0, line.rend() - std::find_if_not(line.rbegin(), line.rend(), is_space));
        /// Skip empty line
        if (line.empty())
            continue;

        /// Handle the special prefixes; the prefix itself is not a part of the stored host.
        if (line.starts_with("*."))
        {
            line = line.substr(2);
            tld_list_tmp.emplace(line, TLDType::TLD_ANY);
        }
        else if (line.starts_with('!'))
        {
            line = line.substr(1);
            tld_list_tmp.emplace(line, TLDType::TLD_EXCLUDE);
        }
        else
            tld_list_tmp.emplace(line, TLDType::TLD_REGULAR);
    }
    /// Defensive sanity check: the loop above is left only after in.eof() has been reported.
    if (!in.eof())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Not all list had been read: {}", name);

    TLDList tld_list(tld_list_tmp.size());
    for (const auto & [host, type] : tld_list_tmp)
    {
        tld_list.insert(host, type);
    }

    /// Remember the size before the list is moved into the map.
    size_t tld_list_size = tld_list.size();
    /// The mutex is taken only for the registration itself; all parsing above happens without it.
    std::lock_guard lock(tld_lists_map_mutex);
    tld_lists_map.insert(std::make_pair(name, std::move(tld_list)));
    return tld_list_size;
}

/// Returns the list registered under `name`.
/// Throws TLD_LIST_NOT_FOUND if there is no list with such name.
///
/// The mutex is held only while the map is searched; the returned reference
/// is used by the caller after the lock has already been released.
const TLDList & TLDListsHolder::getTldList(const std::string & name)
{
    std::lock_guard lock(tld_lists_map_mutex);

    if (auto found = tld_lists_map.find(name); found != tld_lists_map.end())
        return found->second;

    throw Exception(ErrorCodes::TLD_LIST_NOT_FOUND, "TLD list {} does not exist", name);
}

}