aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Functions/URL/topLevelDomain.cpp
blob: 25e9f383f60ceff3858d0b56bbf8a08c909140dd (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include "domain.h"

namespace DB
{

template<bool conform_rfc>
struct ExtractTopLevelDomain
{
    static size_t getReserveLengthForElement() { return 5; }

    static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
    {
        std::string_view host;
        if constexpr (conform_rfc)
            host = getURLHostRFC(data, size);
        else
            host = getURLHost(data, size);

        res_data = data;
        res_size = 0;

        if (!host.empty())
        {
            if (host[host.size() - 1] == '.')
                host.remove_suffix(1);

            const auto * host_end = host.data() + host.size();

            Pos last_dot = find_last_symbols_or_null<'.'>(host.data(), host_end);
            if (!last_dot)
                return;

            /// For IPv4 addresses select nothing.
            ///
            /// NOTE: it is safe to access last_dot[1]
            /// since getURLHost() will not return a host if there is symbol after dot.
            if (isNumericASCII(last_dot[1]))
                return;

            res_data = last_dot + 1;
            res_size = host_end - res_data;
        }
    }
};

struct NameTopLevelDomain { static constexpr auto name = "topLevelDomain"; };
using FunctionTopLevelDomain = FunctionStringToString<ExtractSubstringImpl<ExtractTopLevelDomain<false>>, NameTopLevelDomain>;

struct NameTopLevelDomainRFC { static constexpr auto name = "topLevelDomainRFC"; };
using FunctionTopLevelDomainRFC = FunctionStringToString<ExtractSubstringImpl<ExtractTopLevelDomain<true>>, NameTopLevelDomainRFC>;

/// Registers topLevelDomain and topLevelDomainRFC in the function factory,
/// together with their user-facing documentation.
REGISTER_FUNCTION(TopLevelDomain)
{
    /// Variant built on ExtractTopLevelDomain<false>.
    factory.registerFunction<FunctionTopLevelDomain>(FunctionDocumentation
    {
        .description=R"(
Extracts the top-level domain from a URL.

Returns an empty string if the argument cannot be parsed as a URL or does not contain a top-level domain.
        )",
        .examples{{"topLevelDomain", "SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk')", ""}},
        .categories{"URL"}
    });

    /// Variant built on ExtractTopLevelDomain<true>.
    factory.registerFunction<FunctionTopLevelDomainRFC>(FunctionDocumentation
    {
        .description=R"(Similar to topLevelDomain, but conforms to RFC 3986.)",
        .examples{},
        .categories{"URL"}
    });
}

}