aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/string_utils/url/url.cpp
blob: 3658cfbabde9676533a55e6d325a4df7158dfd77 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
#include "url.h"

#include <util/string/cast.h>
#include <util/string/util.h>
#include <util/string/cstriter.h>
#include <util/string/ascii.h>
#include <util/string/strip.h>

#include <util/charset/unidata.h> // for ToLower
#include <util/system/defaults.h>
#include <util/generic/algorithm.h>
#include <util/generic/hash_set.h>
#include <util/generic/yexception.h>
#include <util/generic/singleton.h>

#include <cstdlib>

namespace {
    // Bounds policy that answers "yes" to every length query.
    // Used where the caller cannot state how many characters are available.
    struct TUncheckedSize {
        static bool Has(size_t) {
            return true;
        }
    };

    // Bounds policy for input of a known length: a query succeeds only when the
    // requested character count does not exceed that length.
    struct TKnownSize {
        size_t Available;

        explicit TKnownSize(size_t available)
            : Available(available)
        {
        }

        bool Has(size_t wanted) const {
            return wanted <= Available;
        }
    };

    // Compares the first n characters of s1 and s2, passing every character of s1
    // (and only s1) through ToLower first.
    // Returns 0 when all n positions match, otherwise -1 or 1 depending on how the
    // lowered s1 character orders against the s2 character at the first mismatch.
    template <typename TChar1, typename TChar2>
    int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) {
        for (size_t idx = 0; idx != n; ++idx) {
            const TChar1 lowered = static_cast<TChar1>(ToLower(s1[idx]));
            if (lowered == s2[idx]) {
                continue;
            }
            return lowered < s2[idx] ? -1 : 1;
        }
        return 0;
    }

    // Returns the length of a leading "http://" (7) or, unless ignorehttps is set,
    // "https://" (8) in url; 0 when neither is present.
    // The match ignores the case of url. urlSize decides whether enough characters
    // are available before each comparison is attempted.
    template <typename TChar, typename TBounds>
    inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) {
        const size_t plainLen = 7;
        const size_t secureLen = 8;
        const TChar plainPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0};
        const TChar securePrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0};

        const bool isPlain = urlSize.Has(plainLen) && Compare1Case2(url, plainPrefix, plainLen) == 0;
        if (isPlain) {
            return plainLen;
        }
        if (ignorehttps) {
            return 0;
        }

        const bool isSecure = urlSize.Has(secureLen) && Compare1Case2(url, securePrefix, secureLen) == 0;
        return isSecure ? secureLen : 0;
    }

    // Returns url without its leading http(s) prefix, or url unchanged when
    // GetHttpPrefixSizeImpl reports no prefix.
    template <typename T>
    inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) {
        const size_t cut = GetHttpPrefixSizeImpl<typename T::char_type>(url.data(), TKnownSize(url.size()), ignorehttps);
        if (cut == 0) {
            return url;
        }
        return url.substr(cut);
    }
}

namespace NUrl {

    // Splits url into two views: the part returned by GetSchemeHostAndPort() with
    // both trimming options switched off, and url with that part skipped from its
    // front (via SkipPrefix).
    TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) {
        const TStringBuf hostPart = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false);

        TStringBuf remainder = url;
        remainder.SkipPrefix(hostPart);

        return {hostPart, remainder};
    }

} // namespace NUrl

size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept {
    return GetHttpPrefixSizeImpl<char>(url, TUncheckedSize(), ignorehttps);
}

size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept {
    return GetHttpPrefixSizeImpl<wchar16>(url, TUncheckedSize(), ignorehttps);
}

size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept {
    return GetHttpPrefixSizeImpl<char>(url.data(), TKnownSize(url.size()), ignorehttps);
}

size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept {
    return GetHttpPrefixSizeImpl<wchar16>(url.data(), TKnownSize(url.size()), ignorehttps);
}

// Returns url with a leading "http://" (or "https://" unless ignorehttps is set)
// removed; url is returned as is when no such prefix is found.
TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept {
    const TStringBuf stripped = CutHttpPrefixImpl(url, ignorehttps);
    return stripped;
}

// Wide-character counterpart of the overload above.
TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept {
    const TWtringBuf stripped = CutHttpPrefixImpl(url, ignorehttps);
    return stripped;
}

// Returns the length of a leading "<scheme>://" in url, or 0 when url does not
// start with one.
//
// The scan stops at the first character found by the delimiter table below
// (built from the spec "!-/:-@[-`{|}"; the `true` flag presumably enables the
// `a-b` range syntax — confirm against str_spn). A scheme prefix is recognised
// only when that first delimiter is the ':' of a "://" sequence.
size_t GetSchemePrefixSize(const TStringBuf url) noexcept {
    struct TDelim: public str_spn {
        inline TDelim()
            : str_spn("!-/:-@[-`{|}", true)
        {
        }
    };

    const auto& delim = *Singleton<TDelim>();
    // brk() is expected to return a pointer within [data(), end()], with end()
    // meaning "no delimiter found" — NOTE(review): confirm against str_spn.
    const char* n = delim.brk(url.data(), url.end());

    // At least three characters ("://") must remain starting at n. Compare
    // distances instead of forming `n + 2`: when n is at (or right before) end(),
    // that pointer would lie beyond one-past-the-end, and for an empty buffer with
    // a null data() it would be arithmetic on a null pointer.
    if (url.end() - n < 3 || *n != ':' || n[1] != '/' || n[2] != '/') {
        return 0;
    }

    return n + 3 - url.begin();
}

// Returns the leading part of url whose length is reported by
// GetSchemePrefixSize() (empty when that size is 0).
TStringBuf GetSchemePrefix(const TStringBuf url) noexcept {
    const size_t prefixLen = GetSchemePrefixSize(url);
    return url.Head(prefixLen);
}

// Returns url with the scheme prefix measured by GetSchemePrefixSize() dropped
// from its front.
TStringBuf CutSchemePrefix(const TStringBuf url) noexcept {
    const size_t prefixLen = GetSchemePrefixSize(url);
    return url.Tail(prefixLen);
}

// Extracts the host part of url after skipping a leading http(s) prefix.
// The result ends at the first of '/', ';', '?', '#'; when KeepPort is false,
// ':' also ends it, so the port is left out. If none of these characters
// occurs, everything after the prefix is returned.
template <bool KeepPort>
static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) {
    struct TDelim: public str_spn {
        inline TDelim()
            : str_spn(KeepPort ? "/;?#" : "/:;?#")
        {
        }
    };

    TStringBuf rest = url;
    rest.Skip(GetHttpPrefixSize(url));

    const auto& stopChars = *Singleton<TDelim>();
    const char* const stop = stopChars.brk(rest.begin(), rest.end());

    if (stop == rest.end()) {
        // No terminating character: the whole remainder is the host.
        return rest;
    }

    return rest.substr(0, stop - rest.data());
}

// Host of url without the port: the scan also stops at ':'.
TStringBuf GetHost(const TStringBuf url) noexcept {
    constexpr bool keepPort = false;
    return GetHostAndPortImpl<keepPort>(url);
}

// Host of url including a ":port" suffix when one is present.
TStringBuf GetHostAndPort(const TStringBuf url) noexcept {
    constexpr bool keepPort = true;
    return GetHostAndPortImpl<keepPort>(url);
}

// Returns the "scheme://host[:port]" part of url as a view into url.
//
//  trimHttp        - when the URL is treated as HTTP (no scheme at all, or exactly
//                    "http://"), return only host[:port] without the scheme.
//  trimDefaultPort - drop ":80" for HTTP URLs and ":443" for "https://" URLs.
//
// The port separator is the LAST ':' of the host-and-port part, and it is only
// honoured when it is the sole colon or directly follows ']' (a bracketed IP
// literal such as "[::1]:80"). Looking at the first ':' instead would land inside
// the IPv6 address, so the default port of such hosts would never be trimmed.
// Inputs with several colons and no closing bracket before the last one are left
// untouched.
TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept {
    const size_t schemeSize = GetSchemePrefixSize(url);
    const TStringBuf scheme = url.Head(schemeSize);

    const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));

    TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));

    if (trimDefaultPort) {
        const size_t pos = hostAndPort.rfind(':');
        if (pos != TStringBuf::npos) {
            const bool onlyColon = hostAndPort.find(':') == pos;
            const bool afterIpLiteral = pos > 0 && hostAndPort[pos - 1] == ']';

            if (onlyColon || afterIpLiteral) {
                const bool isHttps = (scheme == TStringBuf("https://"));

                const TStringBuf port = hostAndPort.Tail(pos + 1);
                if ((isHttp && port == TStringBuf("80")) || (isHttps && port == TStringBuf("443"))) {
                    // trimming default port
                    hostAndPort = hostAndPort.Head(pos);
                }
            }
        }
    }

    if (isHttp && trimHttp) {
        return hostAndPort;
    } else {
        // scheme and hostAndPort are adjacent views into url, so one view
        // spanning both can be built from their outer bounds.
        return TStringBuf(scheme.begin(), hostAndPort.end());
    }
}

// Out-parameter wrapper around NUrl::SplitUrlToHostAndPath(): stores the two
// resulting views into host and path.
void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) {
    auto [hostPart, pathPart] = NUrl::SplitUrlToHostAndPath(url);
    host = hostPart;
    path = pathPart;
}

// Same as above, but assigns the two parts to TString outputs.
void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) {
    auto [hostPart, pathPart] = NUrl::SplitUrlToHostAndPath(url);
    host = hostPart;
    path = pathPart;
}

void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) {
    TStringBuf urlWithoutFragment;
    if (!url.TrySplit('#', urlWithoutFragment, fragment)) {
        fragment = "";
        urlWithoutFragment = url;
    }
    if (!urlWithoutFragment.TrySplit('?', sanitizedUrl, query)) {
        query = "";
        sanitizedUrl = urlWithoutFragment;
    }
}

bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
    const size_t schemeSize = GetSchemePrefixSize(url);
    if (schemeSize != 0) {
        scheme = url.Head(schemeSize);
    }

    TStringBuf portStr;
    TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
    if (hostAndPort && hostAndPort.back() != ']' && hostAndPort.TryRSplit(':', host, portStr)) {
        // URL has port
        if (!TryFromString(portStr, port)) {
            return false;
        }
    } else {
        host = hostAndPort;
        if (scheme == TStringBuf("https://")) {
            port = 443;
        } else if (scheme == TStringBuf("http://")) {
            port = 80;
        }
    }
    return true;
}

// Throwing counterpart of TryGetSchemeHostAndPort(): the Y_ENSURE check fails
// with the message below when the port in url cannot be parsed.
void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
    Y_ENSURE(TryGetSchemeHostAndPort(url, scheme, host, port), "cannot parse port number from URL: " << url);
}

// Host of url with any "<scheme>://" prefix removed first (GetHost() on its own
// only skips http/https prefixes).
TStringBuf GetOnlyHost(const TStringBuf url) noexcept {
    const TStringBuf withoutScheme = CutSchemePrefix(url);
    return GetHost(withoutScheme);
}

// Returns the part of url that starts at the first '/' found after an optional
// http(s) prefix. When the split at that position fails (no such '/'), "/" is
// returned. With trimFragment set, only the part before '#' is kept.
TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept {
    const size_t slashPos = url.find('/', GetHttpPrefixSize(url));

    TStringBuf beforePath;
    TStringBuf pathAndQuery;
    if (!url.TrySplitAt(slashPos, beforePath, pathAndQuery)) {
        return "/";
    }

    if (trimFragment) {
        return pathAndQuery.Before('#');
    }
    return pathAndQuery;
}

// Returns the tail of host that follows its second '.' counted from the end,
// i.e. the second-level domain (a port, if present in host, stays attached).
// The whole host is returned when fewer than two dots are met; the very first
// character of host is never inspected by the scan.
TStringBuf GetDomain(const TStringBuf host) noexcept {
    const char* const first = host.data();
    const char* cur = host.empty() ? first : host.end() - 1;

    bool seenDot = false;
    while (cur != first) {
        if (*cur == '.') {
            if (seenDot) {
                // Second dot from the end: the domain starts right after it.
                ++cur;
                break;
            }
            seenDot = true;
        }
        --cur;
    }

    return TStringBuf(cur, host.end());
}

// Walks `level` dots backwards through host using rfind() and returns what
// follows the last dot reached. If host runs out of dots before `level` of them
// are found, host is returned unchanged.
TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept {
    size_t dotPos = host.size();

    for (size_t remaining = level; remaining > 0; --remaining) {
        dotPos = host.rfind('.', dotPos);
        if (dotPos == TString::npos) {
            return host;
        }
    }

    return host.SubStr(dotPos + 1);
}

// Zone of host: the level-1 parent domain as computed by GetParentDomain().
TStringBuf GetZone(const TStringBuf host) noexcept {
    constexpr size_t zoneLevel = 1;
    return GetParentDomain(host, zoneLevel);
}

// Removes a leading "www." (compared with strnicmp, so letter case is ignored)
// from url; url is returned unchanged when it does not start that way.
TStringBuf CutWWWPrefix(const TStringBuf url) noexcept {
    if (url.size() < 4) {
        return url;
    }
    if (url[3] != '.') {
        return url;
    }
    if (strnicmp(url.data(), "www", 3) != 0) {
        return url;
    }
    return url.substr(4);
}

// Removes a prefix made of one or more 'w'/'W' characters, then any number of
// ASCII digits, then a '.'. The url is returned unchanged when it does not start
// with 'w'/'W', when nothing follows the w's and digits, or when the next
// character is not '.'.
TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept {
    const auto isW = [](auto& pos) { return *pos == 'w' || *pos == 'W'; };
    const auto isDigit = [](auto& pos) { return IsAsciiDigit(*pos); };

    auto cursor = url.begin();

    StripRangeBegin(cursor, url.end(), isW);
    if (cursor == url.begin()) {
        // No leading w at all.
        return url;
    }

    StripRangeBegin(cursor, url.end(), isDigit);
    if (cursor == url.end()) {
        return url;
    }

    if (*cursor != '.') {
        return url;
    }
    ++cursor;
    return url.Tail(cursor - url.begin());
}

// Removes a leading "m." or "M." from url; any other input is returned as is.
TStringBuf CutMPrefix(const TStringBuf url) noexcept {
    if (url.size() < 2) {
        return url;
    }

    const char first = url[0];
    const bool startsWithM = (first == 'm' || first == 'M');
    if (!startsWithM || url[1] != '.') {
        return url;
    }

    return url.substr(2);
}

// A character is accepted as part of a scheme when it is an ASCII letter or digit.
static inline bool IsSchemeChar(char c) noexcept {
    const bool isAlnum = IsAsciiAlnum(c); //what about '+' ?..
    return isAlnum;
}

static bool HasPrefix(const TStringBuf url) noexcept {
    TStringBuf scheme, unused;
    if (!url.TrySplit(TStringBuf("://"), scheme, unused))
        return false;

    return AllOf(scheme, IsSchemeChar);
}

// Prepends "http://" to url unless it already carries a scheme prefix.
TString AddSchemePrefix(const TString& url) {
    const TStringBuf defaultScheme("http");
    return AddSchemePrefix(url, defaultScheme);
}

// Prepends "<scheme>://" to url unless HasPrefix() reports that url already
// carries a scheme prefix, in which case a copy of url is returned.
TString AddSchemePrefix(const TString& url, TStringBuf scheme) {
    const bool alreadyPrefixed = HasPrefix(url);
    if (!alreadyPrefixed) {
        return TString::Join(scheme, TStringBuf("://"), url);
    }
    return url;
}

// Value (0..15) of one ASCII hex digit, or -1 when c is not a hex digit.
static inline int UrlHexDigitValue(unsigned char c) {
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    return -1;
}

// Decodes the two hex digits at x[0], x[1] into a byte value (0..255).
// Returns -1 when either character is not a hex digit; x[1] is not read when
// x[0] is already invalid.
static inline int x2c(unsigned char* x) {
    const int hi = UrlHexDigitValue(x[0]);
    if (hi < 0)
        return -1;
    const int lo = UrlHexDigitValue(x[1]);
    if (lo < 0)
        return -1;
    return hi * 16 + lo;
}

// Percent-decodes the NUL-terminated string str in place and returns the number
// of characters by which it became shorter.
//
// Every "%XY" with two hex digits is replaced by the decoded byte. An escape
// that is malformed, or that decodes to zero, is replaced by the character '0'
// (so no NUL is ever embedded). A '%' followed by fewer than two characters is
// also replaced by '0', and only the characters that actually exist are
// consumed: the scan never steps over the terminating NUL.
static inline int Unescape(char* str) {
    int dlen = 0;
    if ((str = strchr(str, '%')) == nullptr)
        return dlen;

    char* to = str;
    char* from = str;
    while (*from) {
        if (*from != '%') {
            *to++ = *from++;
            continue;
        }

        // How many characters (0, 1 or 2) follow the '%' before the terminator.
        int avail = 0;
        if (from[1] != 0)
            avail = (from[2] != 0) ? 2 : 1;

        const int c = (avail == 2) ? x2c(reinterpret_cast<unsigned char*>(from + 1)) : -1;
        *to++ = char((c > 0) ? c : '0');
        from += 1 + avail;
        dlen += avail;
    }
    *to = 0; /* terminate it at the new length */
    return dlen;
}

// Writes a normalized copy of source into dest (a buffer of dest_size bytes).
//
// An empty source, or one starting with '?', is normalized to "/" via strlcpy,
// whose result is returned. Otherwise at most dest_size - 1 bytes are copied,
// NUL-terminated, percent-decoded in place by Unescape() and lowercased with
// strlwr(); the return value is the copied length minus what Unescape() removed.
//
// With dest_size == 0 nothing can be stored, not even the terminator, so the
// function writes nothing and returns 0 (previously `dest_size - 1` wrapped
// around and the copy ran past the buffer).
size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) {
    if (source.empty() || source[0] == '?')
        return strlcpy(dest, "/", dest_size);
    if (dest_size == 0)
        return 0;
    size_t len = Min(dest_size - 1, source.length());
    memcpy(dest, source.data(), len);
    dest[len] = 0;
    len -= Unescape(dest);
    strlwr(dest);
    return len;
}

// Writes a normalized copy of the host name source into dest (a buffer of
// dest_size bytes): at most dest_size - 1 bytes are copied and NUL-terminated,
// a trailing ":<defport>" is removed, and the result is lowercased with strlwr().
// Returns the resulting length.
//
// The default port is recognised only as a suffix of the copied text. Comparing
// the suffix directly (instead of locating the first ":<defport>" occurrence)
// keeps hosts such as "[2001:db8::80:1]:80" working, where the same characters
// also appear earlier inside the address.
//
// With dest_size == 0 nothing can be stored, so the function writes nothing and
// returns 0 (previously `dest_size - 1` wrapped around and the copy ran past the
// buffer).
size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) {
    if (dest_size == 0) {
        return 0;
    }
    size_t len = Min(dest_size - 1, source.length());
    memcpy(dest, source.data(), len);
    dest[len] = 0;

    // buf = ":" followed by the decimal default port (ui16 needs at most 5 digits).
    char buf[8] = ":";
    size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2);
    buf[buflen] = '\0';

    // strlen() rather than len: the text ends at the first NUL even if source
    // contained one before position len.
    const size_t textLen = strlen(dest);
    if (textLen >= buflen && strcmp(dest + (textLen - buflen), buf) == 0) {
        len -= buflen;
        dest[textLen - buflen] = 0;
    }
    strlwr(dest);
    return len;
}

// Returns str without one trailing '/', or str unchanged when it does not end
// with '/'.
TStringBuf RemoveFinalSlash(TStringBuf str) noexcept {
    if (!str.EndsWith('/')) {
        return str;
    }
    str.Chop(1);
    return str;
}

// Strips the scheme prefix first and then a leading "www." from url.
TStringBuf CutUrlPrefixes(TStringBuf url) noexcept {
    const TStringBuf withoutScheme = CutSchemePrefix(url);
    return CutWWWPrefix(withoutScheme);
}

// Checks whether the part of url after the first '/' that follows the scheme
// prefix begins with token as a complete component: token must be followed by
// '/', by '?', or by nothing at all.
bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept {
    url = CutSchemePrefix(url);

    const TStringBuf afterHost = url.After('/');
    if (afterHost == url) {
        // no slash => no suffix with token info
        return false;
    }

    if (!afterHost.StartsWith(token)) {
        return false;
    }

    const size_t tokenLen = token.length();
    if (afterHost.length() <= tokenLen) {
        // The suffix is exactly the token.
        return true;
    }

    // Something follows the token: it has to be a path or query separator.
    return afterHost.find("/", tokenLen) == tokenLen || afterHost.find("?", tokenLen) == tokenLen;
}