aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/string_utils/url/url.cpp
blob: 44e6cb671a75810e90fdb1c9861df9b0f4a387a3 (plain) (tree)
1
2
3
4
5
6
7
8
9
                




                                 
                                                
                                 
                                   
                                    
                                   
 
                  
           
                           
                                 




                        
                                      
                        
         






                                                                     
                                                               


                 
                                               

                                                                                                     
                                                                     
                     
                                                                                      
                     
     
 
                         
                                                                
                                                                                                                          
                                          


                   









                                                                                                   
                                                                      
                                                                           
 
                                                                         
                                                                              
 
                                                                           
                                                                                        
 
                                                                           
                                                                                           
 
                                                                           
                                               
 
                                                                           
                                               
 
                                                           






                                             
                                                     
                                                                        
                 
     
                               
 
                                                           

                                              
                                                           

                                              
                        
                                                                   

                                             
 






                                                         
                                                                                                      
                                                     
                                                                                 

                       
 
                                                   

                                          
                                                          

                                         













                                                                             
                                                                                                     

                                                       
                                                                             




                                                                  
                                                                    
                                                              
                                                                                                 











                                                             
                                                                                      

                                                               

                                                                                
                                                               
                   
 










                                                                                                                               






                                                                                                      
                                                                                                




                                            
                                               
                       
                                                     









                                                                                                   
                                                       
                                         
 
                                                                              
                                                             
                                               
                   
                                                  
 
                                                                     
                                                      
                                                         
                           
                    
                      
             

                            
                                     
 
                                                                          

                                        
                                 



                                
                                                    
                                    
 
                                                        
                                                                            


                             


















                                                                                     





                                                                               
                                                  
                                                
 
                                                      
                              
                                                         
                     
                                       
 
                                             
                                                    
 
                                                                


                         
                                                         
 

                                                             
                                               


                                  
        
                                       
                 
                                            

                                                     
                                                  



                                          
                                                 

                
                                                                                
                                           
                                             

                                                     



                          
                                                                                               

                                                     

                                                                    
                                  
                      



                 
 
                                                      



                            























                                                                                           
#include "url.h"

#include <util/string/cast.h>
#include <util/string/util.h>
#include <util/string/cstriter.h>
#include <util/string/ascii.h>
#include <util/string/strip.h>

#include <util/charset/unidata.h> // for ToLower
#include <util/system/defaults.h>
#include <util/generic/algorithm.h>
#include <util/generic/hash_set.h>
#include <util/generic/yexception.h>
#include <util/generic/singleton.h>

#include <cstdlib>

namespace {
    struct TUncheckedSize {
        static bool Has(size_t) {
            return true;
        }
    };

    struct TKnownSize {
        size_t MySize;
        explicit TKnownSize(size_t sz)
            : MySize(sz)
        {
        }
        bool Has(size_t sz) const {
            return sz <= MySize;
        }
    };

    template <typename TChar1, typename TChar2>
    int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            if ((TChar1)ToLower(s1[i]) != s2[i])
                return (TChar1)ToLower(s1[i]) < s2[i] ? -1 : 1;
        }
        return 0;
    }

    template <typename TChar, typename TBounds>
    inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) {
        const TChar httpPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0};
        const TChar httpsPrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0};
        if (urlSize.Has(7) && Compare1Case2(url, httpPrefix, 7) == 0)
            return 7;
        if (!ignorehttps && urlSize.Has(8) && Compare1Case2(url, httpsPrefix, 8) == 0)
            return 8;
        return 0;
    }

    template <typename T>
    inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) {
        size_t prefixSize = GetHttpPrefixSizeImpl<typename T::char_type>(url.data(), TKnownSize(url.size()), ignorehttps);
        if (prefixSize)
            return url.substr(prefixSize);
        return url;
    }
}

namespace NUrl {

    TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) {
        TStringBuf host = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false);
        TStringBuf path = url;
        path.SkipPrefix(host);
        return {host, path};
    }

} // namespace NUrl

size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept {
    return GetHttpPrefixSizeImpl<char>(url, TUncheckedSize(), ignorehttps);
}

size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept {
    return GetHttpPrefixSizeImpl<wchar16>(url, TUncheckedSize(), ignorehttps);
}

size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept {
    return GetHttpPrefixSizeImpl<char>(url.data(), TKnownSize(url.size()), ignorehttps);
}

size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept {
    return GetHttpPrefixSizeImpl<wchar16>(url.data(), TKnownSize(url.size()), ignorehttps);
}

TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept {
    return CutHttpPrefixImpl(url, ignorehttps);
}

TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept {
    return CutHttpPrefixImpl(url, ignorehttps);
}

size_t GetSchemePrefixSize(const TStringBuf url) noexcept {
    struct TDelim: public str_spn {
        inline TDelim()
            : str_spn("!-/:-@[-`{|}", true)
        {
        }
    };

    const auto& delim = *Singleton<TDelim>();
    const char* n = delim.brk(url.data(), url.end());

    if (n + 2 >= url.end() || *n != ':' || n[1] != '/' || n[2] != '/') {
        return 0;
    }

    return n + 3 - url.begin();
}

TStringBuf GetSchemePrefix(const TStringBuf url) noexcept {
    return url.Head(GetSchemePrefixSize(url));
}

TStringBuf CutSchemePrefix(const TStringBuf url) noexcept {
    return url.Tail(GetSchemePrefixSize(url));
}

template <bool KeepPort>
static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) {
    TStringBuf urlNoScheme = url;

    urlNoScheme.Skip(GetHttpPrefixSize(url));

    struct TDelim: public str_spn {
        inline TDelim()
            : str_spn(KeepPort ? "/;?#" : "/:;?#")
        {
        }
    };

    const auto& nonHostCharacters = *Singleton<TDelim>();
    const char* firstNonHostCharacter = nonHostCharacters.brk(urlNoScheme.begin(), urlNoScheme.end());

    if (firstNonHostCharacter != urlNoScheme.end()) {
        return urlNoScheme.substr(0, firstNonHostCharacter - urlNoScheme.data());
    }

    return urlNoScheme;
}

TStringBuf GetHost(const TStringBuf url) noexcept {
    return GetHostAndPortImpl<false>(url);
}

TStringBuf GetHostAndPort(const TStringBuf url) noexcept {
    return GetHostAndPortImpl<true>(url);
}

TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp) noexcept {
    const size_t schemeSize = GetSchemePrefixSize(url);
    const TStringBuf scheme = url.Head(schemeSize);

    const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));

    const TStringBuf host = GetHost(url.Tail(schemeSize));

    if (isHttp && trimHttp) {
        return host;
    } else {
        return TStringBuf(scheme.begin(), host.end());
    }
}

TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept {
    const size_t schemeSize = GetSchemePrefixSize(url);
    const TStringBuf scheme = url.Head(schemeSize);

    const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));

    TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));

    if (trimDefaultPort) {
        const size_t pos = hostAndPort.find(':');
        if (pos != TStringBuf::npos) {
            const bool isHttps = (scheme == TStringBuf("https://"));

            const TStringBuf port = hostAndPort.Tail(pos + 1);
            if ((isHttp && port == TStringBuf("80")) || (isHttps && port == TStringBuf("443"))) {
                // trimming default port
                hostAndPort = hostAndPort.Head(pos);
            }
        }
    }

    if (isHttp && trimHttp) {
        return hostAndPort;
    } else {
        return TStringBuf(scheme.begin(), hostAndPort.end());
    }
}

void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) {
    auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
    host = hostBuf;
    path = pathBuf;
}

void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) {
    auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
    host = hostBuf;
    path = pathBuf;
}

void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) {
    TStringBuf urlWithoutFragment;
    if (!url.TrySplit('#', urlWithoutFragment, fragment)) {
        fragment = "";
        urlWithoutFragment = url;
    }
    if (!urlWithoutFragment.TrySplit('?', sanitizedUrl, query)) {
        query = "";
        sanitizedUrl = urlWithoutFragment;
    }
}

bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
    const size_t schemeSize = GetSchemePrefixSize(url);
    if (schemeSize != 0) {
        scheme = url.Head(schemeSize);
    }

    TStringBuf portStr;
    TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
    if (hostAndPort && hostAndPort.back() != ']' && hostAndPort.TryRSplit(':', host, portStr)) {
        // URL has port
        if (!TryFromString(portStr, port)) {
            return false;
        }
    } else {
        host = hostAndPort;
        if (scheme == TStringBuf("https://")) {
            port = 443;
        } else if (scheme == TStringBuf("http://")) {
            port = 80;
        }
    }
    return true;
}

void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
    bool isOk = TryGetSchemeHostAndPort(url, scheme, host, port);
    Y_ENSURE(isOk, "cannot parse port number from URL: " << url);
}

TStringBuf GetOnlyHost(const TStringBuf url) noexcept {
    return GetHost(CutSchemePrefix(url));
}

TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept {
    const size_t off = url.find('/', GetHttpPrefixSize(url));
    TStringBuf hostUnused, path;
    if (!url.TrySplitAt(off, hostUnused, path))
        return "/";

    return trimFragment ? path.Before('#') : path;
}

// this strange creature returns 2nd level domain, possibly with port
TStringBuf GetDomain(const TStringBuf host) noexcept {
    const char* c = !host ? host.data() : host.end() - 1;
    for (bool wasPoint = false; c != host.data(); --c) {
        if (*c == '.') {
            if (wasPoint) {
                ++c;
                break;
            }
            wasPoint = true;
        }
    }
    return TStringBuf(c, host.end());
}

TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept {
    size_t pos = host.size();
    for (size_t i = 0; i < level; ++i) {
        pos = host.rfind('.', pos);
        if (pos == TString::npos)
            return host;
    }
    return host.SubStr(pos + 1);
}

TStringBuf GetZone(const TStringBuf host) noexcept {
    return GetParentDomain(host, 1);
}

TStringBuf CutWWWPrefix(const TStringBuf url) noexcept {
    if (url.size() >= 4 && url[3] == '.' && !strnicmp(url.data(), "www", 3))
        return url.substr(4);
    return url;
}

TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept {
    auto it = url.begin();

    StripRangeBegin(it, url.end(), [](auto& it){ return *it == 'w' || *it == 'W'; });
    if (it == url.begin()) {
        return url;
    }

    StripRangeBegin(it, url.end(), [](auto& it){ return IsAsciiDigit(*it); });
    if (it == url.end()) {
        return url;
    }

    if (*it++ == '.') {
        return url.Tail(it - url.begin());
    }

    return url;
}

TStringBuf CutMPrefix(const TStringBuf url) noexcept {
    if (url.size() >= 2 && url[1] == '.' && (url[0] == 'm' || url[0] == 'M')) {
        return url.substr(2);
    }
    return url;
}

static inline bool IsSchemeChar(char c) noexcept {
    return IsAsciiAlnum(c); //what about '+' ?..
}

static bool HasPrefix(const TStringBuf url) noexcept {
    TStringBuf scheme, unused;
    if (!url.TrySplit(TStringBuf("://"), scheme, unused))
        return false;

    return AllOf(scheme, IsSchemeChar);
}

TString AddSchemePrefix(const TString& url) {
    return AddSchemePrefix(url, TStringBuf("http"));
}

TString AddSchemePrefix(const TString& url, TStringBuf scheme) {
    if (HasPrefix(url)) {
        return url;
    }

    return TString::Join(scheme, TStringBuf("://"), url);
}

#define X(c) (c >= 'A' ? ((c & 0xdf) - 'A') + 10 : (c - '0'))

static inline int x2c(unsigned char* x) {
    if (!IsAsciiHex(x[0]) || !IsAsciiHex(x[1]))
        return -1;
    return X(x[0]) * 16 + X(x[1]);
}

#undef X

static inline int Unescape(char* str) {
    char *to, *from;
    int dlen = 0;
    if ((str = strchr(str, '%')) == nullptr)
        return dlen;
    for (to = str, from = str; *from; from++, to++) {
        if ((*to = *from) == '%') {
            int c = x2c((unsigned char*)from + 1);
            *to = char((c > 0) ? c : '0');
            from += 2;
            dlen += 2;
        }
    }
    *to = 0; /* terminate it at the new length */
    return dlen;
}

size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) {
    if (source.empty() || source[0] == '?')
        return strlcpy(dest, "/", dest_size);
    size_t len = Min(dest_size - 1, source.length());
    memcpy(dest, source.data(), len);
    dest[len] = 0;
    len -= Unescape(dest);
    strlwr(dest);
    return len;
}

size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) {
    size_t len = Min(dest_size - 1, source.length());
    memcpy(dest, source.data(), len);
    dest[len] = 0;
    char buf[8] = ":";
    size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2);
    buf[buflen] = '\0';
    char* ptr = strstr(dest, buf);
    if (ptr && ptr[buflen] == 0) {
        len -= buflen;
        *ptr = 0;
    }
    strlwr(dest);
    return len;
}

TStringBuf RemoveFinalSlash(TStringBuf str) noexcept {
    if (str.EndsWith('/')) {
        str.Chop(1);
    }
    return str;
}

TStringBuf CutUrlPrefixes(TStringBuf url) noexcept {
    url = CutSchemePrefix(url);
    url = CutWWWPrefix(url);
    return url;
}

bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept {
    url = CutSchemePrefix(url);
    const TStringBuf noHostSuffix = url.After('/');
    if (noHostSuffix == url) {
        // no slash => no suffix with token info
        return false;
    }
    const bool suffixHasPrefix = noHostSuffix.StartsWith(token);
    if (!suffixHasPrefix) {
        return false;
    }
    const bool slashAfterPrefix = noHostSuffix.find("/", token.length()) == token.length();
    const bool qMarkAfterPrefix = noHostSuffix.find("?", token.length()) == token.length();
    const bool nothingAfterPrefix = noHostSuffix.length() <= token.length();
    const bool prefixIsToken = slashAfterPrefix || qMarkAfterPrefix || nothingAfterPrefix;
    return prefixIsToken;
}