aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/uri/assign.cpp
blob: ae9125c727450f281ebea8e3559f2bd4b4e0094a (plain) (tree)
1
2
3
4
5
6
7
8
9
10


                                     
 
                                       


                                
                                  
 
                







                                                                             
 

                                                                             
 
                                                                                   
 
                                
     
 
                                                                                                                         
 
                          
 
                                                       
 









                                                                                                              
 











                                                                   
                           
                         
 



                                        
 



                                               
 










                                                                   
 
                            
 
                       
     



                                                                                                              
 

                                                                                 
 




                                                                              
 
                       
 
                                                                                                                    
                        

                                                      
            
                       
     
                                                                                       
 

                                           
 
                                                                     
 
                                             
                                                     
                                              
 
                            
 
                          
 
                                        
 


                                                                     
 
                                              
 





                                                                              
 



                                                                                          
 
                                  





                                                                                 
         
 
                                 
 



                                                                    
         
                                                 
 
                                                                                    
 
                                                                                 
 




                                                            
 



                                                                           
                                                         




















                                                                                                 
 
                                                 
 
                      
 
                                                             
                                                     
 

                                                      
 
                                                     
 
                                                                  
 

                                                      
 

                          
 


                                                                                      
                                                        
                                                                                                                                                                 
                                                             




                                                       
 












                                                                                                        
             
 








                                                                              
 






                                                                                              
             
                        
         




                                                                       
                        
         
 
                                 
 



                                                                     
         
 
                                                        
 
                                                     
                            
                                                 
         
 
                            
 
                                                          
 


                                                
                                  
                                                
                                  








                                                               
     
                                                                                                                                           
 
                        
                               
 
                                                
 
                                              
 



                                                                                                                                         
                                                                                                          

                            
                                                 






                                                                            
                   
     
 
                                                                                                                               

                            
                                                  
 
                  
                   
     
 



                                                                                                                                             
 
                                 
 

                        
 
#include "uri.h"
#include "parse.h"

#include <contrib/libs/libidn/idna.h>

#include <library/cpp/charset/recyr.hh>
#include <util/charset/wide.h>
#include <util/memory/tempbuf.h>
#include <util/string/cast.h>
#include <util/system/yassert.h>
#include <util/system/sys_alloc.h>

namespace NUri {
    // Converts a zero-terminated string of 32-bit code points into its ASCII
    // form using libidn's idna_to_ascii_4z(). The resulting buffer is handed to
    // the returned TMallocPtr; a null pointer is returned when libidn does not
    // report IDNA_SUCCESS.
    TMallocPtr<char> TUri::IDNToAscii(const wchar32* idna) {
        // the pointer is reinterpreted as uint32_t* below, so the widths must agree
        static_assert(sizeof(*idna) == sizeof(ui32), "fixme");

        // XXX: punycode_encode is deliberately not called directly: it performs
        // neither proper stringprep nor splitting on dot-equivalent characters
        char* ascii = nullptr;
        const int rc = idna_to_ascii_4z(reinterpret_cast<const uint32_t*>(idna), &ascii, 0);
        if (rc != IDNA_SUCCESS)
            ascii = nullptr; // don't trust the out-parameter after a failure
        return ascii;
    }

    // Recodes `host` from charset `enc` into a temporary zero-terminated wchar32
    // string and forwards it to the wchar32 overload of IDNToAscii().
    TMallocPtr<char> TUri::IDNToAscii(const TStringBuf& host, ECharset enc) {
        // one wchar32 slot per input byte, plus one for the terminator
        const size_t slots = host.length() + 1;
        TTempBuf storage(slots * sizeof(wchar32));
        wchar32* wide = reinterpret_cast<wchar32*>(storage.Data());

        const size_t recodedLen = NDetail::NBaseOps::Recode(host, wide, enc).length();
        wide[recodedLen] = 0;

        return IDNToAscii(wide);
    }

    // Produces an ASCII representation of `host`, which may contain
    // percent-encoded and/or extended-ASCII characters.
    //
    //   host        - the host text to convert
    //   buf         - receives the memory that backs the returned string
    //   hasExtended - the caller already found extended-ASCII characters
    //   allowIDN    - whether such hosts may be converted to punycode
    //   enc         - charset of the extended characters in `host`
    //
    // Returns the converted host, or an empty TStringBuf if it cannot be
    // converted.
    TStringBuf TUri::HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc) {
        if (!allowIDN && hasExtended)
            return TStringBuf(); // definitely can't convert

        size_t capacity = 0;

        // charset-recode: RFC 3986, 3.2.2, requires percent-encoded non-ASCII
        // chars in reg-name to be UTF-8 so convert to UTF-8 prior to decoding
        const bool recoding = hasExtended && CODES_UTF8 != enc;
        if (recoding) {
            capacity = 4 * host.length();
            buf.Reset(static_cast<char*>(y_allocate(capacity)));

            size_t consumed, produced;
            if (RECODE_OK != Recode(enc, CODES_UTF8, host.data(), buf.Get(), host.length(), capacity, consumed, produced))
                return TStringBuf();

            host = TStringBuf(buf.Get(), produced);
        }

        // percent-decode; allocate only if the recoding step did not already
        if (0 == capacity) {
            capacity = host.length();
            buf.Reset(static_cast<char*>(y_allocate(capacity)));
        }

        // when recoding took place `host` lives inside `buf`; decoding only
        // ever shortens the text, so writing over it is OK
        TMemoryWriteBuffer out(buf.Get(), capacity);
        TEncoder decoder(out, FeatureDecodeANY | FeatureToLower);
        const long decodedFlags = decoder.ReEncode(host);
        hasExtended = 0 != (decodedFlags & FeatureEncodeExtendedASCII);

        // decoding may have revealed extended characters: check again
        if (!allowIDN && hasExtended)
            return TStringBuf();

        host = out.Str();

        // plain ASCII after decoding: no punycode needed
        if (!hasExtended)
            return host;

        TMallocPtr<char> encoded;
        try {
            encoded = IDNToAscii(host);
        } catch (const yexception& /* exc */) {
            // fall through with a null pointer and maybe retry below
        }

        if (!encoded) {
            // XXX: try user charset unless UTF8 or converted to it
            const bool canRetryInUserCharset = CODES_UTF8 != enc && !recoding;
            if (!canRetryInUserCharset)
                return TStringBuf();

            try {
                encoded = IDNToAscii(host, enc);
            } catch (const yexception& /* exc */) {
                return TStringBuf();
            }

            if (!encoded)
                return TStringBuf();
        }

        // hand the punycode buffer over to the caller and return a view of it
        buf = encoded;
        return TStringBuf(buf.Get());
    }

    // Scans `host` to learn which kinds of characters it contains and converts
    // it only when necessary. Returns `host` itself when it holds neither
    // extended-ASCII nor decodable characters, an empty TStringBuf when it holds
    // extended-ASCII characters but IDN is not allowed, and otherwise whatever
    // the five-argument overload returns.
    TStringBuf TUri::HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc) {
        // accumulate the feature flags of every character
        long seenFlags = 0;
        const char* const last = host.data() + host.length();
        for (const char* cur = host.data(); cur != last; ++cur)
            seenFlags |= TEncoder::GetFlags(*cur).FeatFlags;

        const bool extended = 0 != (seenFlags & FeatureEncodeExtendedASCII);

        // extended ascii is only of interest if IDN is allowed
        if (extended && !allowIDN)
            return TStringBuf();

        // nothing to decode and nothing to convert: use the input as is
        if (!extended && 0 == (seenFlags & FeatureDecodeANY))
            return host;

        return HostToAscii(host, buf, extended, allowIDN, enc);
    }

    // Writes `val` to `out`, re-encoding it for field `fld` when `flags` has
    // any of the FeaturesAllEncoder bits set and copying it verbatim otherwise.
    // Returns false (writing nothing) for an empty value, true otherwise.
    static inline bool AppendField(TMemoryWriteBuffer& out, TField::EField fld, const TStringBuf& val, long flags) {
        if (val.empty())
            return false;

        const bool wantsEncoder = 0 != (flags & TFeature::FeaturesAllEncoder);
        if (!wantsEncoder)
            out << val;
        else
            TUri::ReEncodeField(out, val, fld, flags);

        return true;
    }

    // Populates this TUri from the result of an already-run parser.
    //
    // The URI is cleared first. If the parser state is ParsedBadFormat or
    // compares greater, that state is returned right away; any other non-OK
    // state is returned once the scheme has been recorded. Otherwise every
    // section the parser found is copied into Buffer (re-encoded where its flags
    // ask for it), the host is converted to ASCII where needed, and "#!"
    // fragments / "_escaped_fragment_=" queries are translated into each other
    // when the corresponding features are enabled. Finally the default scheme,
    // missing fields, the port and (optionally) the host are checked.
    //
    //   parser    - supplies State, Scheme, Flags, Enc and the per-field sections
    //   defscheme - used when the URL has no scheme of its own; SchemeUnknown
    //               turns a missing scheme into ParsedBadScheme, SchemeEmpty
    //               leaves the scheme alone
    //
    // Returns ParsedOK on success; otherwise the parser's state or one of
    // ParsedBadHost, ParsedBadPath, ParsedBadFormat, ParsedBadScheme,
    // ParsedBadPort, or the value reported by CheckHost().
    TState::EParsed TUri::AssignImpl(const TParser& parser, TScheme::EKind defscheme) {
        Clear();

        TState::EParsed ret = parser.State;
        if (ParsedBadFormat <= ret)
            return ret;

        const TSection& scheme = parser.Get(FieldScheme);
        const TSchemeInfo& schemeInfo = SetSchemeImpl(parser.Scheme);

        // set the scheme always if available
        if (schemeInfo.Str.empty() && scheme.IsSet())
            FldSet(FieldScheme, scheme.Get());

        if (ParsedOK != ret)
            return ret;

        // number of bytes Buffer must provide for everything written below
        size_t buflen = 0;

        // special processing for fields

        // converting the host to IDN implies that IDN is allowed and that the
        // host gets checked
        const bool convertIDN = parser.Flags & FeatureConvertHostIDN;
        long flags = parser.Flags.Allow;
        if (convertIDN)
            flags |= FeatureAllowHostIDN | FeatureCheckHost;

        // process non-ASCII host for punycode

        TMallocPtr<char> hostptr;
        TStringBuf hostascii;       // empty: use host field; non-empty: ascii
        bool hostConverted = false; // hostascii is empty or the original
        const TSection& host = parser.Get(FieldHost);
        if (host.IsSet() && !FldIsSet(FieldHost)) {
            const bool allowIDN = (flags & FeatureAllowHostIDN);
            const TStringBuf hostbuf = host.Get();

            // if we know we have and allow extended-ASCII chars, no need to check further
            if (allowIDN && (host.GetFlagsAllPlaintext() & FeatureEncodeExtendedASCII))
                hostascii = HostToAscii(hostbuf, hostptr, true, true, parser.Enc);
            else
                hostascii = HostToAscii(hostbuf, hostptr, allowIDN, parser.Enc);

            if (hostascii.empty())
                ret = ParsedBadHost; // exists but cannot be converted
            else if (hostbuf.data() != hostascii.data()) {
                // a different data pointer means HostToAscii returned something
                // other than the input itself
                hostConverted = true;
                buflen += 1 + hostascii.length();
                if (convertIDN)
                    FldMarkSet(FieldHost); // so that we don't process host below
            }
        }

        // add unprocessed fields

        for (int idx = 0; idx < FieldUrlMAX; ++idx) {
            const EField fld = EField(idx);
            const TSection& section = parser.Get(fld);
            if (section.IsSet() && !FldIsSet(fld))
                buflen += 1 + section.EncodedLen(); // includes null
        }
        if (0 == buflen) // no more sections set?
            return ret;

        // process #! fragments
        // https://developers.google.com/webmasters/ajax-crawling/docs/specification

        static const TStringBuf escFragPrefix(TStringBuf("_escaped_fragment_="));

        // encHashBangFrag: a "#!..." fragment will be stored in the query as
        //   "_escaped_fragment_=...";
        // qryEscapedFragment (once initialized): the trailing
        //   "_escaped_fragment_=..." query parameter will be stored in the
        //   fragment as "!...", with qryBeforeEscapedFragment staying the query.
        // The do/while(false) only exists so that `break` can leave early.
        bool encHashBangFrag = false;
        TStringBuf qryBeforeEscapedFragment;
        TStringBuf qryEscapedFragment;
        do {
            if (FldIsSet(FieldFrag) || FldIsSet(FieldQuery))
                break;

            const TSection& frag = parser.Get(FieldFrag);
            if (frag.IsSet()) {
                if (0 == (parser.Flags & FeatureHashBangToEscapedFragment))
                    break;
                const TStringBuf fragbuf = frag.Get();
                if (fragbuf.empty() || '!' != fragbuf[0])
                    break;
                encHashBangFrag = true;
                // '!' will make space for '&' or '\0' if needed
                buflen += escFragPrefix.length();
                buflen += 2 * fragbuf.length(); // we don't know how many will be encoded
            } else {
                const TSection& qry = parser.Get(FieldQuery);
                if (!qry.IsSet())
                    break;
                // FeatureHashBangToEscapedFragment has preference
                if (FeatureEscapedToHashBangFragment != (parser.Flags & FeaturesEscapedFragment))
                    break;
                // only the last '&'-separated parameter is considered
                qry.Get().RSplit('&', qryBeforeEscapedFragment, qryEscapedFragment);
                if (!qryEscapedFragment.StartsWith(escFragPrefix)) {
                    qryEscapedFragment.Clear();
                    break;
                }
                qryEscapedFragment.Skip(escFragPrefix.length());
                buflen += 2; // for '!' and '\0' in fragment
                buflen -= escFragPrefix.length();
            }
        } while (false);

        // now set all fields prior to validating

        Alloc(buflen);

        // all field values are written back to back into Buffer through `out`
        TMemoryWriteBuffer out(Buffer.data(), Buffer.size());
        for (int idx = 0; idx < FieldUrlMAX; ++idx) {
            const EField fld = EField(idx);

            const TSection& section = parser.Get(fld);
            if (!section.IsSet() || FldIsSet(fld))
                continue;

            // the query is written together with the fragment (see FieldFrag below)
            if (FieldQuery == fld && encHashBangFrag)
                continue;

            // the fragment is produced from the query (see FieldQuery below)
            if (FieldFrag == fld && qryEscapedFragment.IsInited())
                continue;

            char* beg = out.Buf();
            TStringBuf val = section.Get();
            long careFlags = section.GetFlagsEncode();

            switch (fld) {
                default:
                    break;

                case FieldQuery:
                    if (qryEscapedFragment.IsInited()) {
                        const EField dstfld = FieldFrag; // that's where we will store
                        out << '!';
                        if (!qryEscapedFragment.empty())
                            ReEncodeToField(out, qryEscapedFragment, fld, FeatureDecodeANY | careFlags, dstfld, FeatureDecodeANY | parser.GetFieldFlags(dstfld));
                        FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
                        // nothing preceded the escaped fragment: no query remains
                        if (qryBeforeEscapedFragment.empty())
                            continue;
                        out << '\0';
                        // the remaining query starts after the fragment just written
                        beg = out.Buf();
                        val = qryBeforeEscapedFragment;
                    }
                    break;

                case FieldFrag:
                    if (encHashBangFrag) {
                        const EField dstfld = FieldQuery; // that's where we will store
                        const TSection& qry = parser.Get(dstfld);
                        // an existing non-empty query comes first, followed by '&'
                        if (qry.IsSet())
                            if (AppendField(out, dstfld, qry.Get(), qry.GetFlagsEncode()))
                                out << '&';
                        out << escFragPrefix;
                        val.Skip(1); // skip '!'
                        ReEncodeToField(out, val, fld, careFlags, dstfld, parser.GetFieldFlags(dstfld));
                        FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
                        continue;
                    }
                    break;
            }

            AppendField(out, fld, val, careFlags);
            char* end = out.Buf();

            if (careFlags & FeaturePathOperation) {
                if (!PathOperation(beg, end, PathOperationFlag(parser.Flags)))
                    return ParsedBadPath;

                // NOTE(review): PathOperation presumably updates beg/end in place
                // to delimit the resulting path - confirm; the writer is moved
                // to the (new) end either way
                Y_ASSERT(beg >= out.Beg());
                out.SetPos(end);
            }

            FldSetNoDirty(fld, TStringBuf(beg, end));

            // special character case
            const long checkChars = section.GetFlagsAllPlaintext() & FeaturesCheckSpecialChar;
            if (0 != checkChars) { // has unencoded special chars: check permission
                const long allowChars = parser.GetFieldFlags(fld) & checkChars;
                if (checkChars != allowChars)
                    ret = ParsedBadFormat;
            }

            out << '\0';
        }

        // the converted host replaces the host field when IDN conversion was
        // requested and is stored as a separate ascii field otherwise
        if (hostConverted) {
            char* beg = out.Buf();
            out << hostascii;
            char* end = out.Buf();
            const EField fld = convertIDN ? FieldHost : FieldHostAscii;
            FldSetNoDirty(fld, TStringBuf(beg, end));
            out << '\0';
        }

        // buflen was an estimate; resize Buffer to the length reported by the writer
        Buffer.Resize(out.Len());

        // apply the default scheme if the URL did not bring one
        if (GetScheme() == SchemeEmpty && SchemeEmpty != defscheme) {
            if (SchemeUnknown == defscheme)
                ret = ParsedBadScheme;
            else
                SetSchemeImpl(defscheme);
        }

        if (0 == (parser.Flags & FeatureAllowEmptyPath))
            CheckMissingFields();

        // a non-empty port must parse as ui16
        const TStringBuf& port = GetField(FieldPort);
        if (!port.empty()) {
            if (!TryFromString<ui16>(port, Port))
                ret = ParsedBadPort;
        }

        if (ParsedOK != ret)
            return ret;

        // run validity checks now that all fields are set

        // check the host for DNS compliance
        do {
            if (0 == (flags & FeatureCheckHost))
                break;
            // no ascii form was computed above: check the host field itself
            if (hostascii.empty())
                hostascii = GetField(FieldHost);
            if (hostascii.empty())
                break;
            // IP literal
            if ('[' == hostascii[0] && ']' == hostascii.back())
                break;
            ret = CheckHost(hostascii);
            if (ParsedOK != ret)
                return ret;
        } while (false);

        return ret;
    }

    // Clears this URI, rejects empty input (ParsedEmpty) and input longer than
    // a non-zero `maxlen` (ParsedTooLong), then runs TParser over `url` and
    // assigns the result through AssignImpl().
    TState::EParsed TUri::ParseImpl(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
        Clear();

        const size_t urlLen = url.length();
        if (0 == urlLen)
            return ParsedEmpty;

        // a zero maxlen means there is no length limit
        if (0 != maxlen && maxlen < urlLen)
            return ParsedTooLong;

        const TParser parsed(flags, url, enc);
        return AssignImpl(parsed, defscheme);
    }

    // Parses `url`; if a non-empty `url_base` is given and the parsed URL is
    // not a valid absolute one, `url_base` is parsed too and merged in as the
    // base. The URI is rewritten on success. Returns the first non-OK parse
    // state encountered, ParsedOK otherwise.
    TState::EParsed TUri::Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& url_base, ui32 maxlen, ECharset enc) {
        const bool haveBase = !url_base.empty();

        // with a base available the URL itself is parsed without FeatureNoRelPath
        const TParseFlags relaxedFlags = flags.Exclude(FeatureNoRelPath);
        const TState::EParsed urlState = ParseImpl(url, haveBase ? relaxedFlags : flags, maxlen, SchemeEmpty, enc);
        if (ParsedOK != urlState)
            return urlState;

        if (haveBase && !IsValidAbs()) {
            TUri base;
            const TState::EParsed baseState = base.ParseImpl(url_base, flags, maxlen, SchemeEmpty, enc);
            if (ParsedOK != baseState)
                return baseState;
            Merge(base, PathOperationFlag(flags));
        }

        Rewrite();
        return urlState;
    }

    // Parses `url`, merges it with the already-parsed `base` when the result is
    // not a valid absolute URL, and rewrites the URI. On a parse failure the
    // failing state is returned and neither merge nor rewrite takes place.
    TState::EParsed TUri::Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags, ui32 maxlen, ECharset enc) {
        const TState::EParsed state = ParseImpl(url, flags, maxlen, SchemeEmpty, enc);

        if (ParsedOK == state) {
            if (!IsValidAbs())
                Merge(base, PathOperationFlag(flags));

            Rewrite();
        }

        return state;
    }

    // Parses `url` with FeatureNoRelPath added to `flags`. A successfully
    // parsed URL whose host is null is reported as ParsedBadHost; otherwise the
    // URI is rewritten and ParsedOK is returned. Parse failures are returned
    // unchanged.
    TState::EParsed TUri::ParseAbsUri(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
        TState::EParsed state = ParseImpl(url, flags | FeatureNoRelPath, maxlen, defscheme, enc);

        if (ParsedOK == state) {
            if (IsNull(FlagHost))
                state = ParsedBadHost;
            else
                Rewrite();
        }

        return state;
    }

}