aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/http/push_parser/http_parser.cpp
blob: b7b0a82becdfdabdba4c17a25d173fc87bbc056f (plain) (tree)
1
2
3
4
5


                                           
                                              







































                                                       


                                                                 






























































                                                                                                                                                                                                      
                                                 








                                                                                     
                                                                                                             










































































































































                                                                                                                       
                                                                
                                                                                            
                                  







                                                             
                                                











                                                                  
                                                                                                             




                                                                      
                                                                               












                                                                                                     
                                           
                                                                              
                                                 
                                          
                                           
































                                                                                                   
#include "http_parser.h"

#include <library/cpp/blockcodecs/stream.h>
#include <library/cpp/blockcodecs/codecs.h>
#include <library/cpp/streams/brotli/brotli.h>

#include <util/generic/string.h>
#include <util/generic/yexception.h>
#include <util/stream/mem.h>
#include <util/stream/zlib.h>
#include <util/string/ascii.h>
#include <util/string/split.h>
#include <util/string/strip.h>

//#define DBGOUT(args) Cout << args << Endl;
#define DBGOUT(args)

namespace {
    // Content codings this parser is willing to propose, most preferred first.
    // THttpParser::GetBestCompressionScheme() walks the list front to back and
    // returns the first entry found in the peer's Accept-Encoding set; the very
    // first entry is also what it returns when the peer sent the "*" wildcard.
    const TString BestCodings[] = {
        "gzip",
        "deflate",
        "br",
        "x-gzip",
        "x-deflate",
        "y-lzo",
        "y-lzf",
        "y-lzq",
        "y-bzip2",
        "y-lzma",
    };
}

// Picks the most preferred coding from BestCodings that the peer accepts.
// A "*" entry in the accepted set makes every candidate acceptable, so the
// first (top-priority) candidate wins. Returns an empty string when nothing
// from BestCodings is accepted.
TString THttpParser::GetBestCompressionScheme() const {
    const bool acceptsAnything = AcceptEncodings_.contains("*");

    for (const TString& candidate : BestCodings) {
        if (acceptsAnything || AcceptEncodings_.contains(candidate)) {
            return candidate;
        }
    }

    return TString();
}

// Read-only view of the codings collected from Accept-Encoding header lines.
// Entries are stored lowercased (ApplyHeaderLine lowercases before inserting).
const THashSet<TString>& THttpParser::AcceptedEncodings() const {
    return AcceptEncodings_;
}

bool THttpParser::FirstLineParser() {
    if (Y_UNLIKELY(!ReadLine())) {
        return false;
    }

    CurrentLine_.swap(FirstLine_);

    try {
        TStringBuf s(FirstLine_);
        if (MessageType_ == Response) {
            // Status-Line = HTTP-Version SP Status-Code SP Reason-Phrase CRLF
            TStringBuf httpVersion, statusCode;
            GetNext(s, ' ', httpVersion);
            ParseHttpVersion(httpVersion);
            GetNext(s, ' ', statusCode);
            RetCode_ = FromString<unsigned>(statusCode);
        } else {
            // Request-Line   = Method SP Request-URI SP HTTP-Version CRLF
            TStringBuf httpVersion = s.After(' ').After(' ');
            ParseHttpVersion(httpVersion);
        }
    } catch (...) {
        throw THttpParseException() << "Cannot parse first line: " << CurrentExceptionMessage() << " First 80 chars of line: " << FirstLine_.substr(0, Min<size_t>(80ull, FirstLine_.size())).Quote();
    }

    return HeadersParser();
}

// Consumes header lines until the empty line that terminates the header block,
// then hands over to the appropriate body parser.
// Returns false (after registering itself as the resume point) when the input
// runs out before the header block is complete.
bool THttpParser::HeadersParser() {
    while (ReadLine()) {
        if (!!CurrentLine_) {
            if (CurrentLine_[0] == ' ' || CurrentLine_[0] == '\t') {
                // Folded header: the line continues the pending header.
                HeaderLine_ += CurrentLine_;
                CurrentLine_.remove(0);
            } else {
                // A new header starts: flush the pending one, then make the
                // fresh line the pending header.
                ParseHeaderLine();
                HeaderLine_.swap(CurrentLine_);
            }
            continue;
        }

        // Empty line: the header block is over; flush the last pending header.
        DBGOUT("end of headers()");
        ParseHeaderLine();

        if (HasContentLength_) {
            if (ContentLength_ == 0) {
                return OnEndParsing();
            }

            // Pre-size the body buffer only for moderately sized bodies.
            if (ContentLength_ < 1000000) {
                Content_.reserve(ContentLength_ + 1);
            }
        }

        if (!!ChunkInputState_) {
            return ChunkedContentParser();
        }
        return ContentParser();
    }

    Parser_ = &THttpParser::HeadersParser;
    return false;
}

bool THttpParser::ContentParser() {
    DBGOUT("Content parsing()");
    if (HasContentLength_ && !BodyNotExpected_) {
        size_t rd = Min<size_t>(DataEnd_ - Data_, ContentLength_ - Content_.size());
        Content_.append(Data_, rd);
        Data_ += rd;
        DBGOUT("Content parsing: " << Content_.Size() << " from " << ContentLength_);
        if (Content_.size() == ContentLength_) {
            return OnEndParsing();
        }
    } else {
        if (MessageType_ == Request) {
            return OnEndParsing(); //RFC2616 4.4-5
        } else if (Y_UNLIKELY(BodyNotExpected_ || RetCode() < 200 || RetCode() == 204 || RetCode() == 304)) {
            return OnEndParsing(); //RFC2616 4.4-1
        }

        Content_.append(Data_, DataEnd_);
        Data_ = DataEnd_;
    }
    Parser_ = &THttpParser::ContentParser;
    return false;
}

// Reads a body sent with "Transfer-Encoding: chunked".
// The function is resumable: the progress inside the current chunk lives in
// *ChunkInputState_ (LeftBytes_, ReadLastChunk_) and a partially received line
// stays in CurrentLine_, so it can be re-entered when more input arrives.
// Returns true once the terminating zero-size chunk and the line after it have
// been read; returns false (registering itself as the resume point) when the
// available input is exhausted first.
// Throws if the chunk-size line is empty (Y_ENSURE) or if IntFromString
// rejects the hex size.
bool THttpParser::ChunkedContentParser() {
    DBGOUT("ReadChunkedContent");
    TChunkInputState& ci = *ChunkInputState_;

    if (Content_.capacity() < static_cast<size_t>(DataEnd_ - Data_)) {
        // try to reduce memory reallocations
        Content_.reserve(DataEnd_ - Data_);
    }

    do {
        if (!ci.LeftBytes_) {
            // No payload pending: the next thing in the input is a line.
            if (Y_UNLIKELY(!ReadLine())) { // read first chunk size or CRLF from prev chunk or CRLF from last chunk
                break;
            }

            // The zero-size chunk was already seen on an earlier pass, so the
            // line just read is the one that follows it.
            if (Y_UNLIKELY(ci.ReadLastChunk_)) {
                return OnEndParsing();
            }

            if (!CurrentLine_) {
                // skip crlf from previous chunk
                if (!ReadLine()) {
                    break;
                }
            }
            Y_ENSURE(CurrentLine_.size(), "NEH: LeftBytes hex number cannot be empty. ");
            // The hex size ends at the first space, tab or ';'; whatever
            // follows on the line is ignored.
            size_t size = CurrentLine_.find_first_of(" \t;");
            if (size == TString::npos) {
                size = CurrentLine_.size();
            }
            ci.LeftBytes_ = IntFromString<ui32, 16, char>(CurrentLine_.c_str(), size);
            CurrentLine_.remove(0);
            if (!ci.LeftBytes_) { // detected end-of-content marker - zero-size chunk, need to read CRLF after the empty chunk
                ci.ReadLastChunk_ = true;
                if (ReadLine()) {
                    return OnEndParsing();
                } else {
                    break;
                }
            }
        }

        // Copy as much of the current chunk's payload as is available.
        size_t rd = Min<size_t>(DataEnd_ - Data_, ci.LeftBytes_);
        Content_.append(Data_, rd);
        Data_ += rd;
        ci.LeftBytes_ -= rd;
    } while (Data_ != DataEnd_);

    Parser_ = &THttpParser::ChunkedContentParser;
    return false;
}

// Marks the message as completely parsed.
// Registers itself as the current parser step and records how many input
// bytes were left unconsumed after the end of the message. Always returns true.
bool THttpParser::OnEndParsing() {
    Parser_ = &THttpParser::OnEndParsing;
    ExtraDataSize_ = DataEnd_ - Data_;
    return true;
}

//continue read to CurrentLine_
bool THttpParser::ReadLine() {
    TStringBuf in(Data_, DataEnd_);
    size_t endl = in.find('\n');

    if (Y_UNLIKELY(endl == TStringBuf::npos)) {
        //input line not completed
        CurrentLine_.append(Data_, DataEnd_);
        return false;
    }

    CurrentLine_.append(in.data(), endl);
    if (Y_LIKELY(CurrentLine_.size())) {
        //remove '\r' from tail
        size_t withoutCR = CurrentLine_.size() - 1;
        if (CurrentLine_[withoutCR] == '\r') {
            CurrentLine_.remove(withoutCR);
        }
    }

    //Cout << "ReadLine:" << CurrentLine_ << Endl;
    Data_ += endl + 1;
    return true;
}

// Parses an "HTTP/<major>.<minor>" token into HttpVersion_.
// Throws yexception when the "HTTP/" prefix is missing; number conversion
// errors propagate from FromString. KeepAlive_ is switched on when the major
// version is above 1 or the minor version is above 0.
void THttpParser::ParseHttpVersion(TStringBuf httpVersion) {
    if (!httpVersion.StartsWith("HTTP/", 5)) {
        throw yexception() << "expect 'HTTP/'";
    }
    httpVersion.Skip(5);

    TStringBuf majorPart;
    TStringBuf minorPart;
    Split(httpVersion, '.', majorPart, minorPart);
    HttpVersion_.Major = FromString<unsigned>(majorPart);
    HttpVersion_.Minor = FromString<unsigned>(minorPart);

    // since HTTP/1.1 Keep-Alive is default behaviour
    if (Y_LIKELY(HttpVersion_.Major > 1 || HttpVersion_.Minor > 0)) {
        KeepAlive_ = true;
    }
}

void THttpParser::ParseHeaderLine() {
    if (!!HeaderLine_) {
        if (CollectHeaders_) {
            THttpInputHeader hdr(HeaderLine_);

            Headers_.AddHeader(hdr);

            ApplyHeaderLine(hdr.Name(), hdr.Value());
        } else {
            //some dirty optimization (avoid reallocation new strings)
            size_t pos = HeaderLine_.find(':');

            if (pos == TString::npos) {
                ythrow THttpParseException() << "can not parse http header(" << HeaderLine_.Quote() << ")";
            }

            TStringBuf name(StripString(TStringBuf(HeaderLine_.begin(), HeaderLine_.begin() + pos)));
            TStringBuf val(StripString(TStringBuf(HeaderLine_.begin() + pos + 1, HeaderLine_.end())));
            ApplyHeaderLine(name, val);
        }
        HeaderLine_.remove(0);
    }
}

// Called when the input stream ends.
// End of input is a legitimate end of message only while reading a body that
// has neither a Content-Length nor chunked framing; in every other state the
// message is incomplete and THttpException is thrown.
void THttpParser::OnEof() {
    const bool bodyEndsAtEof = Parser_ == &THttpParser::ContentParser && !HasContentLength_ && !ChunkInputState_;
    if (!bodyEndsAtEof) {
        throw THttpException() << TStringBuf("incompleted http response");
    }
    //end of content determined by end of input
}

// Decodes the collected body according to ContentEncoding_ into decodedContent.
// Returns false when no decoding was needed (encoding absent, "identity" or
// "none"; the body is copied as is) and true when the body was decoded.
// Throws THttpParseException for an encoding this function does not handle;
// errors raised by the underlying decoders propagate unchanged.
// The comparisons use lowercase names; ApplyHeaderLine stores ContentEncoding_
// lowercased.
bool THttpParser::DecodeContent(TString& decodedContent) const {
    if (!ContentEncoding_ || ContentEncoding_ == "identity" || ContentEncoding_ == "none") {
        decodedContent = Content_;
        return false;
    }

    TMemoryInput in(Content_.data(), Content_.size());
    if (ContentEncoding_ == "gzip") {
        auto decompressor = TZLibDecompress(&in, ZLib::GZip);
        // Only ever switches multi-stream handling off; the decompressor's own
        // default is left alone when GzipAllowMultipleStreams_ is set.
        if (!GzipAllowMultipleStreams_) {
            decompressor.SetAllowMultipleStreams(false);
        }
        decodedContent = decompressor.ReadAll();
    } else if (ContentEncoding_ == "deflate") {

        // "deflate" bodies are seen both with and without the zlib wrapper.
        // Per https://tools.ietf.org/html/rfc1950 the two header bytes of a
        // zlib stream, read as a 16-bit big-endian number, are a multiple of
        // 31; when that does not hold the data cannot be a zlib stream.
        bool definitelyNoZlibHeader;
        if (Content_.size() < 2) {
            definitelyNoZlibHeader = true;
        } else {
            const ui16 cmf = static_cast<ui8>(Content_[0]);
            const ui16 flg = static_cast<ui8>(Content_[1]);
            definitelyNoZlibHeader = ((cmf << 8) | flg) % 31 != 0;
        }

        try {
            decodedContent = TZLibDecompress(&in, definitelyNoZlibHeader ? ZLib::Raw : ZLib::ZLib).ReadAll();
        }
        catch(...) {
            // A raw attempt that failed has no fallback left.
            if (definitelyNoZlibHeader) {
                throw;
            }
            // The header check passed by coincidence: retry from the start of
            // the body as a raw deflate stream.
            TMemoryInput retryInput(Content_.data(), Content_.size());
            decodedContent = TZLibDecompress(&retryInput, ZLib::Raw).ReadAll();
        }
    } else if (ContentEncoding_.StartsWith("z-")) {
        // opposite for library/cpp/http/io/stream.h
        // The part after "z-" names a block codec.
        const NBlockCodecs::ICodec* codec = nullptr;
        try {
            const TStringBuf codecName = TStringBuf(ContentEncoding_).SubStr(2);
            // zstd06* / zstd08* names are rejected explicitly, as if unknown.
            if (codecName.StartsWith("zstd06") || codecName.StartsWith("zstd08")) {
                ythrow NBlockCodecs::TNotFound() << codecName;
            }
            codec = NBlockCodecs::Codec(codecName);
        } catch(const NBlockCodecs::TNotFound& exc) {
            throw THttpParseException() << "Unsupported content-encoding method: " << exc.AsStrBuf();
        }
        NBlockCodecs::TDecodedInput decoder(&in, codec);
        decodedContent = decoder.ReadAll();
    } else if (ContentEncoding_ == "lz4") {
        // The whole body is decoded in one call by the block codec named "lz4".
        const auto* codec = NBlockCodecs::Codec(TStringBuf(ContentEncoding_));
        decodedContent = codec->Decode(Content_);
    } else if (ContentEncoding_ == "br") {
        TBrotliDecompress decoder(&in);
        decodedContent = decoder.ReadAll();
    } else {
        throw THttpParseException() << "Unsupported content-encoding method: " << ContentEncoding_;
    }
    return true;
}

void THttpParser::ApplyHeaderLine(const TStringBuf& name, const TStringBuf& val) {
    if (AsciiEqualsIgnoreCase(name, TStringBuf("connection"))) {
        KeepAlive_ = AsciiEqualsIgnoreCase(val, TStringBuf("keep-alive"));
    } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-length"))) {
        Y_ENSURE(val.size(), "NEH: Content-Length cannot be empty string. ");
        ContentLength_ = FromString<ui64>(val);
        HasContentLength_ = true;
    } else if (AsciiEqualsIgnoreCase(name, TStringBuf("transfer-encoding"))) {
        if (AsciiEqualsIgnoreCase(val, TStringBuf("chunked"))) {
            ChunkInputState_ = new TChunkInputState();
        }
    } else if (AsciiEqualsIgnoreCase(name, TStringBuf("accept-encoding"))) {
        TStringBuf encodings(val);
        while (encodings.size()) {
            TStringBuf enc = encodings.NextTok(',').After(' ').Before(' ');
            if (!enc) {
                continue;
            }
            TString s(enc);
            s.to_lower();
            AcceptEncodings_.insert(s);
        }
    } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-encoding"))) {
        TString s(val);
        s.to_lower();
        ContentEncoding_ = s;
    }
}