fix ya.make

author: monster <[email protected]> 2022-07-07 14:41:37 +0300
committer: monster <[email protected]> 2022-07-07 14:41:37 +0300
commit: 06e5c21a835c0e923506c4ff27929f34e00761c2 (patch)
tree: 75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /library/cpp/http/push_parser
parent: 03f024c4412e3aa613bb543cf1660176320ba8f4 (diff)
2 files changed, 510 insertions, 0 deletions
diff --git a/library/cpp/http/push_parser/http_parser.cpp b/library/cpp/http/push_parser/http_parser.cpp
new file mode 100644
index 00000000000..d36618069fe
--- /dev/null
+++ b/library/cpp/http/push_parser/http_parser.cpp
@@ -0,0 +1,345 @@
+#include "http_parser.h"
+
+#include <library/cpp/blockcodecs/stream.h>
+#include <library/cpp/blockcodecs/codecs.h>
+
+#include <util/generic/string.h>
+#include <util/generic/yexception.h>
+#include <util/stream/mem.h>
+#include <util/stream/zlib.h>
+#include <util/string/ascii.h>
+#include <util/string/split.h>
+#include <util/string/strip.h>
+
+//#define DBGOUT(args) Cout << args << Endl;
+#define DBGOUT(args)
+
+namespace {
+    const TString BestCodings[] = { // ordered by preference: GetBestCompressionScheme() returns the first entry found in AcceptEncodings_
+        "gzip", // also returned when AcceptEncodings_ contains the "*" wildcard
+        "deflate",
+        "br",
+        "x-gzip",
+        "x-deflate",
+        "y-lzo",
+        "y-lzf",
+        "y-lzq",
+        "y-bzip2",
+        "y-lzma",
+    };
+}
+
+TString THttpParser::GetBestCompressionScheme() const {
+    // A wildcard accepts anything, so answer with our most preferred coding.
+    if (AcceptEncodings_.contains("*")) {
+        return BestCodings[0];
+    }
+    // Otherwise take the first coding, in our preference order, that was listed in Accept-Encoding.
+    for (const TString& candidate : BestCodings) {
+        if (AcceptEncodings_.contains(candidate)) {
+            return candidate;
+        }
+    }
+    return TString(); // empty string: no coding in common
+}
+
+bool THttpParser::FirstLineParser() {
+    // Wait until the whole start line has been accumulated in CurrentLine_.
+    if (Y_UNLIKELY(!ReadLine())) {
+        return false;
+    }
+    // Move the line into FirstLine_; CurrentLine_ receives FirstLine_'s previous content.
+    FirstLine_.swap(CurrentLine_);
+
+    try {
+        TStringBuf rest(FirstLine_);
+        if (MessageType_ != Response) {
+            // Request-Line   = Method SP Request-URI SP HTTP-Version CRLF
+            const TStringBuf version = rest.After(' ').After(' ');
+            ParseHttpVersion(version);
+        } else {
+            // Status-Line = HTTP-Version SP Status-Code SP Reason-Phrase CRLF
+            TStringBuf version, code;
+            GetNext(rest, ' ', version);
+            ParseHttpVersion(version);
+            GetNext(rest, ' ', code);
+            RetCode_ = FromString<unsigned>(code);
+        }
+    } catch (...) {
+        throw THttpParseException() << "Cannot parse first line: " << CurrentExceptionMessage() << " First 80 chars of line: " << FirstLine_.substr(0, Min<size_t>(80ull, FirstLine_.size())).Quote();
+    }
+    return HeadersParser(); // go on with the headers stage on whatever input is left
+}
+
+bool THttpParser::HeadersParser() { // stage: consume header lines; returns true once the whole message is parsed, false when more input is needed
+    while (ReadLine()) {
+        if (!CurrentLine_) {
+            //end of headers
+            DBGOUT("end of headers()");
+            ParseHeaderLine(); // process the last header, which is still pending in HeaderLine_
+            // "Transfer-Encoding: chunked" overrides Content-Length (RFC 7230 3.3.3): a zero length ends the message only for non-chunked bodies
+            if (HasContentLength_) {
+                if (ContentLength_ == 0 && !ChunkInputState_) {
+                    return OnEndParsing();
+                }
+                // pre-reserve only for declared lengths below 1000000 bytes (presumably to avoid huge up-front allocations)
+                if (ContentLength_ < 1000000) {
+                    Content_.reserve(ContentLength_ + 1);
+                }
+            }
+
+            return !!ChunkInputState_ ? ChunkedContentParser() : ContentParser();
+        }
+
+        if (CurrentLine_[0] == ' ' || CurrentLine_[0] == '\t') {
+            //continue previous header-line
+            HeaderLine_ += CurrentLine_;
+            CurrentLine_.remove(0);
+        } else {
+            ParseHeaderLine(); // the previous header is complete now: process it
+            HeaderLine_.swap(CurrentLine_); // and keep the new line as the pending header
+        }
+    }
+    // input exhausted in the middle of the headers: resume in this stage on the next call
+    Parser_ = &THttpParser::HeadersParser;
+    return false;
+}
+
+bool THttpParser::ContentParser() {
+    DBGOUT("Content parsing()");
+    if (!HasContentLength_) {
+        const bool bodyless = MessageType_ == Request //RFC2616 4.4-5
+            || Y_UNLIKELY(RetCode() < 200 || RetCode() == 204 || RetCode() == 304); //RFC2616 4.4-1 (but not checked HEAD request type !)
+        if (bodyless) {
+            return OnEndParsing();
+        }
+        Content_.append(Data_, DataEnd_); // no declared length: everything received is body
+        Data_ = DataEnd_;
+    } else {
+        const size_t portion = Min<size_t>(DataEnd_ - Data_, ContentLength_ - Content_.size());
+        Content_.append(Data_, portion);
+        Data_ += portion;
+        DBGOUT("Content parsing: " << Content_.Size() << " from " << ContentLength_);
+        if (Content_.size() == ContentLength_) {
+            return OnEndParsing();
+        }
+    }
+    // body not finished yet: resume in this stage on the next call
+    Parser_ = &THttpParser::ContentParser;
+    return false;
+}
+
+bool THttpParser::ChunkedContentParser() { // stage: decode a chunked body into Content_; true = message complete, false = more input needed
+    DBGOUT("ReadChunkedContent");
+    TChunkInputState& ci = *ChunkInputState_;
+
+    if (Content_.capacity() < static_cast<size_t>(DataEnd_ - Data_)) {
+        //try to reduce memory reallocations
+        Content_.reserve(DataEnd_ - Data_);
+    }
+    // ci.LeftBytes_ == 0 means no chunk is in progress and a chunk-size line has to be read next
+    do {
+        if (!ci.LeftBytes_) {
+            if (Y_UNLIKELY(!ReadLine())) { //read first chunk size or CRLF from prev chunk or CRLF from last chunk
+                break;
+            }
+            // the zero-size chunk was already seen on an earlier call: the line just read finishes the message
+            if (Y_UNLIKELY(ci.ReadLastChunk_)) {
+                return OnEndParsing();
+            }
+
+            if (!CurrentLine_) {
+                // skip crlf from previous chunk
+                if (!ReadLine()) {
+                    break;
+                }
+            }
+            Y_ENSURE(CurrentLine_.size(), "NEH: LeftBytes hex number cannot be empty. ");
+            size_t size = CurrentLine_.find_first_of(" \t;"); // the hex size ends at the first space, tab or ';'; whatever follows is ignored
+            if (size == TString::npos) {
+                size = CurrentLine_.size();
+            }
+            ci.LeftBytes_ = IntFromString<ui32, 16, char>(CurrentLine_.c_str(), size);
+            CurrentLine_.remove(0); // ReadLine() appends to CurrentLine_, so drop the consumed line first
+            if (!ci.LeftBytes_) { //detected end-of-content marker - zero-size chunk, need to read CRLF after empty chunk
+                ci.ReadLastChunk_ = true;
+                if (ReadLine()) {
+                    return OnEndParsing();
+                } else {
+                    break;
+                }
+            }
+        }
+        // copy as much of the current chunk as this input buffer holds
+        size_t rd = Min<size_t>(DataEnd_ - Data_, ci.LeftBytes_);
+        Content_.append(Data_, rd);
+        Data_ += rd;
+        ci.LeftBytes_ -= rd;
+    } while (Data_ != DataEnd_);
+    // input exhausted: resume in this stage on the next call
+    Parser_ = &THttpParser::ChunkedContentParser;
+    return false;
+}
+
+bool THttpParser::OnEndParsing() {
+    ExtraDataSize_ = DataEnd_ - Data_; // bytes of the current buffer left unread after the message
+    Parser_ = &THttpParser::OnEndParsing; // terminal stage: further calls end up here again
+    return true;
+}
+
+// Appends input up to the next '\n' to CurrentLine_ (line terminator excluded); false if no '\n' is available yet.
+bool THttpParser::ReadLine() {
+    const TStringBuf pending(Data_, DataEnd_);
+    const size_t lfPos = pending.find('\n');
+
+    if (Y_UNLIKELY(lfPos == TStringBuf::npos)) {
+        // no terminator yet: keep the fragment and wait for more input
+        CurrentLine_.append(Data_, DataEnd_);
+        return false;
+    }
+
+    CurrentLine_.append(pending.data(), lfPos);
+    // a '\r' at the very end of the line belongs to the terminator, not to the line
+    if (Y_LIKELY(!CurrentLine_.empty())) {
+        const size_t lastPos = CurrentLine_.size() - 1;
+        if (CurrentLine_[lastPos] == '\r') {
+            CurrentLine_.remove(lastPos);
+        }
+    }
+
+    // step over the consumed line together with its '\n'
+    Data_ += lfPos + 1;
+    return true;
+}
+
+void THttpParser::ParseHttpVersion(TStringBuf httpVersion) { // parses "HTTP/<major>.<minor>" into HttpVersion_; throws when the prefix or the numbers are malformed
+    if (!httpVersion.StartsWith("HTTP/", 5)) {
+        throw yexception() << "expect 'HTTP/'";
+    }
+    httpVersion.Skip(5);
+    {
+        TStringBuf major, minor;
+        Split(httpVersion, '.', major, minor);
+        HttpVersion_.Major = FromString<unsigned>(major);
+        HttpVersion_.Minor = FromString<unsigned>(minor);
+        if (Y_LIKELY(HttpVersion_.Major > 1 || (HttpVersion_.Major == 1 && HttpVersion_.Minor > 0))) {
+            // since HTTP/1.1 Keep-Alive is default behaviour (anything older, e.g. 0.9, must not switch it on)
+            KeepAlive_ = true;
+        }
+    }
+}
+
+void THttpParser::ParseHeaderLine() {
+    // no header is pending (e.g. before the first header line): nothing to do
+    if (!HeaderLine_) {
+        return;
+    }
+    if (CollectHeaders_) {
+        // store the header for Headers() and then interpret it
+        THttpInputHeader hdr(HeaderLine_);
+        Headers_.AddHeader(hdr);
+        ApplyHeaderLine(hdr.Name(), hdr.Value());
+    } else {
+        const size_t colon = HeaderLine_.find(':');
+        if (colon == TString::npos) {
+            ythrow THttpParseException() << "can not parse http header(" << HeaderLine_.Quote() << ")";
+        }
+        // split around the first ':' in place, so no new strings are allocated
+        const auto nameEnd = HeaderLine_.begin() + colon;
+        const TStringBuf name(StripString(TStringBuf(HeaderLine_.begin(), nameEnd)));
+        const TStringBuf val(StripString(TStringBuf(nameEnd + 1, HeaderLine_.end())));
+        ApplyHeaderLine(name, val);
+    }
+    HeaderLine_.remove(0);
+}
+
+void THttpParser::OnEof() {
+    const bool unframedBody = !HasContentLength_ && !ChunkInputState_; // neither Content-Length nor chunked: end of input ends the content
+    if (unframedBody && Parser_ == &THttpParser::ContentParser) {
+        return;
+    }
+    throw THttpException() << TStringBuf("incompleted http response");
+}
+
+bool THttpParser::DecodeContent() { // fills DecodedContent_ from Content_; returns false when no decoding was needed; throws THttpParseException for an unsupported coding
+    if (!ContentEncoding_ || ContentEncoding_ == "identity" || ContentEncoding_ == "none") {
+        DecodedContent_ = Content_;
+        return false;
+    }
+    // "x-gzip" / "x-deflate" are legacy names of "gzip" / "deflate" (RFC 7230 4.2.3 for x-gzip) and are decoded the same way
+    TMemoryInput in(Content_.data(), Content_.size());
+    if (ContentEncoding_ == "gzip" || ContentEncoding_ == "x-gzip") {
+        auto decompressor = TZLibDecompress(&in, ZLib::GZip);
+        if (!GzipAllowMultipleStreams_) {
+            decompressor.SetAllowMultipleStreams(false);
+        }
+        DecodedContent_ = decompressor.ReadAll();
+    } else if (ContentEncoding_ == "deflate" || ContentEncoding_ == "x-deflate") {
+        // a zlib header is two bytes (CMF, FLG) whose 16-bit big-endian value is a multiple of 31; anything else is a raw deflate stream
+        //https://tools.ietf.org/html/rfc1950
+        bool definitelyNoZlibHeader;
+        if (Content_.size() < 2) {
+            definitelyNoZlibHeader = true;
+        } else {
+            const ui16 cmf = static_cast<ui8>(Content_[0]);
+            const ui16 flg = static_cast<ui8>(Content_[1]);
+            definitelyNoZlibHeader = ((cmf << 8) | flg) % 31 != 0;
+        }
+
+        try {
+            DecodedContent_ = TZLibDecompress(&in, definitelyNoZlibHeader ? ZLib::Raw : ZLib::ZLib).ReadAll();
+        }
+        catch(...) { // the header check can pass by coincidence on raw data, so retry once as a raw stream
+            if (definitelyNoZlibHeader) {
+                throw; // raw decoding was already what failed
+            }
+            TMemoryInput retryInput(Content_.data(), Content_.size()); // fresh input: 'in' has been partly consumed
+            DecodedContent_ = TZLibDecompress(&retryInput, ZLib::Raw).ReadAll();
+        }
+    } else if (ContentEncoding_.StartsWith("z-")) {
+        // opposite for library/cpp/http/io/stream.h
+        const NBlockCodecs::ICodec* codec = nullptr;
+        try {
+            const TStringBuf codecName = TStringBuf(ContentEncoding_).SubStr(2); // block codec name without the "z-" prefix
+            if (codecName.StartsWith("zstd06") || codecName.StartsWith("zstd08")) {
+                ythrow NBlockCodecs::TNotFound() << codecName;
+            }
+            codec = NBlockCodecs::Codec(codecName);
+        } catch(const NBlockCodecs::TNotFound& exc) {
+            throw THttpParseException() << "Unsupported content-encoding method: " << exc.AsStrBuf();
+        }
+        NBlockCodecs::TDecodedInput decoder(&in, codec);
+        DecodedContent_ = decoder.ReadAll();
+    } else {
+        throw THttpParseException() << "Unsupported content-encoding method: " << ContentEncoding_;
+    }
+    return true;
+}
+
+void THttpParser::ApplyHeaderLine(const TStringBuf& name, const TStringBuf& val) {
+    const auto named = [&name](TStringBuf header) { return AsciiEqualsIgnoreCase(name, header); }; // header names are matched case-insensitively
+    if (named(TStringBuf("content-length"))) {
+        // an empty value is rejected explicitly, before the numeric conversion
+        Y_ENSURE(val.size(), "NEH: Content-Length cannot be empty string. ");
+        ContentLength_ = FromString<ui64>(val);
+        HasContentLength_ = true;
+    } else if (named(TStringBuf("connection"))) {
+        KeepAlive_ = AsciiEqualsIgnoreCase(val, TStringBuf("keep-alive"));
+    } else if (named(TStringBuf("transfer-encoding"))) {
+        if (AsciiEqualsIgnoreCase(val, TStringBuf("chunked"))) {
+            ChunkInputState_ = new TChunkInputState();
+        }
+    } else if (named(TStringBuf("content-encoding"))) {
+        TString coding(val);
+        coding.to_lower();
+        ContentEncoding_ = coding;
+    } else if (named(TStringBuf("accept-encoding"))) {
+        for (TStringBuf rest(val); rest.size();) { // comma-separated list; non-empty entries are stored lower-cased
+            const TStringBuf entry = rest.NextTok(',').After(' ').Before(' ');
+            if (!!entry) {
+                TString coding(entry);
+                coding.to_lower();
+                AcceptEncodings_.insert(coding);
+            }
+        }
+    }
+}
diff --git a/library/cpp/http/push_parser/http_parser.h b/library/cpp/http/push_parser/http_parser.h
new file mode 100644
index 00000000000..8757a3ef9a2
--- /dev/null
+++ b/library/cpp/http/push_parser/http_parser.h
@@ -0,0 +1,165 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/yexception.h>
+#include <util/generic/hash_set.h>
+#include <util/string/cast.h>
+#include <library/cpp/http/io/stream.h>
+
+struct THttpVersion { // HTTP protocol version as <Major>.<Minor>; the defaults describe HTTP/1.0
+    unsigned Major = 1;
+    unsigned Minor = 0;
+};
+
+//http message parser for async/callback architectures (ugly state machine)
+//usage: feed input with Parse(...); when it returns 'true' the whole message has been parsed,
+//bytes of the input that do not belong to the message are counted by GetExtraDataSize()
+class THttpParser {
+public:
+    enum TMessageType {
+        Request,
+        Response
+    };
+
+    THttpParser(TMessageType mt = Response) // parsing starts at the first (request/status) line
+        : Parser_(&THttpParser::FirstLineParser)
+        , MessageType_(mt)
+    {
+    }
+
+    inline void DisableCollectingHeaders() noexcept { // headers are still interpreted, but no longer stored in Headers()
+        CollectHeaders_ = false;
+    }
+
+    inline void SetGzipAllowMultipleStreams(bool allow) noexcept { // consulted by DecodeContent() when decoding gzip bodies
+        GzipAllowMultipleStreams_ = allow;
+    }
+
+    /// @return true when parsing has ended (GetExtraDataSize() returns the amount of unused bytes)
+    /// throws an exception on bad http format (unsupported encoding, etc)
+    /// sz == 0 signals the end of the input stream
+    bool Parse(const char* data, size_t sz) {
+        if (ParseImpl(data, sz)) {
+            DecodeContent(); // message complete: fill DecodedContent_
+            return true;
+        }
+        return false;
+    }
+
+    const char* Data() const noexcept { // current read position inside the buffer given to the last Parse() call
+        return Data_;
+    }
+    size_t GetExtraDataSize() const noexcept { // bytes of the last buffer left unread when the message ended
+        return ExtraDataSize_;
+    }
+
+    const TString& FirstLine() const noexcept { // request line or status line, without the line terminator
+        return FirstLine_;
+    }
+
+    unsigned RetCode() const noexcept { // status code of a response; stays 0 when parsing a request
+        return RetCode_;
+    }
+
+    const THttpVersion& HttpVersion() const noexcept {
+        return HttpVersion_;
+    }
+
+    const THttpHeaders& Headers() const noexcept { // stays empty after DisableCollectingHeaders()
+        return Headers_;
+    }
+
+    bool IsKeepAlive() const noexcept { // derived from the HTTP version and the Connection header
+        return KeepAlive_;
+    }
+
+    bool GetContentLength(ui64& value) const noexcept { // false (value untouched) when no Content-Length header was seen
+        if (!HasContentLength_) {
+            return false;
+        }
+
+        value = ContentLength_;
+        return true;
+    }
+
+    TString GetBestCompressionScheme() const; // best coding listed in Accept-Encoding, or an empty string
+
+    const TString& Content() const noexcept { // message body as received (chunk framing removed, content-encoding still applied)
+        return Content_;
+    }
+
+    const TString& DecodedContent() const noexcept { // message body after DecodeContent() removed the content-encoding
+        return DecodedContent_;
+    }
+
+    void Prepare() { // pre-reserves 128 bytes for the header-line and first-line buffers
+        HeaderLine_.reserve(128);
+        FirstLine_.reserve(128);
+    }
+
+private:
+    bool ParseImpl(const char* data, size_t sz) {
+        Data_ = data;
+        DataEnd_ = data + sz;
+        if (sz == 0) {
+            OnEof(); // throws unless the body is legitimately terminated by the end of input
+            return true;
+        }
+        return (this->*Parser_)(); // run the current stage
+    }
+    // stage parsers: each returns true when the whole message is parsed, false when more input is needed
+    bool FirstLineParser();
+    bool HeadersParser();
+    bool ContentParser();
+    bool ChunkedContentParser();
+    bool OnEndParsing();
+
+    // continue reading into CurrentLine_; returns false while the line is incomplete
+    bool ReadLine();
+
+    void ParseHttpVersion(TStringBuf httpVersion);
+    void ParseHeaderLine();
+
+    void OnEof();
+    bool DecodeContent();
+
+    void ApplyHeaderLine(const TStringBuf& name, const TStringBuf& val);
+
+    typedef bool (THttpParser::*TParser)();
+
+    TParser Parser_; //current parser (stage)
+    TMessageType MessageType_ = Response;
+    bool CollectHeaders_ = true;
+    bool GzipAllowMultipleStreams_ = true;
+
+    // input being parsed
+    const char* Data_ = nullptr; // read position in the buffer of the current Parse() call
+    const char* DataEnd_ = nullptr; // end of that buffer
+    TString CurrentLine_; // line being accumulated by ReadLine(), possibly across several Parse() calls
+    TString HeaderLine_; // pending header (with its continuation lines) not yet processed by ParseHeaderLine()
+
+    size_t ExtraDataSize_ = 0; // set by OnEndParsing()
+
+    // headers
+    TString FirstLine_;
+    THttpVersion HttpVersion_;
+    unsigned RetCode_ = 0;
+    THttpHeaders Headers_;
+    bool KeepAlive_ = false;
+    THashSet<TString> AcceptEncodings_; // lower-cased entries of the Accept-Encoding header
+
+    TString ContentEncoding_; // lower-cased value of the Content-Encoding header
+    bool HasContentLength_ = false;
+    ui64 ContentLength_ = 0;
+
+    struct TChunkInputState {
+        size_t LeftBytes_ = 0; // bytes of the current chunk that are still to be read
+        bool ReadLastChunk_ = false; // set once the zero-size chunk has been seen
+    };
+
+    TAutoPtr<TChunkInputState> ChunkInputState_; // created when "Transfer-Encoding: chunked" is seen
+
+    TString Content_;
+    TString DecodedContent_;
+};
author	monster <[email protected]>	2022-07-07 14:41:37 +0300
committer	monster <[email protected]>	2022-07-07 14:41:37 +0300
commit	06e5c21a835c0e923506c4ff27929f34e00761c2 (patch)
tree	75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /library/cpp/http/push_parser
parent	03f024c4412e3aa613bb543cf1660176320ba8f4 (diff)