aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/http/push_parser
diff options
context:
space:
mode:
authormonster <monster@ydb.tech>2022-07-07 14:41:37 +0300
committermonster <monster@ydb.tech>2022-07-07 14:41:37 +0300
commit06e5c21a835c0e923506c4ff27929f34e00761c2 (patch)
tree75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /library/cpp/http/push_parser
parent03f024c4412e3aa613bb543cf1660176320ba8f4 (diff)
downloadydb-06e5c21a835c0e923506c4ff27929f34e00761c2.tar.gz
fix ya.make
Diffstat (limited to 'library/cpp/http/push_parser')
-rw-r--r--library/cpp/http/push_parser/http_parser.cpp345
-rw-r--r--library/cpp/http/push_parser/http_parser.h165
2 files changed, 510 insertions, 0 deletions
diff --git a/library/cpp/http/push_parser/http_parser.cpp b/library/cpp/http/push_parser/http_parser.cpp
new file mode 100644
index 00000000000..d36618069fe
--- /dev/null
+++ b/library/cpp/http/push_parser/http_parser.cpp
@@ -0,0 +1,345 @@
+#include "http_parser.h"
+
+#include <library/cpp/blockcodecs/stream.h>
+#include <library/cpp/blockcodecs/codecs.h>
+
+#include <util/generic/string.h>
+#include <util/generic/yexception.h>
+#include <util/stream/mem.h>
+#include <util/stream/zlib.h>
+#include <util/string/ascii.h>
+#include <util/string/split.h>
+#include <util/string/strip.h>
+
+//#define DBGOUT(args) Cout << args << Endl;
+#define DBGOUT(args)
+
+namespace {
+ const TString BestCodings[] = {
+ "gzip",
+ "deflate",
+ "br",
+ "x-gzip",
+ "x-deflate",
+ "y-lzo",
+ "y-lzf",
+ "y-lzq",
+ "y-bzip2",
+ "y-lzma",
+ };
+}
+
+TString THttpParser::GetBestCompressionScheme() const {
+ if (AcceptEncodings_.contains("*")) {
+ return BestCodings[0];
+ }
+
+ for (auto& coding : BestCodings) {
+ if (AcceptEncodings_.contains(coding)) {
+ return coding;
+ }
+ }
+
+ return TString();
+}
+
+bool THttpParser::FirstLineParser() {
+ if (Y_UNLIKELY(!ReadLine())) {
+ return false;
+ }
+
+ CurrentLine_.swap(FirstLine_);
+
+ try {
+ TStringBuf s(FirstLine_);
+ if (MessageType_ == Response) {
+ // Status-Line = HTTP-Version SP Status-Code SP Reason-Phrase CRLF
+ TStringBuf httpVersion, statusCode;
+ GetNext(s, ' ', httpVersion);
+ ParseHttpVersion(httpVersion);
+ GetNext(s, ' ', statusCode);
+ RetCode_ = FromString<unsigned>(statusCode);
+ } else {
+ // Request-Line = Method SP Request-URI SP HTTP-Version CRLF
+ TStringBuf httpVersion = s.After(' ').After(' ');
+ ParseHttpVersion(httpVersion);
+ }
+ } catch (...) {
+ throw THttpParseException() << "Cannot parse first line: " << CurrentExceptionMessage() << " First 80 chars of line: " << FirstLine_.substr(0, Min<size_t>(80ull, FirstLine_.size())).Quote();
+ }
+
+ return HeadersParser();
+}
+
+bool THttpParser::HeadersParser() {
+ while (ReadLine()) {
+ if (!CurrentLine_) {
+ //end of headers
+ DBGOUT("end of headers()");
+ ParseHeaderLine();
+
+ if (HasContentLength_) {
+ if (ContentLength_ == 0) {
+ return OnEndParsing();
+ }
+
+ if (ContentLength_ < 1000000) {
+ Content_.reserve(ContentLength_ + 1);
+ }
+ }
+
+ return !!ChunkInputState_ ? ChunkedContentParser() : ContentParser();
+ }
+
+ if (CurrentLine_[0] == ' ' || CurrentLine_[0] == '\t') {
+ //continue previous header-line
+ HeaderLine_ += CurrentLine_;
+ CurrentLine_.remove(0);
+ } else {
+ ParseHeaderLine();
+ HeaderLine_.swap(CurrentLine_);
+ }
+ }
+
+ Parser_ = &THttpParser::HeadersParser;
+ return false;
+}
+
+bool THttpParser::ContentParser() {
+ DBGOUT("Content parsing()");
+ if (HasContentLength_) {
+ size_t rd = Min<size_t>(DataEnd_ - Data_, ContentLength_ - Content_.size());
+ Content_.append(Data_, rd);
+ Data_ += rd;
+ DBGOUT("Content parsing: " << Content_.Size() << " from " << ContentLength_);
+ if (Content_.size() == ContentLength_) {
+ return OnEndParsing();
+ }
+ } else {
+ if (MessageType_ == Request) {
+ return OnEndParsing(); //RFC2616 4.4-5
+ } else if (Y_UNLIKELY(RetCode() < 200 || RetCode() == 204 || RetCode() == 304)) {
+ return OnEndParsing(); //RFC2616 4.4-1 (but not checked HEAD request type !)
+ }
+
+ Content_.append(Data_, DataEnd_);
+ Data_ = DataEnd_;
+ }
+ Parser_ = &THttpParser::ContentParser;
+ return false;
+}
+
+bool THttpParser::ChunkedContentParser() {
+ DBGOUT("ReadChunkedContent");
+ TChunkInputState& ci = *ChunkInputState_;
+
+ if (Content_.capacity() < static_cast<size_t>(DataEnd_ - Data_)) {
+ //try reduce memory reallocations
+ Content_.reserve(DataEnd_ - Data_);
+ }
+
+ do {
+ if (!ci.LeftBytes_) {
+ if (Y_UNLIKELY(!ReadLine())) { //read first chunk size or CRLF from prev chunk or CRLF from last chunk
+ break;
+ }
+
+ if (Y_UNLIKELY(ci.ReadLastChunk_)) {
+ return OnEndParsing();
+ }
+
+ if (!CurrentLine_) {
+ // skip crlf from previous chunk
+ if (!ReadLine()) {
+ break;
+ }
+ }
+ Y_ENSURE(CurrentLine_.size(), "NEH: LeftBytes hex number cannot be empty. ");
+ size_t size = CurrentLine_.find_first_of(" \t;");
+ if (size == TString::npos) {
+ size = CurrentLine_.size();
+ }
+ ci.LeftBytes_ = IntFromString<ui32, 16, char>(CurrentLine_.c_str(), size);
+ CurrentLine_.remove(0);
+ if (!ci.LeftBytes_) { //detectect end of context marker - zero-size chunk, need read CRLF after empty chunk
+ ci.ReadLastChunk_ = true;
+ if (ReadLine()) {
+ return OnEndParsing();
+ } else {
+ break;
+ }
+ }
+ }
+
+ size_t rd = Min<size_t>(DataEnd_ - Data_, ci.LeftBytes_);
+ Content_.append(Data_, rd);
+ Data_ += rd;
+ ci.LeftBytes_ -= rd;
+ } while (Data_ != DataEnd_);
+
+ Parser_ = &THttpParser::ChunkedContentParser;
+ return false;
+}
+
+bool THttpParser::OnEndParsing() {
+ Parser_ = &THttpParser::OnEndParsing;
+ ExtraDataSize_ = DataEnd_ - Data_;
+ return true;
+}
+
+//continue read to CurrentLine_
+bool THttpParser::ReadLine() {
+ TStringBuf in(Data_, DataEnd_);
+ size_t endl = in.find('\n');
+
+ if (Y_UNLIKELY(endl == TStringBuf::npos)) {
+ //input line not completed
+ CurrentLine_.append(Data_, DataEnd_);
+ return false;
+ }
+
+ CurrentLine_.append(in.data(), endl);
+ if (Y_LIKELY(CurrentLine_.size())) {
+ //remove '\r' from tail
+ size_t withoutCR = CurrentLine_.size() - 1;
+ if (CurrentLine_[withoutCR] == '\r') {
+ CurrentLine_.remove(withoutCR);
+ }
+ }
+
+ //Cout << "ReadLine:" << CurrentLine_ << Endl;
+ Data_ += endl + 1;
+ return true;
+}
+
+void THttpParser::ParseHttpVersion(TStringBuf httpVersion) {
+ if (!httpVersion.StartsWith("HTTP/", 5)) {
+ throw yexception() << "expect 'HTTP/'";
+ }
+ httpVersion.Skip(5);
+ {
+ TStringBuf major, minor;
+ Split(httpVersion, '.', major, minor);
+ HttpVersion_.Major = FromString<unsigned>(major);
+ HttpVersion_.Minor = FromString<unsigned>(minor);
+ if (Y_LIKELY(HttpVersion_.Major > 1 || HttpVersion_.Minor > 0)) {
+ // since HTTP/1.1 Keep-Alive is default behaviour
+ KeepAlive_ = true;
+ }
+ }
+}
+
+void THttpParser::ParseHeaderLine() {
+ if (!!HeaderLine_) {
+ if (CollectHeaders_) {
+ THttpInputHeader hdr(HeaderLine_);
+
+ Headers_.AddHeader(hdr);
+
+ ApplyHeaderLine(hdr.Name(), hdr.Value());
+ } else {
+ //some dirty optimization (avoid reallocation new strings)
+ size_t pos = HeaderLine_.find(':');
+
+ if (pos == TString::npos) {
+ ythrow THttpParseException() << "can not parse http header(" << HeaderLine_.Quote() << ")";
+ }
+
+ TStringBuf name(StripString(TStringBuf(HeaderLine_.begin(), HeaderLine_.begin() + pos)));
+ TStringBuf val(StripString(TStringBuf(HeaderLine_.begin() + pos + 1, HeaderLine_.end())));
+ ApplyHeaderLine(name, val);
+ }
+ HeaderLine_.remove(0);
+ }
+}
+
+void THttpParser::OnEof() {
+ if (Parser_ == &THttpParser::ContentParser && !HasContentLength_ && !ChunkInputState_) {
+ return; //end of content determined by end of input
+ }
+ throw THttpException() << TStringBuf("incompleted http response");
+}
+
+bool THttpParser::DecodeContent() {
+ if (!ContentEncoding_ || ContentEncoding_ == "identity" || ContentEncoding_ == "none") {
+ DecodedContent_ = Content_;
+ return false;
+ }
+
+ TMemoryInput in(Content_.data(), Content_.size());
+ if (ContentEncoding_ == "gzip") {
+ auto decompressor = TZLibDecompress(&in, ZLib::GZip);
+ if (!GzipAllowMultipleStreams_) {
+ decompressor.SetAllowMultipleStreams(false);
+ }
+ DecodedContent_ = decompressor.ReadAll();
+ } else if (ContentEncoding_ == "deflate") {
+
+ //https://tools.ietf.org/html/rfc1950
+ bool definitelyNoZlibHeader;
+ if (Content_.size() < 2) {
+ definitelyNoZlibHeader = true;
+ } else {
+ const ui16 cmf = static_cast<ui8>(Content_[0]);
+ const ui16 flg = static_cast<ui8>(Content_[1]);
+ definitelyNoZlibHeader = ((cmf << 8) | flg) % 31 != 0;
+ }
+
+ try {
+ DecodedContent_ = TZLibDecompress(&in, definitelyNoZlibHeader ? ZLib::Raw : ZLib::ZLib).ReadAll();
+ }
+ catch(...) {
+ if (definitelyNoZlibHeader) {
+ throw;
+ }
+ TMemoryInput retryInput(Content_.data(), Content_.size());
+ DecodedContent_ = TZLibDecompress(&retryInput, ZLib::Raw).ReadAll();
+ }
+ } else if (ContentEncoding_.StartsWith("z-")) {
+ // opposite for library/cpp/http/io/stream.h
+ const NBlockCodecs::ICodec* codec = nullptr;
+ try {
+ const TStringBuf codecName = TStringBuf(ContentEncoding_).SubStr(2);
+ if (codecName.StartsWith("zstd06") || codecName.StartsWith("zstd08")) {
+ ythrow NBlockCodecs::TNotFound() << codecName;
+ }
+ codec = NBlockCodecs::Codec(codecName);
+ } catch(const NBlockCodecs::TNotFound& exc) {
+ throw THttpParseException() << "Unsupported content-encoding method: " << exc.AsStrBuf();
+ }
+ NBlockCodecs::TDecodedInput decoder(&in, codec);
+ DecodedContent_ = decoder.ReadAll();
+ } else {
+ throw THttpParseException() << "Unsupported content-encoding method: " << ContentEncoding_;
+ }
+ return true;
+}
+
+void THttpParser::ApplyHeaderLine(const TStringBuf& name, const TStringBuf& val) {
+ if (AsciiEqualsIgnoreCase(name, TStringBuf("connection"))) {
+ KeepAlive_ = AsciiEqualsIgnoreCase(val, TStringBuf("keep-alive"));
+ } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-length"))) {
+ Y_ENSURE(val.size(), "NEH: Content-Length cannot be empty string. ");
+ ContentLength_ = FromString<ui64>(val);
+ HasContentLength_ = true;
+ } else if (AsciiEqualsIgnoreCase(name, TStringBuf("transfer-encoding"))) {
+ if (AsciiEqualsIgnoreCase(val, TStringBuf("chunked"))) {
+ ChunkInputState_ = new TChunkInputState();
+ }
+ } else if (AsciiEqualsIgnoreCase(name, TStringBuf("accept-encoding"))) {
+ TStringBuf encodings(val);
+ while (encodings.size()) {
+ TStringBuf enc = encodings.NextTok(',').After(' ').Before(' ');
+ if (!enc) {
+ continue;
+ }
+ TString s(enc);
+ s.to_lower();
+ AcceptEncodings_.insert(s);
+ }
+ } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-encoding"))) {
+ TString s(val);
+ s.to_lower();
+ ContentEncoding_ = s;
+ }
+}
diff --git a/library/cpp/http/push_parser/http_parser.h b/library/cpp/http/push_parser/http_parser.h
new file mode 100644
index 00000000000..8757a3ef9a2
--- /dev/null
+++ b/library/cpp/http/push_parser/http_parser.h
@@ -0,0 +1,165 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/yexception.h>
+#include <util/generic/hash_set.h>
+#include <util/string/cast.h>
+#include <library/cpp/http/io/stream.h>
+
+struct THttpVersion {
+ unsigned Major = 1;
+ unsigned Minor = 0;
+};
+
+//http requests parser for async/callbacks arch. (uggly state-machine)
+//usage, - call Parse(...), if returned 'true' - all message parsed,
+//external (non entered in message) bytes in input data counted by GetExtraDataSize()
+class THttpParser {
+public:
+ enum TMessageType {
+ Request,
+ Response
+ };
+
+ THttpParser(TMessageType mt = Response)
+ : Parser_(&THttpParser::FirstLineParser)
+ , MessageType_(mt)
+ {
+ }
+
+ inline void DisableCollectingHeaders() noexcept {
+ CollectHeaders_ = false;
+ }
+
+ inline void SetGzipAllowMultipleStreams(bool allow) noexcept {
+ GzipAllowMultipleStreams_ = allow;
+ }
+
+ /// @return true on end parsing (GetExtraDataSize() return amount not used bytes)
+ /// throw exception on bad http format (unsupported encoding, etc)
+ /// sz == 0 signaling end of input stream
+ bool Parse(const char* data, size_t sz) {
+ if (ParseImpl(data, sz)) {
+ DecodeContent();
+ return true;
+ }
+ return false;
+ }
+
+ const char* Data() const noexcept {
+ return Data_;
+ }
+ size_t GetExtraDataSize() const noexcept {
+ return ExtraDataSize_;
+ }
+
+ const TString& FirstLine() const noexcept {
+ return FirstLine_;
+ }
+
+ unsigned RetCode() const noexcept {
+ return RetCode_;
+ }
+
+ const THttpVersion& HttpVersion() const noexcept {
+ return HttpVersion_;
+ }
+
+ const THttpHeaders& Headers() const noexcept {
+ return Headers_;
+ }
+
+ bool IsKeepAlive() const noexcept {
+ return KeepAlive_;
+ }
+
+ bool GetContentLength(ui64& value) const noexcept {
+ if (!HasContentLength_) {
+ return false;
+ }
+
+ value = ContentLength_;
+ return true;
+ }
+
+ TString GetBestCompressionScheme() const;
+
+ const TString& Content() const noexcept {
+ return Content_;
+ }
+
+ const TString& DecodedContent() const noexcept {
+ return DecodedContent_;
+ }
+
+ void Prepare() {
+ HeaderLine_.reserve(128);
+ FirstLine_.reserve(128);
+ }
+
+private:
+ bool ParseImpl(const char* data, size_t sz) {
+ Data_ = data;
+ DataEnd_ = data + sz;
+ if (sz == 0) {
+ OnEof();
+ return true;
+ }
+ return (this->*Parser_)();
+ }
+ // stage parsers
+ bool FirstLineParser();
+ bool HeadersParser();
+ bool ContentParser();
+ bool ChunkedContentParser();
+ bool OnEndParsing();
+
+ // continue read to CurrentLine_
+ bool ReadLine();
+
+ void ParseHttpVersion(TStringBuf httpVersion);
+ void ParseHeaderLine();
+
+ void OnEof();
+ bool DecodeContent();
+
+ void ApplyHeaderLine(const TStringBuf& name, const TStringBuf& val);
+
+ typedef bool (THttpParser::*TParser)();
+
+ TParser Parser_; //current parser (stage)
+ TMessageType MessageType_ = Response;
+ bool CollectHeaders_ = true;
+ bool GzipAllowMultipleStreams_ = true;
+
+ // parsed data
+ const char* Data_ = nullptr;
+ const char* DataEnd_ = nullptr;
+ TString CurrentLine_;
+ TString HeaderLine_;
+
+ size_t ExtraDataSize_ = 0;
+
+ // headers
+ TString FirstLine_;
+ THttpVersion HttpVersion_;
+ unsigned RetCode_ = 0;
+ THttpHeaders Headers_;
+ bool KeepAlive_ = false;
+ THashSet<TString> AcceptEncodings_;
+
+ TString ContentEncoding_;
+ bool HasContentLength_ = false;
+ ui64 ContentLength_ = 0;
+
+ struct TChunkInputState {
+ size_t LeftBytes_ = 0;
+ bool ReadLastChunk_ = false;
+ };
+
+ TAutoPtr<TChunkInputState> ChunkInputState_;
+
+ TString Content_;
+ TString DecodedContent_;
+};