diff options
author | vvvv <vvvv@yandex-team.com> | 2024-11-01 15:41:40 +0300 |
---|---|---|
committer | vvvv <vvvv@yandex-team.com> | 2024-11-01 15:55:52 +0300 |
commit | 3325f745e67f7f442790822b5c9c5e9996708be7 (patch) | |
tree | f7318d68bbe8990092715436444b05297ce35777 /yql/essentials/utils/fetch | |
parent | 6dce3f1c71786f2694b73b1a5155efc58f4557dd (diff) | |
download | ydb-3325f745e67f7f442790822b5c9c5e9996708be7.tar.gz |
Moved yql/utils YQL-19206
Также была выделена жирная зависимость из yql/utils в yql/utils/network, в результате library/cpp/getopt была добавлена явно в те проекты, которые ее ранее наследовали, а не указывали явно
commit_hash:36aa4c41f807b4cbbf70a3ed7ac0a1a5079bb75d
Diffstat (limited to 'yql/essentials/utils/fetch')
-rw-r--r-- | yql/essentials/utils/fetch/fetch.cpp | 182 | ||||
-rw-r--r-- | yql/essentials/utils/fetch/fetch.h | 24 | ||||
-rw-r--r-- | yql/essentials/utils/fetch/ya.make | 16 |
3 files changed, 222 insertions, 0 deletions
diff --git a/yql/essentials/utils/fetch/fetch.cpp b/yql/essentials/utils/fetch/fetch.cpp new file mode 100644 index 0000000000..d35baeb097 --- /dev/null +++ b/yql/essentials/utils/fetch/fetch.cpp @@ -0,0 +1,182 @@ +#include "fetch.h" + +#include <yql/essentials/utils/log/log.h> + +#include <library/cpp/openssl/io/stream.h> +#include <library/cpp/http/misc/httpcodes.h> +#include <library/cpp/charset/ci_string.h> + +#include <util/network/socket.h> +#include <util/string/cast.h> +#include <util/generic/strbuf.h> + +namespace NYql { + +namespace { + +THttpURL ParseURL(const TStringBuf addr, NUri::TParseFlags features) { + THttpURL url; + THttpURL::TParsedState parsedState = url.Parse(addr, features, nullptr, 65536); + if (THttpURL::ParsedOK != parsedState) { + ythrow yexception() << "Bad URL: \"" << addr << "\", " << HttpURLParsedStateToString(parsedState); + } + return url; +} + +class TFetchResultImpl: public IFetchResult { +public: + TFetchResultImpl(const THttpURL& url, const THttpHeaders& additionalHeaders, TDuration timeout) { + TString host = url.Get(THttpURL::FieldHost); + TString path = url.PrintS(THttpURL::FlagPath | THttpURL::FlagQuery); + const char* p = url.Get(THttpURL::FieldPort); + ui16 port = 80; + bool https = false; + + if (url.Get(THttpURL::FieldScheme) == TStringBuf("https")) { + port = 443; + https = true; + } + + if (p) { + port = FromString<ui16>(p); + } + + TString req; + { + TStringOutput rqs(req); + TStringBuf userAgent = "User-Agent: Mozilla/5.0 (compatible; YQL/1.0)"; + + IOutputStream::TPart request[] = { + IOutputStream::TPart("GET ", 4), + IOutputStream::TPart(path.data(), path.size()), + IOutputStream::TPart(" HTTP/1.1", 9), + IOutputStream::TPart::CrLf(), + IOutputStream::TPart("Host: ", 6), + IOutputStream::TPart(host.data(), host.size()), + IOutputStream::TPart::CrLf(), + IOutputStream::TPart(userAgent.data(), userAgent.size()), + IOutputStream::TPart::CrLf(), + }; + rqs.Write(request, Y_ARRAY_SIZE(request)); + if (!additionalHeaders.Empty()) { + additionalHeaders.OutTo(&rqs); + } + rqs << "\r\n"; + } + + Socket.Reset(new TSocket(TNetworkAddress(host, port), timeout)); + SocketInput.Reset(new TSocketInput(*Socket)); + SocketOutput.Reset(new TSocketOutput(*Socket)); + + Socket->SetSocketTimeout(timeout.Seconds(), timeout.MilliSeconds() % 1000); + + if (https) { + Ssl.Reset(new TOpenSslClientIO(SocketInput.Get(), SocketOutput.Get())); + } + + { + THttpOutput ho(Ssl ? (IOutputStream*)Ssl.Get() : (IOutputStream*)SocketOutput.Get()); + (ho << req).Finish(); + } + HttpInput.Reset(new THttpInput(Ssl ? (IInputStream*)Ssl.Get() : (IInputStream*)SocketInput.Get())); + } + + THttpInput& GetStream() override { + return *HttpInput; + } + + unsigned GetRetCode() override { + return ParseHttpRetCode(HttpInput->FirstLine()); + } + + THttpURL GetRedirectURL(const THttpURL& baseUrl) override { + for (auto i = HttpInput->Headers().Begin(); i != HttpInput->Headers().End(); ++i) { + if (0 == TCiString::compare(i->Name(), TStringBuf("location"))) { + THttpURL target = ParseURL(i->Value(), THttpURL::FeaturesAll | NUri::TFeature::FeatureConvertHostIDN); + if (!target.IsValidAbs()) { + target.Merge(baseUrl); + } + return target; + } + } + ythrow yexception() << "Unknown redirect location from " << baseUrl.PrintS(); + } + + static TFetchResultPtr Fetch(const THttpURL& url, const THttpHeaders& additionalHeaders, const TDuration& timeout) { + return new TFetchResultImpl(url, additionalHeaders, timeout); + } + +private: + THolder<TSocket> Socket; + THolder<TSocketInput> SocketInput; + THolder<TSocketOutput> SocketOutput; + THolder<TOpenSslClientIO> Ssl; + THolder<THttpInput> HttpInput; +}; + +inline bool IsRedirectCode(unsigned code) { + switch (code) { + case HTTP_MOVED_PERMANENTLY: + case HTTP_FOUND: + case HTTP_SEE_OTHER: + case HTTP_TEMPORARY_REDIRECT: + return true; + } + return false; +} + +inline bool IsRetryCode(unsigned code) { + switch (code) { + case HTTP_REQUEST_TIME_OUT: + case HTTP_AUTHENTICATION_TIMEOUT: + case HTTP_TOO_MANY_REQUESTS: + case HTTP_GATEWAY_TIME_OUT: + case HTTP_SERVICE_UNAVAILABLE: + return true; + } + return false; +} + +} // unnamed + +THttpURL ParseURL(const TStringBuf addr) { + return ParseURL(addr, THttpURL::FeaturesAll | NUri::TFeature::FeatureConvertHostIDN | NUri::TFeature::FeatureNoRelPath); +} + +TFetchResultPtr Fetch(const THttpURL& url, const THttpHeaders& additionalHeaders, const TDuration& timeout, size_t retries, size_t redirects) { + THttpURL currentUrl = url; + for (size_t fetchNum = 0; fetchNum < redirects; ++fetchNum) { + unsigned responseCode = 0; + TFetchResultPtr fr; + size_t fetchTry = 0; + do { + fr = TFetchResultImpl::Fetch(currentUrl, additionalHeaders, timeout); + responseCode = fr->GetRetCode(); + } while (IsRetryCode(responseCode) && ++fetchTry < retries); + + if (responseCode >= 200 && responseCode < 300) { + return fr; + } + + if (responseCode == HTTP_NOT_MODIFIED) { + return fr; + } + + if (IsRedirectCode(responseCode)) { + currentUrl = fr->GetRedirectURL(currentUrl); + YQL_LOG(INFO) << "Got redirect to " << currentUrl.PrintS(); + continue; + } + + TString errorBody; + try { + errorBody = fr->GetStream().ReadAll(); + } catch (...) { + } + + ythrow yexception() << "Failed to fetch url '" << currentUrl.PrintS() << "' with code " << responseCode << ", body: " << errorBody; + } + ythrow yexception() << "Failed to fetch url '" << currentUrl.PrintS() << "': too many redirects"; +} + +} // NYql diff --git a/yql/essentials/utils/fetch/fetch.h b/yql/essentials/utils/fetch/fetch.h new file mode 100644 index 0000000000..d9e1c3c1a5 --- /dev/null +++ b/yql/essentials/utils/fetch/fetch.h @@ -0,0 +1,24 @@ +#pragma once + +#include <library/cpp/uri/http_url.h> +#include <library/cpp/http/io/headers.h> +#include <library/cpp/http/io/stream.h> + +#include <util/datetime/base.h> +#include <util/generic/string.h> +#include <util/generic/ptr.h> + +namespace NYql { + +struct IFetchResult: public TThrRefBase { + virtual THttpInput& GetStream() = 0; + virtual unsigned GetRetCode() = 0; + virtual THttpURL GetRedirectURL(const THttpURL& baseUrl) = 0; +}; + +using TFetchResultPtr = TIntrusivePtr<IFetchResult>; + +THttpURL ParseURL(const TStringBuf addr); +TFetchResultPtr Fetch(const THttpURL& url, const THttpHeaders& additionalHeaders = {}, const TDuration& timeout = TDuration::Max(), size_t retries = 3, size_t redirects = 10); + +} // NYql diff --git a/yql/essentials/utils/fetch/ya.make b/yql/essentials/utils/fetch/ya.make new file mode 100644 index 0000000000..e95b4fde60 --- /dev/null +++ b/yql/essentials/utils/fetch/ya.make @@ -0,0 +1,16 @@ +LIBRARY() + +SRCS( + fetch.cpp +) + +PEERDIR( + library/cpp/charset + library/cpp/http/io + library/cpp/http/misc + library/cpp/openssl/io + library/cpp/uri + yql/essentials/utils/log +) + +END() |