aboutsummaryrefslogtreecommitdiffstats
path: root/yql/essentials/utils/fetch
diff options
context:
space:
mode:
authorvvvv <vvvv@yandex-team.com>2024-11-01 15:41:40 +0300
committervvvv <vvvv@yandex-team.com>2024-11-01 15:55:52 +0300
commit3325f745e67f7f442790822b5c9c5e9996708be7 (patch)
treef7318d68bbe8990092715436444b05297ce35777 /yql/essentials/utils/fetch
parent6dce3f1c71786f2694b73b1a5155efc58f4557dd (diff)
downloadydb-3325f745e67f7f442790822b5c9c5e9996708be7.tar.gz
Moved yql/utils YQL-19206
Также была выделена жирная зависимость из yql/utils в yql/utils/network, в результате library/cpp/getopt была добавлена явно в те проекты, которые ее ранее наследовали, а не указывали явно commit_hash:36aa4c41f807b4cbbf70a3ed7ac0a1a5079bb75d
Diffstat (limited to 'yql/essentials/utils/fetch')
-rw-r--r--yql/essentials/utils/fetch/fetch.cpp182
-rw-r--r--yql/essentials/utils/fetch/fetch.h24
-rw-r--r--yql/essentials/utils/fetch/ya.make16
3 files changed, 222 insertions, 0 deletions
diff --git a/yql/essentials/utils/fetch/fetch.cpp b/yql/essentials/utils/fetch/fetch.cpp
new file mode 100644
index 0000000000..d35baeb097
--- /dev/null
+++ b/yql/essentials/utils/fetch/fetch.cpp
@@ -0,0 +1,182 @@
+#include "fetch.h"
+
+#include <yql/essentials/utils/log/log.h>
+
+#include <library/cpp/openssl/io/stream.h>
+#include <library/cpp/http/misc/httpcodes.h>
+#include <library/cpp/charset/ci_string.h>
+
+#include <util/network/socket.h>
+#include <util/string/cast.h>
+#include <util/generic/strbuf.h>
+
+namespace NYql {
+
+namespace {
+
+// Parses `addr` into a THttpURL using the supplied parser feature flags.
+// 65536 is forwarded as the fourth argument of THttpURL::Parse
+// (presumably the maximum accepted URL length -- confirm against library/cpp/uri).
+// Throws yexception naming the address and the parser state when the parser
+// reports anything other than ParsedOK.
+THttpURL ParseURL(const TStringBuf addr, NUri::TParseFlags features) {
+    THttpURL parsed;
+    const THttpURL::TParsedState state = parsed.Parse(addr, features, nullptr, 65536);
+    if (state == THttpURL::ParsedOK) {
+        return parsed;
+    }
+    ythrow yexception() << "Bad URL: \"" << addr << "\", " << HttpURLParsedStateToString(state);
+}
+
+// IFetchResult implementation that issues one HTTP(S) GET request over a newly
+// opened socket and keeps the connection objects alive while the response is read.
+class TFetchResultImpl: public IFetchResult {
+public:
+    // Connects to the host of `url`, sends a GET request for its path and query
+    // followed by `additionalHeaders`, and prepares the response stream.
+    // `timeout` is passed to the socket constructor and is also installed as the
+    // socket timeout. The port defaults to 80, or 443 for the "https" scheme,
+    // unless the URL specifies one explicitly.
+    // Exceptions thrown by port parsing, socket, TLS or HTTP stream setup propagate.
+    TFetchResultImpl(const THttpURL& url, const THttpHeaders& additionalHeaders, TDuration timeout) {
+        const TString host = url.Get(THttpURL::FieldHost);
+        const TString path = url.PrintS(THttpURL::FlagPath | THttpURL::FlagQuery);
+        const char* const explicitPort = url.Get(THttpURL::FieldPort);
+        const bool https = url.Get(THttpURL::FieldScheme) == TStringBuf("https");
+
+        const ui16 defaultPort = https ? 443 : 80;
+        ui16 port = defaultPort;
+        if (explicitPort) {
+            port = FromString<ui16>(explicitPort);
+        }
+
+        // The Host header has to match the URL authority, so a port that is not
+        // the scheme default must be included; otherwise servers that route by
+        // host:port may answer for the wrong virtual host.
+        TString hostHeader = host;
+        if (port != defaultPort) {
+            hostHeader += ':';
+            hostHeader += ToString(port);
+        }
+
+        TString req;
+        {
+            TStringOutput rqs(req);
+            rqs << "GET " << path << " HTTP/1.1\r\n"
+                << "Host: " << hostHeader << "\r\n"
+                << "User-Agent: Mozilla/5.0 (compatible; YQL/1.0)\r\n";
+            if (!additionalHeaders.Empty()) {
+                additionalHeaders.OutTo(&rqs);
+            }
+            // Empty line terminates the header section.
+            rqs << "\r\n";
+        }
+
+        Socket = MakeHolder<TSocket>(TNetworkAddress(host, port), timeout);
+        SocketInput = MakeHolder<TSocketInput>(*Socket);
+        SocketOutput = MakeHolder<TSocketOutput>(*Socket);
+
+        Socket->SetSocketTimeout(timeout.Seconds(), timeout.MilliSeconds() % 1000);
+
+        // Plain sockets by default; for https both directions go through the TLS layer.
+        IInputStream* in = SocketInput.Get();
+        IOutputStream* out = SocketOutput.Get();
+        if (https) {
+            Ssl = MakeHolder<TOpenSslClientIO>(SocketInput.Get(), SocketOutput.Get());
+            in = Ssl.Get();
+            out = Ssl.Get();
+        }
+
+        {
+            THttpOutput ho(out);
+            (ho << req).Finish();
+        }
+        HttpInput = MakeHolder<THttpInput>(in);
+    }
+
+    // Returns the HTTP response stream.
+    THttpInput& GetStream() override {
+        return *HttpInput;
+    }
+
+    // Returns the status code parsed from the first line of the response.
+    unsigned GetRetCode() override {
+        return ParseHttpRetCode(HttpInput->FirstLine());
+    }
+
+    // Returns the URL from the first "Location" response header (name compared
+    // case-insensitively). A target that is not a valid absolute URL is merged
+    // with `baseUrl`. Throws yexception when the response has no such header.
+    THttpURL GetRedirectURL(const THttpURL& baseUrl) override {
+        for (auto i = HttpInput->Headers().Begin(); i != HttpInput->Headers().End(); ++i) {
+            if (0 == TCiString::compare(i->Name(), TStringBuf("location"))) {
+                THttpURL target = ParseURL(i->Value(), THttpURL::FeaturesAll | NUri::TFeature::FeatureConvertHostIDN);
+                if (!target.IsValidAbs()) {
+                    target.Merge(baseUrl);
+                }
+                return target;
+            }
+        }
+        ythrow yexception() << "Unknown redirect location from " << baseUrl.PrintS();
+    }
+
+    // Performs a single request (no retries, no redirect handling) and wraps the result.
+    static TFetchResultPtr Fetch(const THttpURL& url, const THttpHeaders& additionalHeaders, const TDuration& timeout) {
+        return MakeIntrusive<TFetchResultImpl>(url, additionalHeaders, timeout);
+    }
+
+private:
+    // Declaration order matters: members are destroyed in reverse order, so the
+    // streams layered on top of the socket go away before the socket itself.
+    THolder<TSocket> Socket;
+    THolder<TSocketInput> SocketInput;
+    THolder<TSocketOutput> SocketOutput;
+    THolder<TOpenSslClientIO> Ssl; // Set only for https URLs.
+    THolder<THttpInput> HttpInput;
+};
+
+// Returns true for response codes that carry a redirect target to follow.
+// 308 (HTTP_PERMANENT_REDIRECT) is included alongside 301/302/303/307:
+// without it a permanently moved resource is reported as a fetch failure.
+inline bool IsRedirectCode(unsigned code) {
+    switch (code) {
+        case HTTP_MOVED_PERMANENTLY:
+        case HTTP_FOUND:
+        case HTTP_SEE_OTHER:
+        case HTTP_TEMPORARY_REDIRECT:
+        case HTTP_PERMANENT_REDIRECT:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Tells whether a response code means the same request should be attempted again
+// (request/authentication/gateway timeouts, too many requests, service unavailable).
+inline bool IsRetryCode(unsigned code) {
+    bool retry = false;
+    switch (code) {
+        case HTTP_REQUEST_TIME_OUT:
+        case HTTP_AUTHENTICATION_TIMEOUT:
+        case HTTP_TOO_MANY_REQUESTS:
+        case HTTP_GATEWAY_TIME_OUT:
+        case HTTP_SERVICE_UNAVAILABLE:
+            retry = true;
+            break;
+        default:
+            break;
+    }
+    return retry;
+}
+
+} // unnamed
+
+// Public entry point: parses `addr` with the full feature set plus IDN host
+// conversion and the FeatureNoRelPath flag. Throws yexception on parse failure.
+THttpURL ParseURL(const TStringBuf addr) {
+    const NUri::TParseFlags features =
+        THttpURL::FeaturesAll | NUri::TFeature::FeatureConvertHostIDN | NUri::TFeature::FeatureNoRelPath;
+    return ParseURL(addr, features);
+}
+
+// Fetches `url` with HTTP GET, retrying on retryable status codes and following redirects.
+//
+// url               - address to request first.
+// additionalHeaders - extra headers sent with every request, including redirected ones.
+// timeout           - per-connection timeout handed to the underlying socket.
+// retries           - upper bound on attempts per URL while the server keeps answering
+//                     with a retryable code; at least one attempt is always made.
+// redirects         - maximum number of redirects that will be followed. The initial
+//                     request is always issued, so 0 means "do not follow redirects".
+//
+// Returns the result for any 2xx response and for HTTP_NOT_MODIFIED.
+// Throws yexception for any other final status (the message includes the response
+// body when it can be read) and when the redirect limit is exceeded.
+TFetchResultPtr Fetch(const THttpURL& url, const THttpHeaders& additionalHeaders, const TDuration& timeout, size_t retries, size_t redirects) {
+    THttpURL currentUrl = url;
+    for (size_t followed = 0;; ++followed) {
+        unsigned responseCode = 0;
+        TFetchResultPtr fr;
+        size_t attempt = 0;
+        do {
+            fr = TFetchResultImpl::Fetch(currentUrl, additionalHeaders, timeout);
+            responseCode = fr->GetRetCode();
+        } while (IsRetryCode(responseCode) && ++attempt < retries);
+
+        const bool success = responseCode >= 200 && responseCode < 300;
+        if (success || responseCode == HTTP_NOT_MODIFIED) {
+            return fr;
+        }
+
+        if (IsRedirectCode(responseCode)) {
+            if (followed >= redirects) {
+                // Redirect budget exhausted: report it instead of following further.
+                break;
+            }
+            currentUrl = fr->GetRedirectURL(currentUrl);
+            YQL_LOG(INFO) << "Got redirect to " << currentUrl.PrintS();
+            continue;
+        }
+
+        // The body is only extra diagnostics; failing to read it must not hide
+        // the status code, so read errors are deliberately ignored here.
+        TString errorBody;
+        try {
+            errorBody = fr->GetStream().ReadAll();
+        } catch (...) {
+        }
+
+        ythrow yexception() << "Failed to fetch url '" << currentUrl.PrintS() << "' with code " << responseCode << ", body: " << errorBody;
+    }
+    ythrow yexception() << "Failed to fetch url '" << currentUrl.PrintS() << "': too many redirects";
+}
+
+} // NYql
diff --git a/yql/essentials/utils/fetch/fetch.h b/yql/essentials/utils/fetch/fetch.h
new file mode 100644
index 0000000000..d9e1c3c1a5
--- /dev/null
+++ b/yql/essentials/utils/fetch/fetch.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <library/cpp/uri/http_url.h>
+#include <library/cpp/http/io/headers.h>
+#include <library/cpp/http/io/stream.h>
+
+#include <util/datetime/base.h>
+#include <util/generic/string.h>
+#include <util/generic/ptr.h>
+
+namespace NYql {
+
+// Interface to the outcome of one HTTP request. Instances are handled through
+// TIntrusivePtr (see TFetchResultPtr).
+struct IFetchResult: public TThrRefBase {
+ // Returns the stream from which the HTTP response is read.
+ virtual THttpInput& GetStream() = 0;
+ // Returns the HTTP status code of the response.
+ virtual unsigned GetRetCode() = 0;
+ // Returns the redirect target of the response. The implementation in fetch.cpp
+ // takes it from the "Location" header, merges a non-absolute target with
+ // `baseUrl`, and throws yexception when the header is missing.
+ virtual THttpURL GetRedirectURL(const THttpURL& baseUrl) = 0;
+};
+
+// Reference-counted handle to a fetch result.
+using TFetchResultPtr = TIntrusivePtr<IFetchResult>;
+
+// Parses `addr` into a THttpURL; throws yexception describing the parser state on failure.
+THttpURL ParseURL(const TStringBuf addr);
+// Requests `url` with HTTP GET, sending `additionalHeaders` with each request.
+// Returns the result for a 2xx or HTTP_NOT_MODIFIED response; throws yexception for
+// other final statuses or when the redirect chain is too long. `retries` bounds the
+// attempts per URL on retryable status codes, `redirects` bounds the redirect chain.
+TFetchResultPtr Fetch(const THttpURL& url, const THttpHeaders& additionalHeaders = {}, const TDuration& timeout = TDuration::Max(), size_t retries = 3, size_t redirects = 10);
+
+} // NYql
diff --git a/yql/essentials/utils/fetch/ya.make b/yql/essentials/utils/fetch/ya.make
new file mode 100644
index 0000000000..e95b4fde60
--- /dev/null
+++ b/yql/essentials/utils/fetch/ya.make
@@ -0,0 +1,16 @@
+# Library with the HTTP(S) fetch helper: builds fetch.cpp and pulls in the
+# charset, HTTP, OpenSSL stream, URI and YQL logging libraries it includes.
+LIBRARY()
+
+SRCS(
+ fetch.cpp
+)
+
+PEERDIR(
+ library/cpp/charset
+ library/cpp/http/io
+ library/cpp/http/misc
+ library/cpp/openssl/io
+ library/cpp/uri
+ yql/essentials/utils/log
+)
+
+END()