aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/http/fetch
diff options
context:
space:
mode:
author: Devtools Arcadia <arcadia-devtools@yandex-team.ru>, 2022-02-07 18:08:42 +0300
committer: Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>, 2022-02-07 18:08:42 +0300
commit: 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree: e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/http/fetch
download: ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/http/fetch')
-rw-r--r-- library/cpp/http/fetch/exthttpcodes.cpp | 266
-rw-r--r-- library/cpp/http/fetch/exthttpcodes.h | 141
-rw-r--r-- library/cpp/http/fetch/http_digest.cpp | 206
-rw-r--r-- library/cpp/http/fetch/http_digest.h | 47
-rw-r--r-- library/cpp/http/fetch/http_socket.cpp | 206
-rw-r--r-- library/cpp/http/fetch/httpagent.h | 316
-rw-r--r-- library/cpp/http/fetch/httpfetcher.h | 171
-rw-r--r-- library/cpp/http/fetch/httpfsm.h | 104
-rw-r--r-- library/cpp/http/fetch/httpfsm.rl6 | 684
-rw-r--r-- library/cpp/http/fetch/httpfsm_ut.cpp | 591
-rw-r--r-- library/cpp/http/fetch/httpheader.cpp | 7
-rw-r--r-- library/cpp/http/fetch/httpheader.h | 287
-rw-r--r-- library/cpp/http/fetch/httpload.cpp | 373
-rw-r--r-- library/cpp/http/fetch/httpload.h | 307
-rw-r--r-- library/cpp/http/fetch/httpparser.h | 372
-rw-r--r-- library/cpp/http/fetch/httpparser_ut.cpp | 231
-rw-r--r-- library/cpp/http/fetch/httpzreader.h | 295
-rw-r--r-- library/cpp/http/fetch/library-htfetch_ut_hreflang_in.h | 155
-rw-r--r-- library/cpp/http/fetch/library-htfetch_ut_hreflang_out.h | 3
-rw-r--r-- library/cpp/http/fetch/sockhandler.h | 130
-rw-r--r-- library/cpp/http/fetch/ut/ya.make | 12
-rw-r--r-- library/cpp/http/fetch/ya.make | 38
22 files changed, 4942 insertions, 0 deletions
diff --git a/library/cpp/http/fetch/exthttpcodes.cpp b/library/cpp/http/fetch/exthttpcodes.cpp
new file mode 100644
index 0000000000..acc05650c8
--- /dev/null
+++ b/library/cpp/http/fetch/exthttpcodes.cpp
@@ -0,0 +1,266 @@
+#include "exthttpcodes.h"
+
+#include <cstring>
+
+const ui16 CrazyServer = ShouldDelete | MarkSuspect; // ShouldDelete + MarkSuspect combined; also the default prepare_flags() assigns to codes missing from HTTP_FLAG
+
+struct http_flag { // one row of the HTTP_FLAG table: a status code and the flags assigned to it
+ ui16 http; // status code (standard HTTP code or an ExtHttpCodes value); 0 marks the end of the table
+ ui16 flag; // bitwise OR of HttpFlags values
+};
+static http_flag HTTP_FLAG[] = { // explicit code -> flags table consumed by prepare_flags(); codes not listed here receive a default there
+ {HTTP_CONTINUE, MarkSuspect}, // 100
+ {HTTP_SWITCHING_PROTOCOLS, CrazyServer}, // 101
+ {HTTP_PROCESSING, CrazyServer}, // 102
+
+ {HTTP_OK, ShouldReindex}, // 200
+ {HTTP_CREATED, CrazyServer}, // 201
+ {HTTP_ACCEPTED, ShouldDelete}, // 202
+ {HTTP_NON_AUTHORITATIVE_INFORMATION, ShouldReindex}, // 203
+ {HTTP_NO_CONTENT, ShouldDelete}, // 204
+ {HTTP_RESET_CONTENT, ShouldDelete}, // 205
+ {HTTP_PARTIAL_CONTENT, ShouldReindex}, // 206
+ {HTTP_MULTI_STATUS, CrazyServer}, // 207
+ {HTTP_ALREADY_REPORTED, CrazyServer}, // 208
+ {HTTP_IM_USED, CrazyServer}, // 226
+
+ {HTTP_MULTIPLE_CHOICES, CheckLinks | ShouldDelete}, // 300
+ {HTTP_MOVED_PERMANENTLY, CheckLocation | ShouldDelete | MoveRedir}, // 301
+ {HTTP_FOUND, CheckLocation | ShouldDelete | MoveRedir}, // 302
+ {HTTP_SEE_OTHER, CheckLocation | ShouldDelete | MoveRedir}, // 303
+ {HTTP_NOT_MODIFIED, 0}, // 304
+ {HTTP_USE_PROXY, ShouldDelete}, // 305
+ {HTTP_TEMPORARY_REDIRECT, CheckLocation | ShouldDelete | MoveRedir}, // 307
+ {HTTP_PERMANENT_REDIRECT, CheckLocation | ShouldDelete | MoveRedir}, // 308
+
+ {HTTP_BAD_REQUEST, CrazyServer}, // 400
+ {HTTP_UNAUTHORIZED, ShouldDelete}, // 401
+ {HTTP_PAYMENT_REQUIRED, ShouldDelete}, // 402
+ {HTTP_FORBIDDEN, ShouldDelete}, // 403
+ {HTTP_NOT_FOUND, ShouldDelete}, // 404
+ {HTTP_METHOD_NOT_ALLOWED, ShouldDelete}, // 405
+ {HTTP_NOT_ACCEPTABLE, ShouldDelete}, // 406
+ {HTTP_PROXY_AUTHENTICATION_REQUIRED, CrazyServer}, // 407
+ {HTTP_REQUEST_TIME_OUT, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 408
+ {HTTP_CONFLICT, MarkSuspect}, // 409
+ {HTTP_GONE, ShouldDelete}, // 410
+ {HTTP_LENGTH_REQUIRED, CrazyServer}, // 411
+ {HTTP_PRECONDITION_FAILED, CrazyServer}, // 412
+ {HTTP_REQUEST_ENTITY_TOO_LARGE, CrazyServer}, // 413
+ {HTTP_REQUEST_URI_TOO_LARGE, ShouldDelete}, // 414
+ {HTTP_UNSUPPORTED_MEDIA_TYPE, CrazyServer}, // 415
+ {HTTP_REQUESTED_RANGE_NOT_SATISFIABLE, CrazyServer}, // 416
+ {HTTP_EXPECTATION_FAILED, ShouldDelete}, // 417
+ {HTTP_I_AM_A_TEAPOT, CrazyServer}, // 418
+ {HTTP_AUTHENTICATION_TIMEOUT, ShouldDelete}, // 419
+
+ {HTTP_MISDIRECTED_REQUEST, CrazyServer}, // 421
+ {HTTP_UNPROCESSABLE_ENTITY, CrazyServer}, // 422
+ {HTTP_LOCKED, ShouldDelete}, // 423
+ {HTTP_FAILED_DEPENDENCY, CrazyServer}, // 424
+ {HTTP_UPGRADE_REQUIRED, ShouldDelete}, // 426
+ {HTTP_PRECONDITION_REQUIRED, ShouldDelete}, // 428
+ {HTTP_TOO_MANY_REQUESTS, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 429
+ {HTTP_UNAVAILABLE_FOR_LEGAL_REASONS, ShouldDelete}, // 451
+
+ {HTTP_INTERNAL_SERVER_ERROR, MarkSuspect}, // 500
+ {HTTP_NOT_IMPLEMENTED, ShouldDelete | ShouldDisconnect}, // 501
+ {HTTP_BAD_GATEWAY, MarkSuspect}, // 502
+ {HTTP_SERVICE_UNAVAILABLE, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 503
+ {HTTP_GATEWAY_TIME_OUT, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 504
+ {HTTP_HTTP_VERSION_NOT_SUPPORTED, CrazyServer | ShouldDisconnect}, // 505
+
+ {HTTP_VARIANT_ALSO_NEGOTIATES, CrazyServer | ShouldDisconnect}, // 506
+ {HTTP_INSUFFICIENT_STORAGE, CrazyServer | ShouldDisconnect}, // 507
+ {HTTP_LOOP_DETECTED, CrazyServer | ShouldDisconnect}, // 508
+ {HTTP_BANDWIDTH_LIMIT_EXCEEDED, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 509
+ {HTTP_NOT_EXTENDED, ShouldDelete}, // 510
+ {HTTP_NETWORK_AUTHENTICATION_REQUIRED, ShouldDelete}, // 511
+
+ // custom
+ {HTTP_BAD_RESPONSE_HEADER, CrazyServer}, // 1000
+ {HTTP_CONNECTION_LOST, ShouldRetry}, // 1001
+ {HTTP_BODY_TOO_LARGE, ShouldDelete | CanBeFake}, // 1002
+ {HTTP_ROBOTS_TXT_DISALLOW, ShouldDelete}, // 1003
+ {HTTP_BAD_URL, ShouldDelete}, // 1004
+ {HTTP_BAD_MIME, ShouldDelete}, // 1005
+ {HTTP_DNS_FAILURE, ShouldDisconnect | MarkSuspect}, // 1006
+ {HTTP_BAD_STATUS_CODE, CrazyServer}, // 1007
+ {HTTP_BAD_HEADER_STRING, CrazyServer}, // 1008
+ {HTTP_BAD_CHUNK, CrazyServer}, // 1009
+ {HTTP_CONNECT_FAILED, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 1010
+ {HTTP_FILTER_DISALLOW, ShouldDelete}, // 1011
+ {HTTP_LOCAL_EIO, ShouldRetry}, // 1012
+ {HTTP_BAD_CONTENT_LENGTH, ShouldDelete}, // 1013
+ {HTTP_BAD_ENCODING, ShouldDelete}, // 1014
+ {HTTP_LENGTH_UNKNOWN, ShouldDelete}, // 1015
+ {HTTP_HEADER_EOF, ShouldRetry | CanBeFake}, // 1016
+ {HTTP_MESSAGE_EOF, ShouldRetry | CanBeFake}, // 1017
+ {HTTP_CHUNK_EOF, ShouldRetry | CanBeFake}, // 1018
+ {HTTP_PAST_EOF, ShouldRetry | ShouldDelete | CanBeFake}, // 1019
+ {HTTP_HEADER_TOO_LARGE, ShouldDelete}, // 1020
+ {HTTP_URL_TOO_LARGE, ShouldDelete}, // 1021
+ {HTTP_INTERRUPTED, 0}, // 1022
+ {HTTP_CUSTOM_NOT_MODIFIED, 0}, // 1023
+ {HTTP_BAD_CONTENT_ENCODING, ShouldDelete}, // 1024
+ {HTTP_PROXY_UNKNOWN, 0}, // 1030
+ {HTTP_PROXY_REQUEST_TIME_OUT, 0}, // 1031
+ {HTTP_PROXY_INTERNAL_ERROR, 0}, // 1032
+ {HTTP_PROXY_CONNECT_FAILED, 0}, // 1033
+ {HTTP_PROXY_CONNECTION_LOST, 0}, // 1034
+ {HTTP_PROXY_NO_PROXY, 0}, // 1035
+ {HTTP_PROXY_ERROR, 0}, // 1036
+ {HTTP_SSL_ERROR, 0}, // 1037
+ {HTTP_CACHED_COPY_NOT_FOUND, 0}, // 1038
+ {HTTP_TIMEDOUT_WHILE_BYTES_RECEIVING, ShouldRetry}, // 1039
+ {HTTP_FETCHER_BAD_RESPONSE, 0}, // 1040
+ {HTTP_FETCHER_MB_ERROR, 0}, // 1041
+ {HTTP_SSL_CERT_ERROR, 0}, // 1042
+
+ // Custom (replace HTTP 200/304)
+ {EXT_HTTP_MIRRMOVE, 0}, // 2000
+ {EXT_HTTP_MANUAL_DELETE, ShouldDelete}, // 2001
+ {EXT_HTTP_NOTUSED2, ShouldDelete}, // 2002
+ {EXT_HTTP_NOTUSED3, ShouldDelete}, // 2003
+ {EXT_HTTP_REFRESH, ShouldDelete | CheckLinks | MoveRedir}, // 2004
+ {EXT_HTTP_NOINDEX, ShouldDelete | CheckLinks}, // 2005
+ {EXT_HTTP_BADCODES, ShouldDelete}, // 2006
+ {EXT_HTTP_SITESTAT, ShouldDelete}, // 2007
+ {EXT_HTTP_IOERROR, ShouldDelete}, // 2008
+ {EXT_HTTP_BASEERROR, ShouldDelete}, // 2009
+ {EXT_HTTP_PARSERROR, ShouldDelete | CanBeFake}, // 2010
+ {EXT_HTTP_BAD_CHARSET, ShouldDelete | CheckLinks}, // 2011
+ {EXT_HTTP_BAD_LANGUAGE, ShouldDelete | CheckLinks}, // 2012
+ {EXT_HTTP_NUMERERROR, ShouldDelete}, // 2013
+ {EXT_HTTP_EMPTYDOC, ShouldDelete | CheckLinks}, // 2014
+ {EXT_HTTP_HUGEDOC, ShouldDelete}, // 2015
+ {EXT_HTTP_LINKGARBAGE, ShouldDelete}, // 2016
+ {EXT_HTTP_PARSERFAIL, ShouldDelete}, // 2019
+ {EXT_HTTP_GZIPERROR, ShouldDelete}, // 2020
+ {EXT_HTTP_MANUAL_DELETE_URL, ShouldDelete}, // 2022
+ {EXT_HTTP_CUSTOM_PARTIAL_CONTENT, ShouldReindex}, // 2023
+ {EXT_HTTP_EMPTY_RESPONSE, ShouldDelete}, // 2024
+ {EXT_HTTP_REL_CANONICAL, ShouldDelete | CheckLinks | MoveRedir}, // 2025
+ {0, 0}}; // terminator: prepare_flags() stops at the first row whose code is 0
+
+// Builds the code -> flags lookup array (EXT_HTTP_CODE_MAX entries) from a
+// table terminated by a row whose code is 0. The array is a function-local
+// static, so every call returns (and re-fills) the same storage.
+static ui16* prepare_flags(http_flag* arg) {
+    static ui16 flags[EXT_HTTP_CODE_MAX];
+    const size_t indexMask = EXT_HTTP_CODE_MAX - 1;
+
+    // Writes the flags of every explicitly listed code into the array.
+    auto applyTable = [&]() {
+        for (const http_flag* row = arg; row->http; ++row) {
+            flags[row->http & indexMask] = row->flag;
+        }
+    };
+
+    // Default value for codes that are not listed in the table.
+    for (size_t code = 0; code < EXT_HTTP_CODE_MAX; ++code) {
+        flags[code] = CrazyServer;
+    }
+
+    // Flags for the listed codes.
+    applyTable();
+
+    // For the standard error-code ranges, take the flags of the first code of
+    // each hundred (x00) and assign them to every other code of that hundred.
+    for (size_t base = 0; base < 1000; base += 100) {
+        const ui16 groupFlags = flags[base];
+        for (size_t offset = 1; offset < 100; ++offset) {
+            flags[base + offset] = groupFlags;
+        }
+    }
+
+    // The previous loop overwrote some of the flags listed in the table;
+    // restore them.
+    applyTable();
+
+    return flags;
+}
+
+ui16* http2status = prepare_flags(HTTP_FLAG); // global flags array, filled during static initialization; Http2Status() in exthttpcodes.h indexes it with (code & (EXT_HTTP_CODE_MAX - 1))
+
+// Returns a human-readable description of a status code.
+// Codes below HTTP_CODE_MAX are delegated to HttpCodeStr(); the extended codes
+// listed below have their own text; everything else (including all codes from
+// 2000 upward, see the TODO) yields "Unknown HTTP code".
+TStringBuf ExtHttpCodeStr(int code) noexcept {
+    if (code < HTTP_CODE_MAX) {
+        return HttpCodeStr(code);
+    }
+
+    struct TCodeText {
+        int Code;
+        const char* Text;
+    };
+    static const TCodeText CODE_TEXTS[] = {
+        {HTTP_BAD_RESPONSE_HEADER, "Bad response header"},
+        {HTTP_CONNECTION_LOST, "Connection lost"},
+        {HTTP_BODY_TOO_LARGE, "Body too large"},
+        {HTTP_ROBOTS_TXT_DISALLOW, "robots.txt disallow"},
+        {HTTP_BAD_URL, "Bad url"},
+        {HTTP_BAD_MIME, "Bad mime type"},
+        {HTTP_DNS_FAILURE, "Dns failure"},
+        {HTTP_BAD_STATUS_CODE, "Bad status code"},
+        {HTTP_BAD_HEADER_STRING, "Bad header string"},
+        {HTTP_BAD_CHUNK, "Bad chunk"},
+        {HTTP_CONNECT_FAILED, "Connect failed"},
+        {HTTP_FILTER_DISALLOW, "Filter disallow"},
+        {HTTP_LOCAL_EIO, "Local eio"},
+        {HTTP_BAD_CONTENT_LENGTH, "Bad content length"},
+        {HTTP_BAD_ENCODING, "Bad encoding"},
+        {HTTP_LENGTH_UNKNOWN, "Length unknown"},
+        {HTTP_HEADER_EOF, "Header EOF"},
+        {HTTP_MESSAGE_EOF, "Message EOF"},
+        {HTTP_CHUNK_EOF, "Chunk EOF"},
+        {HTTP_PAST_EOF, "Past EOF"},
+        {HTTP_HEADER_TOO_LARGE, "Header is too large"},
+        {HTTP_URL_TOO_LARGE, "Url is too large"},
+        {HTTP_INTERRUPTED, "Interrupted"},
+        {HTTP_CUSTOM_NOT_MODIFIED, "Signature detector thinks that doc is not modified"},
+        {HTTP_BAD_CONTENT_ENCODING, "Bad content encoding"},
+        {HTTP_NO_RESOURCES, "No resources"},
+        {HTTP_FETCHER_SHUTDOWN, "Fetcher shutdown"},
+        {HTTP_CHUNK_TOO_LARGE, "Chunk size is too big"},
+        {HTTP_SERVER_BUSY, "Server is busy"},
+        {HTTP_SERVICE_UNKNOWN, "Service is unknown"},
+        {HTTP_PROXY_UNKNOWN, "Zora: unknown error"},
+        {HTTP_PROXY_REQUEST_TIME_OUT, "Zora: request time out"},
+        {HTTP_PROXY_INTERNAL_ERROR, "Zora: internal server error"},
+        {HTTP_PROXY_CONNECT_FAILED, "Spider proxy connect failed"},
+        {HTTP_PROXY_CONNECTION_LOST, "Spider proxy connection lost"},
+        {HTTP_PROXY_NO_PROXY, "Spider proxy no proxy alive in region"},
+        {HTTP_PROXY_ERROR, "Spider proxy returned custom error"},
+        {HTTP_SSL_ERROR, "Ssl library returned error"},
+        {HTTP_CACHED_COPY_NOT_FOUND, "Cached copy for the url is not available"},
+        {HTTP_TIMEDOUT_WHILE_BYTES_RECEIVING, "Timed out while bytes receiving"},
+
+        // TODO: messages for >2000 codes
+    };
+
+    for (const TCodeText& entry : CODE_TEXTS) {
+        if (entry.Code == code) {
+            return TStringBuf(entry.Text);
+        }
+    }
+    return TStringBuf("Unknown HTTP code");
+}
diff --git a/library/cpp/http/fetch/exthttpcodes.h b/library/cpp/http/fetch/exthttpcodes.h
new file mode 100644
index 0000000000..6b525052cd
--- /dev/null
+++ b/library/cpp/http/fetch/exthttpcodes.h
@@ -0,0 +1,141 @@
+#pragma once
+
+#include <util/system/defaults.h>
+#include <library/cpp/http/misc/httpcodes.h>
+
+enum ExtHttpCodes { // extended (non-standard) status codes, numbered from 1000 upward
+ // Custom
+ HTTP_EXTENDED = 1000, // lowest extended code; same value as HTTP_BAD_RESPONSE_HEADER
+ HTTP_BAD_RESPONSE_HEADER = 1000,
+ HTTP_CONNECTION_LOST = 1001,
+ HTTP_BODY_TOO_LARGE = 1002,
+ HTTP_ROBOTS_TXT_DISALLOW = 1003,
+ HTTP_BAD_URL = 1004,
+ HTTP_BAD_MIME = 1005,
+ HTTP_DNS_FAILURE = 1006,
+ HTTP_BAD_STATUS_CODE = 1007,
+ HTTP_BAD_HEADER_STRING = 1008,
+ HTTP_BAD_CHUNK = 1009,
+ HTTP_CONNECT_FAILED = 1010,
+ HTTP_FILTER_DISALLOW = 1011,
+ HTTP_LOCAL_EIO = 1012,
+ HTTP_BAD_CONTENT_LENGTH = 1013,
+ HTTP_BAD_ENCODING = 1014,
+ HTTP_LENGTH_UNKNOWN = 1015,
+ HTTP_HEADER_EOF = 1016,
+ HTTP_MESSAGE_EOF = 1017,
+ HTTP_CHUNK_EOF = 1018,
+ HTTP_PAST_EOF = 1019,
+ HTTP_HEADER_TOO_LARGE = 1020,
+ HTTP_URL_TOO_LARGE = 1021,
+ HTTP_INTERRUPTED = 1022,
+ HTTP_CUSTOM_NOT_MODIFIED = 1023,
+ HTTP_BAD_CONTENT_ENCODING = 1024,
+ HTTP_NO_RESOURCES = 1025,
+ HTTP_FETCHER_SHUTDOWN = 1026,
+ HTTP_CHUNK_TOO_LARGE = 1027,
+ HTTP_SERVER_BUSY = 1028,
+ HTTP_SERVICE_UNKNOWN = 1029,
+ HTTP_PROXY_UNKNOWN = 1030,
+ HTTP_PROXY_REQUEST_TIME_OUT = 1031,
+ HTTP_PROXY_INTERNAL_ERROR = 1032,
+ HTTP_PROXY_CONNECT_FAILED = 1033,
+ HTTP_PROXY_CONNECTION_LOST = 1034,
+ HTTP_PROXY_NO_PROXY = 1035,
+ HTTP_PROXY_ERROR = 1036,
+ HTTP_SSL_ERROR = 1037,
+ HTTP_CACHED_COPY_NOT_FOUND = 1038,
+ HTTP_TIMEDOUT_WHILE_BYTES_RECEIVING = 1039,
+ HTTP_FETCHER_BAD_RESPONSE = 1040,
+ HTTP_FETCHER_MB_ERROR = 1041,
+ HTTP_SSL_CERT_ERROR = 1042,
+ HTTP_PROXY_REQUEST_CANCELED = 1051, // note: 1043-1050 are not assigned in this enum
+
+ // Custom (replace HTTP 200/304)
+ EXT_HTTP_EXT_SUCCESS_BEGIN = 2000, // to check if code variable is in success interval
+ EXT_HTTP_MIRRMOVE = 2000,
+ EXT_HTTP_MANUAL_DELETE = 2001,
+ EXT_HTTP_NOTUSED2 = 2002,
+ EXT_HTTP_NOTUSED3 = 2003,
+ EXT_HTTP_REFRESH = 2004,
+ EXT_HTTP_NOINDEX = 2005,
+ EXT_HTTP_BADCODES = 2006,
+ EXT_HTTP_SITESTAT = 2007,
+ EXT_HTTP_IOERROR = 2008,
+ EXT_HTTP_BASEERROR = 2009,
+ EXT_HTTP_PARSERROR = 2010,
+ EXT_HTTP_BAD_CHARSET = 2011,
+ EXT_HTTP_BAD_LANGUAGE = 2012,
+ EXT_HTTP_NUMERERROR = 2013,
+ EXT_HTTP_EMPTYDOC = 2014,
+ EXT_HTTP_HUGEDOC = 2015,
+ EXT_HTTP_LINKGARBAGE = 2016,
+ EXT_HTTP_EXDUPLICATE = 2017,
+ EXT_HTTP_FILTERED = 2018,
+ EXT_HTTP_PARSERFAIL = 2019, // parser crashed (in this case image spider will redownload such document)
+ EXT_HTTP_GZIPERROR = 2020,
+ EXT_HTTP_CLEANPARAM = 2021,
+ EXT_HTTP_MANUAL_DELETE_URL = 2022,
+ EXT_HTTP_CUSTOM_PARTIAL_CONTENT = 2023,
+ EXT_HTTP_EMPTY_RESPONSE = 2024,
+ EXT_HTTP_REL_CANONICAL = 2025,
+
+ EXT_HTTP_EXT_SUCCESS_END = 3000, // to check if code variable is in success interval
+ EXT_HTTP_HOSTFILTER = 3001,
+ EXT_HTTP_URLFILTER = 3002,
+ EXT_HTTP_SUFFIXFILTER = 3003,
+ EXT_HTTP_DOMAINFILTER = 3004,
+ EXT_HTTP_EXTDOMAINFILTER = 3005,
+ EXT_HTTP_PORTFILTER = 3006,
+ EXT_HTTP_MIRROR = 3007,
+ EXT_HTTP_DEEPDIR = 3008,
+ EXT_HTTP_DUPDIRS = 3009,
+ EXT_HTTP_REGEXP = 3010,
+ EXT_HTTP_OLDDELETED = 3012,
+ EXT_HTTP_PENALTY = 3013,
+ EXT_HTTP_POLICY = 3015,
+ EXT_HTTP_TOOOLD = 3016,
+ EXT_HTTP_GARBAGE = 3017,
+ EXT_HTTP_FOREIGN = 3018,
+ EXT_HTTP_EXT_REGEXP = 3019,
+ EXT_HTTP_HOPS = 3020,
+ EXT_HTTP_SELRANK = 3021,
+ EXT_HTTP_NOLINKS = 3022,
+ EXT_HTTP_WRONGMULTILANG = 3023,
+ EXT_HTTP_SOFTMIRRORS = 3024,
+ EXT_HTTP_BIGLEVEL = 3025,
+
+ // fast robot codes
+
+ EXT_HTTP_FASTHOPS = 4000,
+ EXT_HTTP_NODOC = 4001,
+
+ EXT_HTTP_MAX // implicit value: one more than the previous enumerator (4002)
+};
+
+enum HttpFlags { // single-bit flags; OR-ed together into the ui16 per-code masks that Http2Status() returns
+ // connection
+ ShouldDisconnect = 1,
+ ShouldRetry = 2,
+ // UNUSED 4
+
+ // indexer
+ ShouldReindex = 8,
+ ShouldDelete = 16,
+ CheckLocation = 32,
+ CheckLinks = 64,
+ MarkSuspect = 128,
+ // UNUSED 256
+ // UNUSED 512
+ MoveRedir = 1024,
+ CanBeFake = 2048, // highest bit in use; all flags fit in a ui16
+};
+
+const size_t EXT_HTTP_CODE_MAX = 1 << 12; // size of the flags lookup array (4096); a power of two so that (code & (EXT_HTTP_CODE_MAX - 1)) can serve as the array index
+
+// Returns the HttpFlags bitmask stored for the given status code.
+// The code is reduced with a bit mask to the range [0, EXT_HTTP_CODE_MAX), so
+// every int value maps to a valid index of the http2status array.
+static inline int Http2Status(int code) {
+    extern ui16* http2status;
+    const size_t index = code & (EXT_HTTP_CODE_MAX - 1);
+    return http2status[index];
+}
+
+TStringBuf ExtHttpCodeStr(int code) noexcept; // text description of a status code; defined in exthttpcodes.cpp, returns "Unknown HTTP code" for extended codes without a message
diff --git a/library/cpp/http/fetch/http_digest.cpp b/library/cpp/http/fetch/http_digest.cpp
new file mode 100644
index 0000000000..1eaa02b7f2
--- /dev/null
+++ b/library/cpp/http/fetch/http_digest.cpp
@@ -0,0 +1,206 @@
+#include "http_digest.h"
+
+#include <library/cpp/digest/md5/md5.h>
+#include <util/stream/output.h>
+#include <util/stream/str.h>
+
+/************************************************************/
+/************************************************************/
+static const char* WWW_PREFIX = "Authorization: Digest "; // start of the request header line assembled by processHeader()
+
+/************************************************************/
+// Creates a handler with no credentials, no remembered nonce and no prepared
+// header line.
+httpDigestHandler::httpDigestHandler()
+    : User_{nullptr}
+    , Password_{nullptr}
+    , Nonce_{nullptr}
+    , NonceCount_{0}
+    , HeaderInstruction_{nullptr}
+{
+}
+
+/************************************************************/
+httpDigestHandler::~httpDigestHandler() { // releases the heap buffers owned by the handler
+ clear(); // frees Nonce_ and HeaderInstruction_
+}
+
+/************************************************************/
+// Frees the buffers owned by the handler (remembered nonce, prepared header
+// line) and resets every field, including the non-owned credential pointers.
+void httpDigestHandler::clear() {
+    free(HeaderInstruction_);
+    HeaderInstruction_ = nullptr;
+
+    free(Nonce_);
+    Nonce_ = nullptr;
+    NonceCount_ = 0;
+
+    User_ = nullptr;
+    Password_ = nullptr;
+}
+
+/************************************************************/
+// Resets the handler and remembers the credentials for later use.
+// Only the pointers are stored (no copy is made), so the strings must stay
+// alive for as long as the handler uses them. If either argument is null the
+// handler is left without credentials.
+void httpDigestHandler::setAuthorization(const char* user, const char* password) {
+    clear();
+    if (!user || !password) {
+        return;
+    }
+    User_ = user;
+    Password_ = password;
+}
+
+/************************************************************/
+const char* httpDigestHandler::getHeaderInstruction() const { // header line built by the last successful processHeader(), or nullptr
+ return HeaderInstruction_; // still owned by the handler; freed by clear() and by the next processHeader() that passes its initial checks
+}
+
+/************************************************************/
+// Fills outCNonce with the current time (seconds, printed as a decimal
+// number) unless it already holds a non-empty string.
+// The buffer size is not known here; the caller must provide enough room.
+void httpDigestHandler::generateCNonce(char* outCNonce) {
+    if (*outCNonce) {
+        return; // a cnonce is already present, keep it
+    }
+    sprintf(outCNonce, "%ld", static_cast<long>(time(nullptr)));
+}
+
+/************************************************************/
+// Feeds a NUL-terminated string (without the terminator) into the MD5 context.
+inline void addMD5(MD5& ctx, const char* value) {
+    const size_t length = strlen(value);
+    ctx.Update(reinterpret_cast<const unsigned char*>(value), length);
+}
+
+// Feeds the first len bytes of value into the MD5 context.
+inline void addMD5(MD5& ctx, const char* value, int len) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(value);
+    ctx.Update(bytes, len);
+}
+
+// Feeds the ":" field separator used by the digest formulas into the context.
+inline void addMD5Sep(MD5& ctx) {
+    static const char SEPARATOR[] = ":";
+    addMD5(ctx, SEPARATOR, 1);
+}
+
+/************************************************************/
+/* calculate H(A1) as per spec */
+// Writes H(A1) in hex form to outSessionKey.
+//   algorithm MD5      : H(A1) = MD5(user ":" realm ":" password)
+//   algorithm MD5-sess : H(A1) = MD5(MD5(user ":" realm ":" password) ":" nonce ":" cnonce),
+//                        where the inner MD5 is the raw 16-byte digest.
+// For MD5-sess, outCNonce is filled by generateCNonce() if it is empty.
+// User_, Password_, hd.realm and hd.nonce must be non-null (processHeader()
+// checks this before calling).
+void httpDigestHandler::digestCalcHA1(const THttpAuthHeader& hd,
+                                      char* outSessionKey,
+                                      char* outCNonce) {
+    MD5 ctx;
+    ctx.Init();
+    addMD5(ctx, User_);
+    addMD5Sep(ctx);
+    addMD5(ctx, hd.realm);
+    addMD5Sep(ctx);
+    addMD5(ctx, Password_);
+
+    if (hd.algorithm == 1) { //MD5-sess
+        unsigned char digest[16];
+        ctx.Final(digest);
+
+        generateCNonce(outCNonce);
+
+        ctx.Init();
+        ctx.Update(digest, 16);
+        addMD5Sep(ctx);
+        addMD5(ctx, hd.nonce);
+        addMD5Sep(ctx);
+        addMD5(ctx, outCNonce);
+    }
+
+    // Finalize exactly once. Previously the MD5-sess branch called End() and
+    // then fell through to this second End() on the already finalized context,
+    // which overwrote the session key that had just been computed.
+    ctx.End(outSessionKey);
+}
+
+/************************************************************/
+/* calculate request-digest/response-digest as per HTTP Digest spec */
+// Writes the hex response digest to outResponse.
+//   H(A1) comes from digestCalcHA1(); H(A2) = MD5(method ":" path)
+//   (the auth-int variant is not supported).
+//   with qop=auth : response = MD5(H(A1) ":" nonce ":" nc ":" cnonce ":" "auth" ":" H(A2))
+//   without qop   : response = MD5(H(A1) ":" nonce ":" H(A2))
+// Exactly 8 characters of nonceCount are hashed. outCNonce is filled by
+// generateCNonce() when it is empty and a cnonce is required.
+void httpDigestHandler::digestCalcResponse(const THttpAuthHeader& hd,
+                                           const char* path,
+                                           const char* method,
+                                           const char* nonceCount,
+                                           char* outResponse,
+                                           char* outCNonce) {
+    char ha1Hex[33];
+    digestCalcHA1(hd, ha1Hex, outCNonce);
+
+    MD5 hasher;
+    char ha2Hex[33];
+    hasher.Init();
+    addMD5(hasher, method);
+    addMD5Sep(hasher);
+    addMD5(hasher, path);
+    //ignore auth-int
+    hasher.End(ha2Hex);
+
+    hasher.Init();
+    addMD5(hasher, ha1Hex, 32);
+    addMD5Sep(hasher);
+    addMD5(hasher, Nonce_);
+    addMD5Sep(hasher);
+
+    if (hd.qop_auth) {
+        // generateCNonce() leaves a non-empty cnonce untouched.
+        generateCNonce(outCNonce);
+
+        addMD5(hasher, nonceCount, 8);
+        addMD5Sep(hasher);
+        addMD5(hasher, outCNonce);
+        addMD5Sep(hasher);
+        addMD5(hasher, "auth", 4);
+        addMD5Sep(hasher);
+    }
+
+    addMD5(hasher, ha2Hex, 32);
+    hasher.End(outResponse);
+}
+
+/************************************************************/
+// Builds the "Authorization: Digest ..." request header line answering the
+// given server challenge and stores it for getHeaderInstruction().
+//
+//   header - parsed challenge; must request auth and carry realm and nonce
+//   path   - request URI, used for the "uri" field and for H(A2)
+//   method - HTTP method, used for H(A2)
+//   cnonce - optional client nonce; when null or empty one is generated if
+//            the challenge needs it. At most 49 characters are used.
+//
+// Returns false (and prepares no header) when no credentials are set, the
+// challenge is unusable, path/method is null, or memory allocation fails.
+// The nonce counter restarts at 1 whenever the server nonce changes.
+bool httpDigestHandler::processHeader(const THttpAuthHeader* header,
+                                      const char* path,
+                                      const char* method,
+                                      const char* cnonce) {
+    if (!User_ || !header || !header->use_auth || !header->realm || !header->nonce)
+        return false;
+    // path and method are hashed with strlen(); reject null instead of crashing.
+    if (!path || !method)
+        return false;
+
+    free(HeaderInstruction_);
+    HeaderInstruction_ = nullptr;
+
+    if (Nonce_ && strcmp(Nonce_, header->nonce) != 0) {
+        free(Nonce_);
+        Nonce_ = nullptr;
+        NonceCount_ = 0;
+    }
+    if (!Nonce_) {
+        Nonce_ = strdup(header->nonce);
+        NonceCount_ = 0;
+        if (!Nonce_)
+            return false; // out of memory
+    }
+    NonceCount_++;
+
+    char nonceCount[20];
+    snprintf(nonceCount, sizeof(nonceCount), "%08d", NonceCount_);
+
+    // Bounded copy: the caller-supplied cnonce may be longer than the buffer.
+    // The (possibly truncated) value is used both in the hash and in the
+    // header, so the result stays self-consistent.
+    char CNonce[50];
+    snprintf(CNonce, sizeof(CNonce), "%s", cnonce ? cnonce : "");
+
+    char response[33];
+    digestCalcResponse(*header, path, method, nonceCount, response, CNonce);
+
+    //digest-response = 1#( username | realm | nonce | digest-uri
+    // | response | [ algorithm ] | [cnonce] |
+    // [opaque] | [message-qop] |
+    // [nonce-count] | [auth-param] )
+
+    TStringStream out;
+    out << WWW_PREFIX << "username=\"" << User_ << "\"";
+    out << ", realm=\"" << header->realm << "\"";
+    out << ", nonce=\"" << header->nonce << "\"";
+    out << ", uri=\"" << path << "\"";
+    if (header->algorithm == 1)
+        out << ", algorithm=MD5-sess";
+    else
+        out << ", algorithm=MD5";
+    if (header->qop_auth)
+        out << ", qop=auth";
+    out << ", nc=" << nonceCount;
+    if (CNonce[0])
+        out << ", cnonce=\"" << CNonce << "\"";
+    out << ", response=\"" << response << "\"";
+    if (header->opaque)
+        out << ", opaque=\"" << header->opaque << "\"";
+    out << "\r\n";
+
+    TString s_out = out.Str();
+    HeaderInstruction_ = strdup(s_out.c_str());
+
+    // Report failure if the header line could not be allocated.
+    return HeaderInstruction_ != nullptr;
+}
+
+/************************************************************/
+/************************************************************/
diff --git a/library/cpp/http/fetch/http_digest.h b/library/cpp/http/fetch/http_digest.h
new file mode 100644
index 0000000000..3b1872d70b
--- /dev/null
+++ b/library/cpp/http/fetch/http_digest.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "httpheader.h"
+
+#include <util/system/compat.h>
+#include <library/cpp/http/misc/httpcodes.h>
+
+// Client side of HTTP Digest authentication: keeps the credentials and the
+// last server nonce, and builds the "Authorization: Digest ..." header line
+// for a given challenge.
+//
+// NOTE(review): the class owns Nonce_ and HeaderInstruction_ (released with
+// free() in clear()) but uses the implicitly generated copy operations, so
+// copying a handler that holds these buffers would lead to a double free.
+class httpDigestHandler {
+protected:
+    const char* User_;        // not owned; set by setAuthorization()
+    const char* Password_;    // not owned; set by setAuthorization()
+    char* Nonce_;             // owned copy of the last server nonce
+    int NonceCount_;          // number of headers built for the current nonce
+    char* HeaderInstruction_; // owned; last header line built by processHeader()
+
+    // Frees the owned buffers and resets all fields.
+    void clear();
+
+    // Fills outCNonce with a generated value if it is an empty string.
+    void generateCNonce(char* outCNonce);
+
+    // Computes H(A1) in hex form into outSessionKey.
+    void digestCalcHA1(const THttpAuthHeader& hd,
+                       char* outSessionKey,
+                       char* outCNonce);
+
+    // Computes the response digest in hex form into outResponse.
+    // Parameter names follow the definition and the call in processHeader():
+    // the request path comes first, then the HTTP method.
+    void digestCalcResponse(const THttpAuthHeader& hd,
+                            const char* path,
+                            const char* method,
+                            const char* nonceCount,
+                            char* outResponse,
+                            char* outCNonce);
+
+public:
+    httpDigestHandler();
+    ~httpDigestHandler();
+
+    // Resets the handler and stores the credential pointers (no copy is made).
+    void setAuthorization(const char* user,
+                          const char* password);
+
+    // Builds the Authorization header line for the given challenge; returns
+    // false if no header could be produced.
+    bool processHeader(const THttpAuthHeader* header,
+                       const char* path,
+                       const char* method,
+                       const char* cnonce = nullptr);
+
+    // True when no credentials are set.
+    bool empty() const {
+        return (!User_);
+    }
+
+    // Header line built by the last successful processHeader(), or nullptr.
+    const char* getHeaderInstruction() const;
+};
diff --git a/library/cpp/http/fetch/http_socket.cpp b/library/cpp/http/fetch/http_socket.cpp
new file mode 100644
index 0000000000..1524ef04a8
--- /dev/null
+++ b/library/cpp/http/fetch/http_socket.cpp
@@ -0,0 +1,206 @@
+#include "httpload.h"
+#include "http_digest.h"
+
+/************************************************************/
+
+#ifdef USE_GNUTLS
+
+#include <gcrypt.h>
+#include <gnutls/gnutls.h>
+#include <util/network/init.h>
+#include <util/network/socket.h>
+#include <util/system/mutex.h>
+
+/********************************************************/
+// HTTPS handler is used as implementation of
+// socketAbstractHandler for work through HTTPS protocol
+
+class socketSecureHandler: public socketRegularHandler { // GnuTLS-based variant of socketRegularHandler; compiled only when USE_GNUTLS is defined
+protected:
+ bool IsValid_; // true only between a successful handshake in Connect() and Disconnect()
+ gnutls_session Session_; // GnuTLS session, created in Connect()
+ gnutls_certificate_credentials Credits_; // certificate credentials, allocated in Connect(), released in Disconnect()
+
+public:
+ socketSecureHandler();
+ virtual ~socketSecureHandler();
+
+ virtual bool Good(); // socket is good and the TLS session is valid
+ virtual int Connect(const TAddrList& addrs, TDuration Timeout); // TCP connect followed by the TLS handshake; 0 on success
+ virtual void Disconnect(); // closes the TLS session, frees the credentials, then disconnects the socket
+ virtual void shutdown(); // no-op in this class
+ virtual bool send(const char* message, ssize_t messlen); // writes through the TLS session; false on error or when there is no valid session
+ virtual bool peek(); // always returns true in this class
+ virtual ssize_t read(void* buffer, ssize_t buflen); // reads through the TLS session; 0 when there is no valid session
+};
+
+/********************************************************/
+/********************************************************/
+// libgcrypt lock callback: allocates a TMutex and stores it in *priv.
+// Returns 0 on success and -1 if creating the mutex threw (in which case
+// *priv is left untouched).
+static int gcry_pthread_mutex_init(void** priv) {
+    try {
+        *priv = new TMutex;
+        return 0;
+    } catch (...) {
+        return -1;
+    }
+}
+
+// libgcrypt lock callback: destroys the TMutex stored in *lock.
+// The slot itself is not reset. Always returns 0.
+static int gcry_pthread_mutex_destroy(void** lock) {
+    TMutex* mutex = static_cast<TMutex*>(*lock);
+    delete mutex;
+    return 0;
+}
+
+// libgcrypt lock callback: acquires the TMutex stored in *lock.
+// Always returns 0.
+static int gcry_pthread_mutex_lock(void** lock) {
+    TMutex* mutex = static_cast<TMutex*>(*lock);
+    mutex->Acquire();
+    return 0;
+}
+
+// libgcrypt lock callback: releases the TMutex stored in *lock.
+// Always returns 0.
+static int gcry_pthread_mutex_unlock(void** lock) {
+    TMutex* mutex = static_cast<TMutex*>(*lock);
+    mutex->Release();
+    return 0;
+}
+
+static struct gcry_thread_cbs gcry_threads_pthread = // libgcrypt thread callback table backed by the TMutex helpers above; registered in https_initor
+ {
+ GCRY_THREAD_OPTION_PTHREAD, NULL,
+ gcry_pthread_mutex_init, gcry_pthread_mutex_destroy,
+ gcry_pthread_mutex_lock, gcry_pthread_mutex_unlock,
+ NULL, NULL, NULL, NULL, // remaining callbacks are not provided
+ NULL, NULL, NULL, NULL};
+
+/********************************************************/
+struct https_initor { // one-time global setup/teardown for the TLS support in this file
+ https_initor() {
+ gcry_control(GCRYCTL_SET_THREAD_CBS, &gcry_threads_pthread); // install the TMutex-based locking callbacks; NOTE(review): presumably has to precede gnutls_global_init() - confirm against the libgcrypt docs
+ gnutls_global_init(); // NOTE(review): return value is not checked
+ InitNetworkSubSystem();
+ }
+
+ ~https_initor() {
+ gnutls_global_deinit();
+ }
+};
+
+static https_initor _initor; // file-local instance: setup runs during static initialization, teardown at exit; NOTE(review): a leading-underscore name at global scope is reserved to the implementation
+
+/********************************************************/
+// Creates a handler without a TLS session: the session and credential handles
+// are value-initialized and the handler is marked as not valid.
+socketSecureHandler::socketSecureHandler()
+    : socketRegularHandler()
+    , IsValid_{false}
+    , Session_{}
+    , Credits_{}
+{
+}
+
+/********************************************************/
+// Closes the TLS session and the connection if a session is still valid.
+socketSecureHandler::~socketSecureHandler() {
+    if (!IsValid_) {
+        return;
+    }
+    Disconnect();
+}
+
+/********************************************************/
+// The handler is usable only when the underlying socket is good and a TLS
+// session has been established.
+bool socketSecureHandler::Good() {
+    if (!Socket_.Good()) {
+        return false;
+    }
+    return IsValid_;
+}
+
+/********************************************************/
+int socketSecureHandler::Connect(const TAddrList& addrs, TDuration Timeout) { // TCP connect via the base class, then a GnuTLS client handshake; returns 0 on success
+ IsValid_ = false; // the session counts as valid only after a successful handshake below
+
+ int ret = socketRegularHandler::Connect(addrs, Timeout);
+ if (ret) // non-zero result of the base class is passed through unchanged
+ return ret;
+
+ gnutls_certificate_allocate_credentials(&Credits_); // NOTE(review): return values of the four GnuTLS setup calls are not checked
+ gnutls_init(&Session_, GNUTLS_CLIENT);
+ gnutls_set_default_priority(Session_);
+ gnutls_credentials_set(Session_, GNUTLS_CRD_CERTIFICATE, Credits_);
+
+ SOCKET fd = Socket_;
+ gnutls_transport_set_ptr(Session_, (gnutls_transport_ptr)fd); // GnuTLS performs its I/O on the already connected socket descriptor
+
+ ret = gnutls_handshake(Session_);
+
+ if (ret < 0) { // handshake failed: report to stderr and release the TLS objects
+ fprintf(stderr, "*** Handshake failed\n");
+ gnutls_perror(ret);
+
+ gnutls_deinit(Session_);
+ if (Credits_) {
+ gnutls_certificate_free_credentials(Credits_);
+ Credits_ = 0;
+ }
+ return 1; // NOTE(review): socketRegularHandler::Disconnect() is not called on this path, so the TCP connection made above is left as is
+ }
+
+ IsValid_ = true;
+ return !IsValid_; // always 0 here
+}
+
+/********************************************************/
+// Shuts down the TLS session (if one is valid), releases the certificate
+// credentials (if allocated) and finally disconnects the underlying socket.
+void socketSecureHandler::Disconnect() {
+    const bool hadSession = IsValid_;
+    IsValid_ = false;
+    if (hadSession) {
+        gnutls_bye(Session_, GNUTLS_SHUT_RDWR);
+        gnutls_deinit(Session_);
+    }
+
+    if (Credits_) {
+        gnutls_certificate_free_credentials(Credits_);
+        Credits_ = 0;
+    }
+
+    socketRegularHandler::Disconnect();
+}
+
+/********************************************************/
+void socketSecureHandler::shutdown() { // intentionally empty: this override performs no action
+}
+
+/********************************************************/
+// Sends messlen bytes of message through the TLS session.
+// Returns false when there is no valid session or GnuTLS reports an error
+// (negative result); the number of bytes actually sent is not inspected.
+bool socketSecureHandler::send(const char* message, ssize_t messlen) {
+    if (IsValid_) {
+        const ssize_t sent = gnutls_record_send(Session_, message, messlen);
+        return sent >= 0;
+    }
+    return false;
+}
+
+/********************************************************/
+bool socketSecureHandler::peek() { // always returns true; the pending-data check below is disabled
+ //ssize_t rv = gnutls_record_check_pending(mSession);
+ //return rv>0;
+ return true;
+}
+
+/********************************************************/
+// Reads up to buflen bytes from the TLS session into buffer and returns the
+// result of gnutls_record_recv().
+// NOTE(review): without a valid session this returns 0 (the original spelled
+// it as `false`), which callers cannot tell apart from a zero-length read -
+// confirm whether an error value is expected here.
+ssize_t socketSecureHandler::read(void* buffer, ssize_t buflen) {
+    if (IsValid_) {
+        return gnutls_record_recv(Session_, static_cast<char*>(buffer), buflen);
+    }
+    return 0;
+}
+
+#endif
+
+/************************************************************/
+// Creates a socket handler matching the URL scheme: a plain handler for http
+// and, when built with USE_GNUTLS, a TLS handler for https.
+// Returns nullptr for invalid URLs and unsupported schemes. The returned
+// object is allocated with new; releasing it is up to the caller.
+socketAbstractHandler* socketHandlerFactory::chooseHandler(const THttpURL& url) {
+    const bool validUrl = url.IsValidGlobal();
+
+    if (validUrl && url.GetScheme() == THttpURL::SchemeHTTP) {
+        return new socketRegularHandler;
+    }
+
+#ifdef USE_GNUTLS
+    if (validUrl && url.GetScheme() == THttpURL::SchemeHTTPS) {
+        return new socketSecureHandler;
+    }
+#endif
+
+    return nullptr;
+}
+
+/************************************************************/
+socketHandlerFactory socketHandlerFactory::sInstance; // definition of the factory's static instance member
+/************************************************************/
diff --git a/library/cpp/http/fetch/httpagent.h b/library/cpp/http/fetch/httpagent.h
new file mode 100644
index 0000000000..96475cc05d
--- /dev/null
+++ b/library/cpp/http/fetch/httpagent.h
@@ -0,0 +1,316 @@
+#pragma once
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+
+#include <library/cpp/uri/http_url.h>
+#include <util/datetime/base.h>
+#include <util/network/hostip.h>
+#include <util/network/ip.h>
+#include <util/network/sock.h>
+#include <util/generic/scope.h>
+#include <util/generic/utility.h>
+#include <util/string/cast.h>
+
+#include "exthttpcodes.h"
+#include "sockhandler.h"
+
+ // Default resolver used by THttpAgent: turns a host/port pair into a list of
+ // IPv4/IPv6 addresses via TNetworkAddress.
+ class TIpResolver {
+ public:
+     // Returns every AF_INET / AF_INET6 address found for `host`:`port`.
+     // Other address families are ignored. A TNetworkResolutionError yields an
+     // empty list instead of propagating.
+     TAddrList Resolve(const char* host, TIpPort port) const {
+         TAddrList resolved;
+         try {
+             TNetworkAddress address(host, port);
+             for (auto it = address.Begin(); it != address.End(); ++it) {
+                 const struct addrinfo& info = *it;
+                 if (info.ai_family == AF_INET) {
+                     resolved.push_back(new NAddr::TIPv4Addr(*reinterpret_cast<sockaddr_in*>(info.ai_addr)));
+                 } else if (info.ai_family == AF_INET6) {
+                     resolved.push_back(new NAddr::TIPv6Addr(*reinterpret_cast<sockaddr_in6*>(info.ai_addr)));
+                 }
+             }
+         } catch (const TNetworkResolutionError&) {
+             // Resolution failed: report "no addresses", dropping partial results.
+             return TAddrList();
+         }
+         return resolved;
+     }
+ };
+
+ // Dispatch helpers that let THttpAgent work with two resolver flavours:
+ // types exposing Resolve(host, port) and types exposing only
+ // GetHostIP(host, &ip).
+ namespace NResolverHelpers {
+     Y_HAS_MEMBER(Resolve);
+
+     // Selected when TResolver has a Resolve member: simply forwards to it.
+     template <typename TResolver>
+     std::enable_if_t<TClassHasResolve<TResolver>::value, TAddrList> Resolve(const TResolver& r, const char* host, TIpPort port) {
+         return r.Resolve(host, port);
+     }
+
+     // Selected otherwise: asks GetHostIP for a single IPv4 address.
+     // A non-zero GetHostIP result or a zero address both produce an empty list.
+     template <typename TResolver>
+     std::enable_if_t<!TClassHasResolve<TResolver>::value, TAddrList> Resolve(const TResolver& r, const char* host, TIpPort port) {
+         ui32 address = 0;
+         const bool failed = r.GetHostIP(host, &address);
+         if (failed || address == 0) {
+             return TAddrList();
+         }
+
+         return TAddrList::MakeV4Addr(address, port);
+     }
+ }
+
+ // Wraps an arbitrary resolver type and exposes the uniform
+ // Resolve(host, port) -> TAddrList interface, delegating to
+ // NResolverHelpers::Resolve to pick the right underlying call.
+ template <typename TBase>
+ class TIpResolverWrapper {
+ private:
+     TBase Base;
+
+ public:
+     TIpResolverWrapper() = default;
+
+     // Builds the wrapped resolver from `base`, preserving its value category.
+     // std::forward needs its template argument spelled out; without it the
+     // call cannot be resolved as soon as this constructor is instantiated.
+     template <typename T>
+     TIpResolverWrapper(T&& base)
+         : Base(std::forward<T>(base))
+     {
+     }
+
+     // Resolves `host`:`port` through the wrapped resolver.
+     TAddrList Resolve(const char* host, TIpPort port) const {
+         return NResolverHelpers::Resolve(Base, host, port);
+     }
+ };
+
+ // Builds and sends HTTP/1.1 requests over a TSocketHandler and exposes the
+ // raw read() used by higher layers to fetch the reply.
+ //
+ // Hostheader and Footer are owned C strings managed exclusively with
+ // malloc()/free(). AltFooter, PostData and Method are borrowed pointers that
+ // must outlive the requests using them.
+ // NOTE(review): the class owns raw buffers but does not define copy
+ // operations; copying an agent would free the buffers twice.
+ template <class TSocketHandler = TSimpleSocketHandler, class TDnsClient = TIpResolver>
+ class THttpAgent {
+ public:
+     THttpAgent()
+         : Persistent(0)
+         // NOTE(review): 150 microseconds looks unusually small for a network
+         // timeout - confirm the intended unit.
+         , Timeout(TDuration::MicroSeconds(150))
+         , Hostheader(nullptr)
+         , Footer(nullptr)
+         , pHostBeg(nullptr)
+         , pHostEnd(nullptr)
+         , AltFooter(nullptr)
+         , PostData(nullptr)
+         , PostDataLen(0)
+         , Method(nullptr)
+         , MethodLen(0)
+         , HostheaderLen(0)
+     {
+         SetIdentification("YandexSomething/1.0", "webadmin@yandex.ru");
+     }
+
+     ~THttpAgent() {
+         Disconnect();
+         free(Hostheader);
+         free(Footer);
+     }
+
+     // Rebuilds the default request footer ("User-Agent: ...\r\nFrom: ...\r\n").
+     // Either argument may be null, in which case its header line is omitted.
+     void SetIdentification(const char* user_agent, const char* http_from) {
+         free(Footer);
+         size_t len = user_agent ? strlen(user_agent) + 15 : 0;
+         len += http_from ? strlen(http_from) + 9 : 0;
+         len += 3;
+         Footer = (char*)malloc(len);
+         // Start from an empty string so the strcat() calls below are valid
+         // even when user_agent is null.
+         Footer[0] = 0;
+         if (user_agent)
+             strcat(strcat(strcat(Footer, "User-Agent: "), user_agent), "\r\n");
+         if (http_from)
+             strcat(strcat(strcat(Footer, "From: "), http_from), "\r\n");
+     }
+
+     // Installs a caller-owned footer that replaces the default one in requests.
+     void SetUserAgentFooter(const char* altFooter) {
+         AltFooter = altFooter;
+     }
+
+     // Registers a caller-owned request body; a non-null body makes the
+     // request a POST unless an explicit method is set.
+     void SetPostData(const char* postData, size_t postDataLen) {
+         PostData = postData;
+         PostDataLen = postDataLen;
+     }
+
+     // Overrides the request method with the first `methodLen` bytes of `method`.
+     void SetMethod(const char* method, size_t methodLen) {
+         Method = method;
+         MethodLen = methodLen;
+     }
+
+     // deprecated
+     ui32 GetIp() const {
+         return Addrs.GetV4Addr().first;
+     }
+
+     // This agent always reports the plain http scheme.
+     int GetScheme() const {
+         return THttpURL::SchemeHTTP;
+     }
+     void SetTimeout(TDuration tim) {
+         Timeout = tim;
+     }
+
+     // A zero connect timeout means "use Timeout" (see RequestGet).
+     void SetConnectTimeout(TDuration timeout) {
+         ConnectTimeout = timeout;
+     }
+
+     // Non-zero when the connection is not persistent or the socket is not good.
+     int Disconnected() {
+         return !Persistent || !Socket.Good();
+     }
+
+     // Resolves `hostname` and selects it as the request target.
+     // Returns 1 when resolution yields no address, otherwise the result of
+     // the address-list overload (0 on success).
+     int SetHost(const char* hostname, TIpPort port) {
+         Disconnect();
+         TAddrList addrs = DnsClient.Resolve(hostname, port);
+         if (!addrs.size()) {
+             return 1;
+         }
+
+         return SetHost(hostname, port, addrs);
+     }
+
+     // Selects `hostname`:`port` with pre-resolved addresses and rebuilds the
+     // Host header (the port is omitted for port 80). Returns 0.
+     int SetHost(const char* hostname, TIpPort port, const TAddrList& addrs) {
+         Disconnect();
+         Addrs = addrs;
+         // "Host: " + name + ':' + up to 5 port digits + "\r\n" + NUL fits in +20.
+         ReserveHostheader(strlen(hostname) + 20);
+         if (port == 80)
+             snprintf(Hostheader, HostheaderLen, "Host: %s\r\n", hostname);
+         else
+             snprintf(Hostheader, HostheaderLen, "Host: %s:%u\r\n", hostname, (unsigned)port);
+         UpdateHostBounds();
+         // convert hostname to lower case since some web servers don't like
+         // upper case (Task ROBOT-562)
+         for (char* p = pHostBeg; p < pHostEnd; p++)
+             *p = (char)tolower((unsigned char)*p); // unsigned char: negative values are UB for tolower
+         return 0;
+     }
+
+     // deprecated v4-only
+     int SetHost(const char* hostname, TIpPort port, ui32 ip) {
+         return SetHost(hostname, port, TAddrList::MakeV4Addr(ip, port));
+     }
+
+     // Replaces only the Host header text; addresses and connection are untouched.
+     // The buffer is managed with malloc/free like everywhere else in the class,
+     // and pHostBeg/pHostEnd are refreshed so they never point into a freed or
+     // outdated buffer.
+     void SetHostHeader(const char* host) {
+         ReserveHostheader(strlen(host) + 20);
+         snprintf(Hostheader, HostheaderLen, "Host: %s\r\n", host);
+         UpdateHostBounds();
+     }
+
+     void SetSocket(SOCKET fd) {
+         Socket.SetSocket(fd);
+     }
+
+     SOCKET PickOutSocket() {
+         return Socket.PickOutSocket();
+     }
+
+     void Disconnect() {
+         Socket.Disconnect();
+     }
+
+     ssize_t read(void* buffer, size_t buflen) {
+         return Socket.read(buffer, buflen);
+     }
+
+     // Composes and sends one request for `url`.
+     //   headers      - optional null-terminated array of complete header lines
+     //   persistent   - non-zero asks for Keep-Alive, zero for Close
+     //   head_request - send HEAD instead of GET (ignored when a method or
+     //                  POST data is set)
+     // Returns 0 on success or one of HTTP_DNS_FAILURE, HTTP_HEADER_TOO_LARGE,
+     // HTTP_CONNECT_FAILED, HTTP_INTERRUPTED, HTTP_CONNECTION_LOST.
+     int RequestGet(const char* url, const char* const* headers, int persistent = 1, bool head_request = false) {
+         if (!Addrs.size())
+             return HTTP_DNS_FAILURE;
+         char message[MessageMax];
+         ssize_t messlen = 0;
+         if (Method) {
+             // The method plus its trailing space must fit into the buffer.
+             if (MethodLen + 1 >= (size_t)MessageMax)
+                 return HTTP_HEADER_TOO_LARGE;
+             strncpy(message, Method, MethodLen);
+             message[MethodLen] = ' ';
+             messlen = MethodLen + 1;
+         } else if (PostData) {
+             strcpy(message, "POST ");
+             messlen = 5;
+         } else if (head_request) {
+             strcpy(message, "HEAD ");
+             messlen = 5;
+         } else {
+             strcpy(message, "GET ");
+             messlen = 4;
+         }
+         // Appends a C string, saturating messlen at MessageMax on overflow so
+         // the size check below can detect truncation.
+         auto appendMessage = [&](const char* mes) {
+             messlen += Min(MessageMax - messlen,
+                            (ssize_t)strlcpy(message + messlen, mes, MessageMax - messlen));
+         };
+         appendMessage(url);
+         appendMessage(" HTTP/1.1\r\n");
+         if (*url == '/') //if not then Host is a proxy
+             appendMessage(Hostheader);
+         appendMessage("Connection: ");
+         appendMessage(persistent ? "Keep-Alive\r\n" : "Close\r\n");
+         while (headers && *headers)
+             appendMessage(*headers++);
+         if (AltFooter)
+             appendMessage(AltFooter);
+         else
+             appendMessage(Footer);
+         appendMessage("\r\n");
+         if (messlen >= MessageMax)
+             return HTTP_HEADER_TOO_LARGE;
+
+         if (!Persistent)
+             Disconnect();
+         Persistent = persistent;
+         int connected = Socket.Good();
+         // A reused connection gets two attempts (it may have gone stale);
+         // a fresh connection gets one.
+         for (int attempt = !connected; attempt < 2; attempt++) {
+             const auto connectTimeout = ConnectTimeout ? ConnectTimeout : Timeout;
+             if (!Socket.Good() && Socket.Connect(Addrs, connectTimeout))
+                 return HTTP_CONNECT_FAILED;
+
+             int sendOk = Socket.send(message, messlen);
+             if (sendOk && PostData && PostDataLen)
+                 sendOk = Socket.send(PostData, PostDataLen);
+             if (!sendOk) {
+                 // Keep the errno of the failed send across Disconnect().
+                 int err = errno;
+                 Disconnect();
+                 errno = err;
+                 continue;
+             }
+
+             if (!Socket.peek()) {
+                 int err = errno;
+                 Disconnect();
+                 if (err == EINTR) {
+                     errno = err;
+                     return HTTP_INTERRUPTED;
+                 }
+             } else {
+                 if (!persistent)
+                     Socket.shutdown();
+                 return 0;
+             }
+         }
+         return connected ? HTTP_CONNECTION_LOST : HTTP_CONNECT_FAILED;
+     }
+
+ private:
+     // Grows the malloc-owned Hostheader buffer to at least `required` bytes.
+     void ReserveHostheader(size_t required) {
+         if (HostheaderLen < required) {
+             free(Hostheader);
+             Hostheader = (char*)malloc(required);
+             HostheaderLen = required;
+         }
+     }
+
+     // Points pHostBeg/pHostEnd at the host[:port] part of "Host: <...>\r\n".
+     void UpdateHostBounds() {
+         pHostBeg = strchr(Hostheader, ' ') + 1;
+         pHostEnd = strchr(pHostBeg, '\r');
+     }
+
+ protected:
+     TSocketHandler Socket;
+     TIpResolverWrapper<TDnsClient> DnsClient;
+     TAddrList Addrs;
+     int Persistent;
+     TDuration Timeout;
+     TDuration ConnectTimeout;
+     char *Hostheader, *Footer, *pHostBeg, *pHostEnd;
+     const char* AltFooter; // alternative footer can be set by the caller
+     const char* PostData;
+     size_t PostDataLen;
+     const char* Method;
+     size_t MethodLen;
+     size_t HostheaderLen; // capacity of Hostheader; size_t so long host names are not truncated
+     static const ssize_t MessageMax = 32768;
+ };
+
+ // Timer policy that measures nothing: every hook is an empty function.
+ struct TNoTimer {
+     inline void OnBeforeSend() {}
+     inline void OnAfterSend() {}
+     inline void OnBeforeRecv() {}
+     inline void OnAfterRecv() {}
+ };
diff --git a/library/cpp/http/fetch/httpfetcher.h b/library/cpp/http/fetch/httpfetcher.h
new file mode 100644
index 0000000000..7fc251afd2
--- /dev/null
+++ b/library/cpp/http/fetch/httpfetcher.h
@@ -0,0 +1,171 @@
+#pragma once
+
+#ifdef _MSC_VER
+#include <io.h>
+#endif
+
+#include <library/cpp/http/misc/httpdate.h>
+
+#include "httpagent.h"
+#include "httpparser.h"
+
+ // Writer policy that discards everything and always reports success (0).
+ struct TFakeBackup {
+     int Write(void*, size_t) {
+         const int ok = 0;
+         return ok;
+     }
+ };
+
+ // Allocator policy backed by a single fixed in-object buffer of `bufsize`
+ // bytes: Grab() always hands out that same buffer and Shrink() does nothing.
+ template <size_t bufsize = 5000>
+ struct TFakeAlloc {
+     char buf[bufsize];
+
+     // Nothing to give back: the buffer lives inside the object.
+     void Shrink(void*, size_t) {
+     }
+
+     // Ignores the requested minimum, stores the buffer size in *real and
+     // returns the internal buffer.
+     void* Grab(size_t, size_t* real) {
+         *real = bufsize;
+         return buf;
+     }
+ };
+
+ // Policy-based HTTP fetcher. It combines:
+ //   THttpParser<TCheck> - incremental parsing of the reply,
+ //   TAlloc              - supplies receive buffers (Grab/Shrink),
+ //   TWriter             - receives each filled buffer (Write),
+ //   TAgent              - sends the request and reads from the socket.
+ template <typename TAlloc = TFakeAlloc<>,
+ typename TCheck = TFakeCheck<>,
+ typename TWriter = TFakeBackup,
+ typename TAgent = THttpAgent<>>
+ class THttpFetcher: public THttpParser<TCheck>, public TAlloc, public TWriter, public TAgent {
+ public:
+ // Buffers with less free space than this are flushed and replaced before reading.
+ static const size_t TCP_MIN = 1500;
+ // When non-zero, a running Fetch() stops with HTTP_INTERRUPTED.
+ static int TerminateNow;
+
+ THttpFetcher()
+ : THttpParser<TCheck>()
+ , TAlloc()
+ , TWriter()
+ , TAgent()
+ {
+ }
+
+ virtual ~THttpFetcher() {
+ }
+
+ // Sends a request for `path` and reads/parses the reply into `header`.
+ // HTTP-level problems are reported through header->error (the function
+ // then still returns 0 unless a local failure happened). The return value
+ // is 0, EIO (TWriter::Write failed) or ENOMEM (TAlloc::Grab failed).
+ // errno is set on exit to the error captured while reading, if any.
+ int Fetch(THttpHeader* header, const char* path, const char* const* headers, int persistent, bool head_request = false) {
+ int ret = 0;
+ int fetcherr = 0;
+
+ THttpParser<TCheck>::Init(header, head_request);
+ const char* scheme = HttpUrlSchemeKindToString((THttpURL::TSchemeKind)TAgent::GetScheme());
+ size_t schemelen = strlen(scheme);
+ // A path starting with '/' is relative to the agent's host, so the base
+ // URL is assembled as scheme://host/path; anything else is used as is.
+ if (*path == '/') {
+ header->base = TStringBuf(scheme, schemelen);
+ header->base += TStringBuf("://", 3);
+ header->base += TStringBuf(TAgent::pHostBeg, TAgent::pHostEnd - TAgent::pHostBeg);
+ header->base += path;
+ } else {
+ if (strlen(path) >= FETCHER_URL_MAX) {
+ header->error = HTTP_URL_TOO_LARGE;
+ return 0;
+ }
+ header->base = path;
+ }
+
+ if ((ret = TAgent::RequestGet(path, headers, persistent, head_request))) {
+ header->error = (i16)ret;
+ return 0;
+ }
+
+ bool inheader = 1;
+ // buf      - start of the current buffer, bufptr - write position in it,
+ // buflen   - size of the current buffer, buffree - bytes still free in it,
+ // bufsize  - total number of bytes received so far,
+ // maxsize  - current size limit (header limit first, body limit later).
+ void *bufptr = nullptr, *buf = nullptr, *parsebuf = nullptr;
+ ssize_t got;
+ size_t buffree = 0, bufsize = 0, buflen = 0;
+ size_t maxsize = TCheck::GetMaxHeaderSize();
+ do {
+ if (buffree < TCP_MIN) {
+ // Hand the filled part of the old buffer to the writer, then get a new one.
+ if (buf) {
+ TAlloc::Shrink(buf, buflen - buffree);
+ if (TWriter::Write(buf, buflen - buffree) < 0) {
+ buf = nullptr;
+ ret = EIO;
+ break;
+ }
+ }
+ if (!(buf = TAlloc::Grab(TCP_MIN, &buflen))) {
+ ret = ENOMEM;
+ break;
+ }
+ bufptr = buf;
+ buffree = buflen;
+ }
+ if ((got = TAgent::read(bufptr, buffree)) < 0) {
+ // Remember errno now; later calls may overwrite it.
+ fetcherr = errno;
+ if (errno == EINTR)
+ header->error = HTTP_INTERRUPTED;
+ else if (errno == ETIMEDOUT)
+ header->error = HTTP_TIMEDOUT_WHILE_BYTES_RECEIVING;
+ else
+ header->error = HTTP_CONNECTION_LOST;
+
+ break;
+ }
+
+ parsebuf = bufptr;
+ bufptr = (char*)bufptr + got;
+ bufsize += got;
+ buffree -= got;
+
+ THttpParser<TCheck>::Parse(parsebuf, got);
+
+ if (header->error)
+ break; // stop on ANY error: continuing would leave the stream position unpredictable until the maximum size is reached
+
+ // First iteration after the header has been fully parsed.
+ if (inheader && THttpParser<TCheck>::GetState() != THttpParser<TCheck>::hp_in_header) {
+ inheader = 0;
+ if (TCheck::Check(header))
+ break;
+ if (header->header_size > (long)maxsize) {
+ header->error = HTTP_HEADER_TOO_LARGE;
+ break;
+ }
+ }
+ if (!inheader) {
+ maxsize = TCheck::GetMaxBodySize(header);
+ }
+ if (header->http_status >= HTTP_EXTENDED)
+ break;
+ if (bufsize > maxsize) {
+ header->error = inheader ? HTTP_HEADER_TOO_LARGE : HTTP_BODY_TOO_LARGE;
+ break;
+ }
+ if (TerminateNow) {
+ header->error = HTTP_INTERRUPTED;
+ break;
+ }
+ } while (THttpParser<TCheck>::GetState() > THttpParser<TCheck>::hp_eof);
+
+ // Adjustment is <= 0: the number of bytes received beyond the end of the
+ // message as computed from the parsed sizes (clamped so it never grows
+ // the buffer).
+ i64 Adjustment = 0;
+ if (!header->error) {
+ if (header->transfer_chunked) {
+ Adjustment = header->header_size + header->entity_size - bufsize - 1;
+ } else if (header->content_length >= 0) {
+ Adjustment = header->header_size + header->content_length - bufsize;
+ }
+ if (Adjustment > 0)
+ Adjustment = 0;
+ }
+
+ if (buf) {
+ TAlloc::Shrink(buf, buflen - buffree + Adjustment);
+
+ // NOTE(review): Shrink() above applies Adjustment but Write() below does
+ // not - confirm whether the writer is meant to see the unadjusted size.
+ if (TWriter::Write(buf, buflen - buffree) < 0)
+ ret = EIO;
+ }
+ TCheck::CheckEndDoc(header);
+ // Any failure, extended status or "connection closed" reply drops the connection.
+ if (ret || header->error || header->http_status >= HTTP_EXTENDED || header->connection_closed) {
+ TAgent::Disconnect();
+ if (!fetcherr)
+ fetcherr = errno;
+ }
+ errno = fetcherr;
+ return ret;
+ }
+ };
+
+ // Definition of the per-instantiation termination flag; fetching runs until
+ // some code sets it to a non-zero value.
+ template <typename TAlloc, typename TCheck, typename TWriter, typename TAgent>
+ int THttpFetcher<TAlloc, TCheck, TWriter, TAgent>::TerminateNow = 0;
diff --git a/library/cpp/http/fetch/httpfsm.h b/library/cpp/http/fetch/httpfsm.h
new file mode 100644
index 0000000000..c4abdcd0d2
--- /dev/null
+++ b/library/cpp/http/fetch/httpfsm.h
@@ -0,0 +1,104 @@
+#pragma once
+
+#include "httpheader.h"
+
+#include <util/system/maxlen.h>
+#include <util/datetime/parser.h>
+
+#include <time.h>
+
+ // Incremental parser for HTTP request/response headers. The state machine
+ // itself (execute/init) is generated from httpfsm.rl6; this struct holds its
+ // state and the header object(s) the machine's actions fill in.
+ struct THttpHeaderParser {
+ // Result codes of Execute(). Negative values are errors.
+ static constexpr int ErrFirstlineTypeMismatch = -3;
+ static constexpr int ErrHeader = -2;
+ static constexpr int Err = -1;
+ static constexpr int Final = 0;
+ static constexpr int NeedMore = 1;
+ static constexpr int Accepted = 2;
+
+ // Feeds the next `len` bytes to the state machine and returns one of the
+ // codes above. Note that `len` is narrowed to int.
+ int Execute(const void* inBuf, size_t len) {
+ return execute((unsigned char*)inBuf, (int)len);
+ }
+
+ int Execute(TStringBuf str) {
+ return Execute(str.data(), str.size());
+ }
+
+ // Prepares the parser for a response header: resets the machine, attaches
+ // and re-initialises `h`, and resets the hreflang output cursor.
+ int Init(THttpHeader* h) {
+ int ret = Init((THttpBaseHeader*)(h));
+ hd = h;
+ hd->Init();
+ hreflangpos = hd->hreflangs;
+ hreflangspace = HREFLANG_MAX;
+ return ret;
+ }
+
+ // Same as the response variant, additionally enabling the auth actions.
+ int Init(THttpAuthHeader* h) {
+ int ret = Init((THttpHeader*)(h));
+ auth_hd = h;
+ return ret;
+ }
+ // Prepares the parser for a request header.
+ int Init(THttpRequestHeader* h) {
+ int ret = Init((THttpBaseHeader*)(h));
+ request_hd = h;
+ request_hd->Init();
+ return ret;
+ }
+
+ // Response header being filled, or nullptr when parsing a request.
+ THttpHeader* hd;
+ // Scratch integer written by the grammar (e.g. via the c()/m() macros).
+ long I;
+ int Dc;
+ TDateTimeFieldsDeprecated DateTimeFields;
+ // Scratch buffer for textual values (URLs, host, charset, ...); buflen is
+ // the number of bytes currently stored in it.
+ char buf[FETCHER_URL_MAX];
+ size_t buflen;
+ // Set by the grammar to the input position where the header was accepted.
+ char* lastchar;
+
+ // Start/length of the language tag of the hreflang value being parsed;
+ // langstart points into the caller's input buffer.
+ const unsigned char* langstart;
+ size_t langlen;
+
+ // Write cursor into hd->hreflangs and the space remaining there.
+ char* hreflangpos;
+ size_t hreflangspace;
+
+ // Whether X-Robots-Tag directives currently being parsed apply to this robot.
+ bool AcceptingXRobots;
+
+ // Non-null only after Init(THttpAuthHeader*) / Init(THttpRequestHeader*).
+ THttpAuthHeader* auth_hd;
+ THttpRequestHeader* request_hd;
+
+ private:
+ THttpBaseHeader* base_hd;
+ // Current state of the generated state machine.
+ int cs;
+
+ private:
+ // Common part of every public Init(): attaches the base header, clears the
+ // specialised header pointers and resets the state machine. Returns 0.
+ int Init(THttpBaseHeader* header) {
+ base_hd = header;
+ auth_hd = nullptr;
+ request_hd = nullptr;
+ hd = nullptr;
+ init();
+ return 0;
+ }
+
+ // Defined in httpfsm.rl6.
+ int execute(unsigned char* inBuf, int len);
+ void init();
+ };
+
+ // Incremental parser for the chunk-size lines (and trailer) of a chunked
+ // transfer encoding. The state machine is generated from httpfsm.rl6.
+ struct THttpChunkParser {
+ // Feeds `len` bytes to the machine. The generated code returns -1 on a
+ // parse error, 0 in the final state, 1 when more input is needed, 2 once a
+ // chunk header is accepted and -2 when the chunk size exceeds the int range.
+ int Execute(const void* inBuf, int len) {
+ return execute((unsigned char*)inBuf, len);
+ }
+
+ // Resets chunk_length and the state machine. Returns 0.
+ int Init() {
+ init();
+ return 0;
+ }
+
+ // Size of the chunk announced by the most recently parsed chunk header.
+ int chunk_length;
+ // Input position at which the chunk header was accepted.
+ char* lastchar;
+ // NOTE(review): I and Dc are not referenced by the chunk grammar visible
+ // in httpfsm.rl6 - confirm whether they are still needed.
+ long I;
+ int Dc;
+ // Accumulator for the hexadecimal chunk size while it is being parsed.
+ i64 cnt64;
+
+ private:
+ // Current state of the generated state machine.
+ int cs;
+ int execute(unsigned char* inBuf, int len);
+ void init();
+ };
diff --git a/library/cpp/http/fetch/httpfsm.rl6 b/library/cpp/http/fetch/httpfsm.rl6
new file mode 100644
index 0000000000..eab0328b18
--- /dev/null
+++ b/library/cpp/http/fetch/httpfsm.rl6
@@ -0,0 +1,684 @@
+#include <stdio.h>
+#include <time.h>
+
+#include <library/cpp/charset/doccodes.h>
+#include <library/cpp/charset/codepage.h>
+#include <library/cpp/http/misc/httpcodes.h>
+#include <util/datetime/base.h>
+#include <util/generic/ylimits.h>
+#include <algorithm> // max
+
+#include <library/cpp/http/fetch/httpheader.h>
+#include <library/cpp/http/fetch/httpfsm.h>
+
+#ifdef _MSC_VER
+#pragma warning(disable: 4702) // unreachable code
+#endif
+
+ // Shorthands used inside grammar actions; both write the parser's scratch
+ // integer I. c(i) overwrites it, m(i) keeps the maximum of the old and new value.
+ #define c(i) I = i;
+ #define m(i) I = std::max(I, (long)i);
+
+ // Converts one hexadecimal digit character to its numeric value.
+ // Characters at or above 'A' are mapped through upper-casing (bit 0x20
+ // cleared); everything below is treated as a decimal digit. The input is not
+ // validated.
+ static inline int X(unsigned char c) {
+     if (c >= 'A') {
+         return (c & 0xdf) - 'A' + 10;
+     }
+     return c - '0';
+ }
+
+ // Marks a header field as "being parsed" while remembering its old value:
+ // an unset field (-1) becomes -3, a previously stored value v >= 0 becomes
+ // -4 - v, and anything below -1 collapses to the error marker -2.
+ template <typename x>
+ static inline void guard(x &val) {
+     if (val >= -1) {
+         val = -4 - val;
+     } else {
+         val = -2; // f(-2) = -2
+     }
+ }
+
+ // Completes what guard() started: stores `cnt` when the field was previously
+ // unset (guarded value -3) or when `cnt` repeats the remembered value;
+ // any other combination stores the error marker -2.
+ template <typename x>
+ static inline void setguarded(x &val, long cnt) {
+     const bool wasUnset = (val == -4 - -1);
+     const bool sameAsBefore = (cnt == -4 - val);
+     val = (wasUnset || sameAsBefore) ? cnt : -2;
+ }
+
+////////////////////////////////////////////////////////////////////
+/// HTTP PARSER
+////////////////////////////////////////////////////////////////////
+
+%%{
+machine http_header_parser;
+
+include HttpDateTimeParser "../../../../util/datetime/parser.rl6";
+
+alphtype unsigned char;
+
+################# 2.2 Basic Rules #################
+eol = '\r'? '\n';
+ws = [ \t];
+lw = '\r'? '\n'? ws;
+separator = [()<>@,;:\\"/\[\]?={}];
+token_char = [!-~] - separator; # http tokens chars
+url_char = [!-~] - ["<>\[\]\\^`{}|]; # uric chars
+text_char = ws | 33..126 | 128..255;
+any_text_char = any - [\r\n];
+
+lws = lw*;
+eoh = lws eol;
+token = token_char+;
+ex_token = (token_char | ws)* token_char;
+text = (text_char | lw)*;
+any_text = (any_text_char | lw)*;
+def = lws ':' lws;
+
+ # clear_buf empties the scratch buffer; update_buf appends the current input
+ # character and silently drops characters once the buffer is full.
+ action clear_buf { buflen = 0; }
+ action update_buf { if (buflen < sizeof(buf)) buf[buflen++] = fc; }
+
+###################################################
+############ response status line #################
+action set_minor { base_hd->http_minor = I; }
+action set_status {
+ if (hd) {
+ hd->http_status = I;
+ }
+ if (request_hd) {
+ return -3;
+ }
+}
+
+status_code = int3;
+http_major = int;
+http_minor = int;
+reason_phrase = ws+ text_char*;
+http_version = "http/"i http_major '.' http_minor %set_minor;
+response_status_line = http_version ws+ status_code reason_phrase? eol %set_status;
+
+ ############ request status line #################
+ # set_request_uri copies the buffered URI into the request header and aborts
+ # with -2 if a URI was already stored. A URI that filled the whole buffer is
+ # not stored.
+ action set_request_uri {
+ if (request_hd && buflen < FETCHER_URL_MAX) {
+ if (!request_hd->request_uri.empty()) {
+ return -2;
+ }
+ request_hd->request_uri =TStringBuf(buf, buflen);
+ }
+ }
+ # set_http_method stores the method index chosen below and aborts with -3
+ # when the parser was initialised for a response header.
+ action set_http_method {
+ if (request_hd) {
+ request_hd->http_method = I;
+ }
+ if (hd) {
+ return -3;
+ }
+ }
+
+ # Known methods map to 0..7; any other token is an extension method (8).
+ http_extension_method = token;
+ http_method = ("options"i %{c(0)} @1
+ | "get"i %{c(1)} @1
+ | "head"i %{c(2)} @1
+ | "post"i %{c(3)} @1
+ | "put"i %{c(4)} @1
+ | "delete"i %{c(5)} @1
+ | "trace"i %{c(6)} @1
+ | "connect"i %{c(7)} @1
+ | http_extension_method %{c(8)} $0)
+ %set_http_method;
+ request_uri = (token_char | separator)+ >clear_buf $update_buf
+ %set_request_uri;
+ request_status_line = http_method ws+ request_uri ws+ http_version eoh;
+
+################# connection ######################
+action beg_connection { guard(base_hd->connection_closed); I = -1; }
+action set_connection { setguarded(base_hd->connection_closed, I); }
+
+c_token = "close"i %{m(1)}
+ | "keep-alive"i %{m(0)};
+c_tokenlist = c_token (lws ',' lws c_token)?;
+connection = "connection"i def %beg_connection c_tokenlist eoh %set_connection;
+
+################# content-encoding ################
+action beg_content_encoding { I = HTTP_COMPRESSION_ERROR; }
+action set_content_encoding { base_hd->compression_method =
+ ((base_hd->compression_method == HTTP_COMPRESSION_UNSET ||
+ base_hd->compression_method == I) ?
+ I : (int)HTTP_COMPRESSION_ERROR); }
+
+ce_tokenlist = "identity"i %{c(HTTP_COMPRESSION_IDENTITY)}
+ | "gzip"i %{c(HTTP_COMPRESSION_GZIP)}
+ | "x-gzip"i %{c(HTTP_COMPRESSION_GZIP)}
+ | "deflate"i %{c(HTTP_COMPRESSION_DEFLATE)}
+ | "compress"i %{c(HTTP_COMPRESSION_COMPRESS)}
+ | "x-compress"i %{c(HTTP_COMPRESSION_COMPRESS)};
+content_encoding = "content-encoding"i def %beg_content_encoding ce_tokenlist eoh %set_content_encoding;
+
+################# transfer-encoding ###############
+action beg_encoding { guard(base_hd->transfer_chunked); }
+action set_encoding { setguarded(base_hd->transfer_chunked, I); }
+
+e_tokenlist = "identity"i %{c(0)}
+ | "chunked"i %{c(1)};
+transfer_encoding = "transfer-encoding"i def %beg_encoding e_tokenlist eoh %set_encoding;
+
+################# content-length ##################
+action beg_content_length { guard(base_hd->content_length); }
+action set_content_length { setguarded(base_hd->content_length, I); }
+
+content_length = "content-length"i def %beg_content_length int eoh %set_content_length;
+
+################# content-range ###################
+action beg_content_range_start { guard(base_hd->content_range_start); I = -1; }
+action set_content_range_start { setguarded(base_hd->content_range_start, I); }
+action beg_content_range_end { guard(base_hd->content_range_end); I = -1; }
+action set_content_range_end { setguarded(base_hd->content_range_end, I); }
+action beg_content_range_el { guard(base_hd->content_range_entity_length); I = -1; }
+action set_content_range_el { setguarded(base_hd->content_range_entity_length, I); }
+
+content_range = "content-range"i def "bytes"i sp %beg_content_range_start int '-' %set_content_range_start
+ %beg_content_range_end int '/' %set_content_range_end
+ %beg_content_range_el int eoh %set_content_range_el;
+
+################# accept-ranges ###################
+action beg_accept_ranges {
+ if (hd) {
+ guard(hd->accept_ranges);
+ I = -1;
+ }
+}
+action set_accept_ranges { if (hd) setguarded(hd->accept_ranges, I); }
+
+ar_tokenlist = "bytes"i %{c(1)}
+ | "none"i %{c(0)};
+accept_ranges = "accept-ranges"i def %beg_accept_ranges ar_tokenlist eoh %set_accept_ranges;
+
+################# content-type ####################
+action beg_mime { guard(base_hd->mime_type); }
+action set_mime { setguarded(base_hd->mime_type, I); }
+action set_charset {
+ if (buflen < FETCHER_URL_MAX) {
+ buf[buflen++] = 0;
+ base_hd->charset = EncodingHintByName((const char*)buf);
+ }
+}
+
+mime_type = "text/plain"i %{c(MIME_TEXT)}
+ | "text/html"i %{c(MIME_HTML)}
+ | "application/pdf"i %{c(MIME_PDF)}
+ | "application/rtf"i %{c(MIME_RTF)}
+ | "text/rtf"i %{c(MIME_RTF)}
+ | "application/msword"i %{c(MIME_DOC)}
+ | "audio/mpeg"i %{c(MIME_MPEG)}
+ | "text/xml"i %{c(MIME_XML)}
+ | "application/xml"i %{c(MIME_XML)}
+ | "application/rss+xml"i %{c(MIME_RSS)}
+ | "application/rdf+xml"i %{c(MIME_RSS)}
+ | "application/atom+xml"i %{c(MIME_RSS)}
+ | "text/vnd.wap.wml"i %{c(MIME_WML)}
+ | "application/x-shockwave-flash"i %{c(MIME_SWF)}
+ | "application/vnd.ms-excel"i %{c(MIME_XLS)}
+ | "application/vnd.ms-powerpoint"i %{c(MIME_PPT)}
+ | "image/jpeg"i %{c(MIME_IMAGE_JPG)}
+ | "image/jpg"i %{c(MIME_IMAGE_JPG)}
+ | "image/pjpeg"i %{c(MIME_IMAGE_PJPG)}
+ | "image/png"i %{c(MIME_IMAGE_PNG)}
+ | "image/gif"i %{c(MIME_IMAGE_GIF)}
+ | "application/xhtml+xml"i %{c(MIME_XHTMLXML)}
+ | "application/vnd.openxmlformats-officedocument.wordprocessingml.document"i %{c(MIME_DOCX)}
+ | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"i %{c(MIME_XLSX)}
+ | "application/vnd.openxmlformats-officedocument.presentationml.presentation"i %{c(MIME_PPTX)}
+ | "application/vnd.oasis.opendocument.text"i %{c(MIME_ODT)}
+ | "application/vnd.oasis.opendocument.presentation"i %{c(MIME_ODP)}
+ | "application/vnd.oasis.opendocument.spreadsheet"i %{c(MIME_ODS)}
+ | "application/vnd.oasis.opendocument.graphics"i %{c(MIME_ODG)}
+ | "image/x-ms-bmp"i %{c(MIME_IMAGE_BMP)}
+ | "image/bmp"i %{c(MIME_IMAGE_BMP)}
+ | "audio/x-wav"i %{c(MIME_WAV)}
+ | ( "application/x-tar"i | "application/x-ustar"i | "application/x-gtar"i | "application/zip"i | "application/x-archive"i
+ | "application/x-bzip2"i | "application/x-rar"i ) %{c(MIME_ARCHIVE)}
+ | "application/x-dosexec"i %{c(MIME_EXE)}
+ | "application/x-gzip"i %{c(MIME_GZIP)}
+ | "application/json"i %{c(MIME_JSON)}
+ | ("application/javascript"i | "text/javascript"i) %{c(MIME_JAVASCRIPT)}
+ | "application/vnd.android.package-archive"i %{c(MIME_APK)}
+ | ("image/x-icon"i | "image/vnd.microsoft.icon"i) %{c(MIME_IMAGE_ICON)}
+ ;
+
+
+charset_name = token_char+ >clear_buf $update_buf;
+mime_param = "charset"i ws* '=' ws* '"'? charset_name '"'? %set_charset @2
+ | token ws* '=' ws* '"'? token '"'? @1
+ | text $0;
+mime_parms = (lws ';' lws mime_param)*;
+content_type = "content-type"i def %beg_mime mime_type mime_parms eoh %set_mime;
+
+################# last modified ###################
+action beg_modtime { guard(base_hd->http_time); }
+action set_modtime {
+ setguarded(base_hd->http_time, DateTimeFields.ToTimeT(-1));
+}
+
+last_modified = "last-modified"i def %beg_modtime http_date eoh %set_modtime;
+
+################# location ########################
+action set_location {
+ while (buflen > 0 && (buf[buflen - 1] == ' ' || buf[buflen - 1] == '\t')) {
+ buflen --;
+ }
+ if (hd && buflen < FETCHER_URL_MAX) {
+ hd->location = TStringBuf(buf, buflen);
+ }
+}
+
+action set_status_303{ if (hd) hd->http_status = 303; }
+
+url = url_char+ >clear_buf $update_buf;
+loc_url = any_text_char+ >clear_buf $update_buf;
+location = "location"i def loc_url eoh %set_location;
+refresh = "refresh"i def int ';' lws "url="i loc_url eoh %set_location;
+
+ ################# x-robots-tag ################
+ # Directives are encoded in I as a bit mask: a positive value is a
+ # restrictive directive (noindex=1, nofollow=2, noarchive=4, noodp=8,
+ # noyaca=16, none=3), a negative value is the matching permissive one.
+ # set_x_robots ORs restrictive bits into x_robots_tag and records, per bit,
+ # '1' for a permissive directive and '0' for a restrictive one unless a
+ # permissive one was already seen.
+ action set_x_robots {
+ if (hd && AcceptingXRobots) {
+ if (I > 0)
+ hd->x_robots_tag |= I;
+
+ int pos = (I > 0 ? I : -I);
+ for (size_t i = 0; i < 5; ++i)
+ if (abs(pos) & (1 << i)) // permissive flags take priority
+ hd->x_robots_state[i] = (I < 0) ? '1' : (hd->x_robots_state[i] != '1') ? '0' : '1';
+ }
+ }
+
+ # Runs after a "<robot>:" prefix: directives are accepted only when the
+ # prefix named one of the yandex robots (I == 1).
+ action accept_x_robots {
+ AcceptingXRobots = (bool)I;
+ }
+
+ x_robots_directive = "none"i %{c(3)} | "all"i %{c(-3)}
+ | "noindex"i %{c(1)} | "index"i %{c(-1)}
+ | "nofollow"i %{c(2)} | "follow"i %{c(-2)}
+ | "noarchive"i %{c(4)} | "archive"i %{c(-4)}
+ | "noyaca"i %{c(16)}
+ | "noodp"i %{c(8)};
+
+ any_value = (any_text_char - [, \t])+ (lws (any_text_char - [, \t])+)*;
+ any_key = (any_text_char - [:, \t])+ (lws (any_text_char - [:, \t])+)*;
+
+ unavailable_after_directive = "unavailable_after"i def any_value;
+
+ yandex_robot = "yandex"i | "yandexbot"i;
+ other_robot = any_key - "unavailable_after"i - yandex_robot;
+ robot_specifier = yandex_robot %{c(1)} | other_robot %{c(0)};
+
+ x_robots_value = (robot_specifier def %accept_x_robots)? (unavailable_after_directive | (x_robots_directive %set_x_robots) | any_value? );
+
+ # Each header starts out accepting directives (no robot prefix seen yet).
+ x_robots_tag = "x-robots-tag"i def >{ AcceptingXRobots = true; } x_robots_value (lws ',' lws x_robots_value)* eoh;
+
+################# rel_canonical ###############
+action set_canonical {
+ if (hd && buflen < FETCHER_URL_MAX) {
+ hd->rel_canonical = TStringBuf(buf, buflen);
+ }
+}
+
+rel_canonical = "link"i def '<' url ">;"i lws "rel"i lws '=' lws "\"canonical\"" eoh %set_canonical;
+ ################# hreflang ###############
+ # Appends "<lang> <url>" to hd->hreflangs; entries are separated by a tab
+ # and the list is kept NUL-terminated. Entries that do not fit into the
+ # remaining space, or have an empty language, are skipped.
+ # The hd check mirrors the other response-only actions (set_location,
+ # set_canonical) so a missing response header can never be dereferenced.
+ # NOTE(review): langstart points into the caller's input buffer, so a Link
+ # header split across two Execute() calls leaves it pointing into the
+ # previous buffer - confirm how callers feed data.
+ action set_hreflang {
+     if (hd) {
+         bool first = (hreflangpos == hd->hreflangs);
+         size_t len2 = (first ? 0 : 1) + langlen + 1 + buflen;
+         if (langlen && len2 < hreflangspace) {
+             if (!first) {
+                 *(hreflangpos++) = '\t';
+             }
+             memcpy(hreflangpos, langstart, langlen);
+             hreflangpos += langlen;
+             *(hreflangpos++) = ' ';
+             memcpy(hreflangpos, buf, buflen);
+             hreflangpos += buflen;
+             *(hreflangpos) = 0;
+             hreflangspace -= len2;
+         }
+     }
+ }
+
+ # start_lang / end_lang bracket the language tag inside the input buffer.
+ action start_lang {
+     langstart = fpc;
+     langlen = 0;
+ }
+ action end_lang {
+     langlen = fpc - langstart;
+ }
+ hreflang_token = (token_char - ['])+;
+ quote = ['"]?; #"
+ lang = hreflang_token >start_lang %end_lang;
+
+ # Accepts both parameter orders: rel=alternate;hreflang=x and the reverse.
+ hreflang = "link"i def '<' url '>' lws ";" lws
+     ( ( "rel"i lws '=' lws quote "alternate" quote lws ';' lws "hreflang"i lws '=' lws quote lang quote )
+     | ( "hreflang"i lws '=' lws quote lang quote lws ';' lws "rel"i lws '=' lws quote "alternate" quote ) )
+     eoh %set_hreflang;
+ ################# squid_error #################
+ # Marks the response as carrying an X-Yandex-Squid-Error header.
+ # Guarded by hd like the other response-only actions.
+ action set_squid_error {
+     if (hd) {
+         hd->squid_error = 1;
+     }
+ }
+
+ squid_error = "X-Yandex-Squid-Error"i def any_text eoh %set_squid_error;
+
+################# auth ########################
+action init_auth {
+ if (auth_hd)
+ auth_hd->use_auth=true;
+}
+
+action update_auth_buf
+ { if (auth_hd && buflen < sizeof(buf)) buf[buflen++] = *fpc; }
+
+quoted_str = /"/ (text_char - /"/)* /"/ >2;
+auth_quoted_str = ( /"/ ( ( text_char - /"/ )* >clear_buf $update_auth_buf ) /"/ ) > 2;
+
+# do not support auth-int, too heavy procedure
+
+qop_auth_option = "auth"i @1 %{if(auth_hd) auth_hd->qop_auth = true; };
+
+qop_option = ( qop_auth_option @1 ) | (( token-"auth"i) $0 );
+
+auth_good_param = ( "nonce"i /=/ auth_quoted_str )
+ %{if (auth_hd && buflen < FETCHER_URL_MAX-1) {
+ buf[buflen++] = 0;
+ auth_hd->nonce = strdup((const char*)buf);
+ }}
+ | ( "realm"i /=/ auth_quoted_str )
+ %{if (auth_hd && buflen < FETCHER_URL_MAX-1) {
+ buf[buflen++] = 0;
+ auth_hd->realm = strdup((const char*)buf);
+ }}
+ | ( "opaque"i /=/ auth_quoted_str )
+ %{if (auth_hd && buflen < FETCHER_URL_MAX-1) {
+ buf[buflen++] = 0;
+ auth_hd->opaque = strdup((const char*)buf);
+ }}
+ | "stale"i /=/ "true"i
+ %{if (auth_hd) auth_hd->stale = true; }
+ | "algorithm"i /=/ "md5"i /-/ "sess"i
+ %{if (auth_hd) auth_hd->algorithm = 1; }
+ | ( "qop"i /="/ qop_option (ws* "," ws* qop_option)* /"/);
+
+auth_param = auth_good_param @1 |
+ ( (token - ( "nonce"i | "opaque"i | "realm"i | "qop"i ) )
+ /=/ (token | quoted_str ) ) $0;
+
+auth_params = auth_param ( ws* /,/ ws* auth_param )*;
+
+digest_challenge = ("digest"i %init_auth ws+ auth_params) |
+ ((token-"digest"i) text);
+
+auth = "www-authenticate"i def digest_challenge eoh;
+
+###################### host #######################
+action set_host {
+ if (request_hd && buflen < HOST_MAX) {
+ buf[buflen++] = 0;
+ if (request_hd->host[0] != 0) {
+ return -2;
+ }
+ memcpy(request_hd->host, buf, buflen);
+ }
+}
+
+host = (url_char | [:])* >clear_buf $update_buf;
+host_header = "host"i def host eoh %set_host;
+
+###################### from #######################
+action set_from {
+ if (request_hd && buflen < MAXWORD_LEN) {
+ buf[buflen++] = 0;
+ if (request_hd->from[0] != 0) {
+ return -2;
+ }
+ memcpy(request_hd->from, buf, buflen);
+ }
+}
+
+mailbox = (token "@" token) >clear_buf $update_buf;
+from_header = "from"i def mailbox eoh %set_from;
+
+################### user-agent ####################
+action set_user_agent {
+ if (request_hd && buflen < MAXWORD_LEN) {
+ buf[buflen++] = 0;
+ if (request_hd->user_agent[0] != 0) {
+ return -2;
+ }
+ memcpy(request_hd->user_agent, buf, buflen);
+ }
+}
+
+user_agent = any_text_char* >clear_buf $update_buf;
+user_agent_header = "user-agent"i def user_agent eoh %set_user_agent;
+
+############### x-yandex-langregion ################
+action set_langregion {
+ if (request_hd && buflen < MAX_LANGREGION_LEN) {
+ buf[buflen++] = 0;
+ if (request_hd->x_yandex_langregion[0] != 0) {
+ return -2;
+ }
+ memcpy(request_hd->x_yandex_langregion, buf, buflen);
+ }
+}
+
+langregion = any_text_char* >clear_buf $update_buf;
+langregion_header = "x-yandex-langregion"i def langregion eoh %set_langregion;
+
+############### x-yandex-sourcename ################
+action set_sourcename {
+ if (request_hd && buflen < MAXWORD_LEN) {
+ buf[buflen++] = 0;
+ if (request_hd->x_yandex_sourcename[0] != 0) {
+ return -2;
+ }
+ memcpy(request_hd->x_yandex_sourcename, buf, buflen);
+ }
+}
+
+sourcename = any_text_char* >clear_buf $update_buf;
+sourcename_header = "x-yandex-sourcename"i def sourcename eoh %set_sourcename;
+
+############### x-yandex-requesttype ###############
+action set_requesttype {
+ if (request_hd && buflen < MAXWORD_LEN) {
+ buf[buflen++] = 0;
+ if (request_hd->x_yandex_requesttype[0] != 0) {
+ return -2;
+ }
+ memcpy(request_hd->x_yandex_requesttype, buf, buflen);
+ }
+}
+
+requesttype = any_text_char* >clear_buf $update_buf;
+requesttype_header = "x-yandex-requesttype"i def requesttype eoh %set_requesttype;
+
+################ x-yandex-fetchoptions ###############
+action set_fetchoptions {
+ if (request_hd && buflen < MAXWORD_LEN) {
+ buf[buflen++] = 0;
+ if (request_hd->x_yandex_fetchoptions[0] != 0) {
+ return -2;
+ }
+ memcpy(request_hd->x_yandex_fetchoptions, buf, buflen);
+ }
+}
+
+fetchoptions = any_text_char* >clear_buf $update_buf;
+fetchoptions_header = "x-yandex-fetchoptions"i def fetchoptions eoh %set_fetchoptions;
+
+################ if-modified-since ################
+action set_if_modified_since {
+ if (request_hd) {
+ request_hd->if_modified_since = DateTimeFields.ToTimeT(-1);
+ }
+}
+
+if_modified_since = "if-modified-since"i def http_date eoh
+ %set_if_modified_since;
+
+################ retry-after ################
+action set_retry_after_withdate {
+ if (hd) {
+ hd->retry_after = DateTimeFields.ToTimeT(-1);
+ }
+}
+
+action set_retry_after_withdelta {
+ if (hd) {
+ hd->retry_after = TInstant::Now().Seconds() + I;
+ }
+}
+
+retry_after_withdate = "retry-after"i def http_date eoh
+ %set_retry_after_withdate;
+retry_after_withdelta = "retry-after"i def int eoh
+ %set_retry_after_withdelta;
+
+ ############## request-cache-control ##############
+ # Only max-age is recorded (into request_hd->max_age); every other
+ # directive is recognised and ignored.
+ action SETMAXAGE { if (request_hd) request_hd->max_age = I; }
+
+ delta_seconds = int;
+ cache_extension = token ("=" (token | quoted_str))?;
+ # The standard directive is spelled "no-transform". The former spelling
+ # "non-transform" is still accepted through cache_extension.
+ request_cache_directive = "no-cache"i
+     | "no-store"i
+     | ("max-age"i "=" delta_seconds %SETMAXAGE)
+     | ("max-stale"i ("=" delta_seconds)?)
+     | ("min-fresh"i "=" delta_seconds)
+     | "no-transform"i
+     | "only-if-cached"i
+     | cache_extension;
+ request_cache_control = "cache-control"i def request_cache_directive eoh;
+
+############ x-yandex-response-timeout #############
+
+action set_response_timeout {
+ if (request_hd) {
+ request_hd->x_yandex_response_timeout = I;
+ }
+}
+
+response_timeout = "x-yandex-response-timeout"i def int eoh
+ %set_response_timeout;
+
+############ x-yandex-request-priority #############
+
+action set_request_priority {
+ if (request_hd) {
+ request_hd->x_yandex_request_priority = I;
+ }
+}
+
+request_priority = "x-yandex-request-priority"i def int eoh
+ %set_request_priority;
+
+################# message header ##################
+other_header = ( ex_token - "www-authenticate"i ) def any_text eoh;
+message_header = other_header $0
+ | connection @1
+ | content_encoding @1
+ | transfer_encoding @1
+ | content_length @1
+ | content_type @1
+ | last_modified @1
+ | refresh @1
+ | content_range @1;
+response_header = message_header $0
+ | auth @1
+ | accept_ranges @1
+ | location @1
+ | x_robots_tag @1
+ | rel_canonical @1
+ | hreflang @1
+ | squid_error @1
+ | retry_after_withdate @1
+ | retry_after_withdelta @1;
+request_header = message_header $0
+ | from_header @1
+ | host_header @1
+ | user_agent_header @1
+ | sourcename_header @1
+ | requesttype_header @1
+ | langregion_header @1
+ | fetchoptions_header @1
+ | if_modified_since @1
+ | request_cache_control @1
+ | response_timeout @1
+ | request_priority @1;
+
+################# main ############################
+action accepted { lastchar = (char*)fpc; return 2; }
+
+main := ((response_status_line ('\r'? response_header)*)
+ | (request_status_line ('\r' ? request_header)*))
+ eol @accepted;
+
+}%%
+
+%% write data;
+
+ // Runs the generated header state machine over [inBuf, inBuf + len).
+ // Actions inside the machine may return early (2 = accepted, -2 / -3 =
+ // errors). Otherwise the result reflects the state reached: -1 for the error
+ // state, 0 for the first final state, 1 when more input is needed.
+ int THttpHeaderParser::execute(unsigned char *inBuf, int len) {
+     const unsigned char *p = inBuf;
+     const unsigned char *pe = p + len;
+     %% write exec;
+     if (cs == http_header_parser_error) {
+         return -1;
+     }
+     if (cs == http_header_parser_first_final) {
+         return 0;
+     }
+     return 1;
+ }
+
+ // Resets the generated header state machine to its start state.
+ void THttpHeaderParser::init() {
+ %% write init;
+ }
+
+%%{
+machine http_chunk_parser;
+
+alphtype unsigned char;
+
+action clear_hex { cnt64 = 0; }
+action update_hex { cnt64 = 16 * cnt64 + X(fc); if(cnt64 > Max<int>()) return -2; }
+action set_chunk { chunk_length = static_cast<int>(cnt64); }
+action accepted { lastchar = (char*)fpc; return 2; }
+
+eol = '\r'? '\n';
+ws = [ \t];
+sp = ' ';
+lw = '\r'? '\n'? ws;
+separator = [()<>@,;:\\"/\[\]?={}];
+token_char = [!-~] - separator; # http tokens chars
+url_char = [!-~] - ["<>\[\]\\^`{}|]; # uric chars
+text_char = ws | 33..127 | 160..255;
+
+lws = lw*;
+eoh = lws eol;
+token = token_char+;
+text = (text_char | lw)*;
+def = lws ':' lws;
+
+hex = (xdigit+) >clear_hex $update_hex;
+quoted_string = '"' ((text_char - '"') $0 | '\\"' @1)* '"';
+
+chunk_ext_val = token | quoted_string;
+chunk_ext_name = token;
+chunk_extension = ws* (';' chunk_ext_name ws* '=' ws* chunk_ext_val ws*)*;
+
+entity_header = token def text eoh;
+trailer = entity_header*;
+
+chunk = (hex - '0'+) chunk_extension? %set_chunk;
+last_chunk = '0'+ chunk_extension? eol trailer;
+main := eol (chunk $0 | last_chunk @1) eol @accepted;
+
+}%%
+
+%% write data;
+
+ // Runs the generated chunk state machine over [inBuf, inBuf + len).
+ // Actions inside the machine may return early (2 = accepted, -2 = chunk
+ // size overflow). Otherwise: -1 for the error state, 0 for the first final
+ // state, 1 when more input is needed.
+ int THttpChunkParser::execute(unsigned char *inBuf, int len) {
+     const unsigned char *p = inBuf;
+     const unsigned char *pe = p + len;
+     %% write exec;
+     if (cs == http_chunk_parser_error) {
+         return -1;
+     }
+     if (cs == http_chunk_parser_first_final) {
+         return 0;
+     }
+     return 1;
+ }
+
+ // Clears the last chunk length and resets the generated chunk state machine.
+ void THttpChunkParser::init() {
+ chunk_length = 0;
+ %% write init;
+ }
diff --git a/library/cpp/http/fetch/httpfsm_ut.cpp b/library/cpp/http/fetch/httpfsm_ut.cpp
new file mode 100644
index 0000000000..b018e80101
--- /dev/null
+++ b/library/cpp/http/fetch/httpfsm_ut.cpp
@@ -0,0 +1,591 @@
+#include "httpfsm.h"
+#include "library-htfetch_ut_hreflang_in.h"
+#include "library-htfetch_ut_hreflang_out.h"
+
+#include <util/generic/ptr.h>
+#include <library/cpp/charset/doccodes.h>
+#include <library/cpp/testing/unittest/registar.h>
+
+/// Unit tests for THttpHeaderParser: request headers, response headers and the
+/// X-Robots-Tag / Link specific handling.
+class THttpHeaderParserTestSuite: public TTestBase {
+    UNIT_TEST_SUITE(THttpHeaderParserTestSuite);
+    UNIT_TEST(TestRequestHeader);
+    UNIT_TEST(TestSplitRequestHeader);
+    UNIT_TEST(TestTrailingData);
+    UNIT_TEST(TestProxyRequestHeader);
+    UNIT_TEST(TestIncorrectRequestHeader);
+    UNIT_TEST(TestLastModified);
+    UNIT_TEST(TestLastModifiedCorrupted);
+    UNIT_TEST(TestResponseHeaderOnRequest);
+    UNIT_TEST(TestRequestHeaderOnResponse);
+    UNIT_TEST(TestXRobotsTagUnknownTags);
+    UNIT_TEST(TestXRobotsTagMyBot);
+    UNIT_TEST(TestXRobotsTagOtherBot);
+    UNIT_TEST(TestXRobotsTagUnavailableAfterAware);
+    UNIT_TEST(TestXRobotsTagUnavailableAfterWorks);
+    UNIT_TEST(TestXRobotsTagOverridePriority);
+    UNIT_TEST(TestXRobotsTagDoesNotBreakCharset);
+    UNIT_TEST(TestXRobotsTagAllowsMultiline);
+    UNIT_TEST(TestRelCanonical);
+    UNIT_TEST(TestHreflang);
+    UNIT_TEST(TestHreflangOnLongInput);
+    UNIT_TEST(TestMimeType);
+    UNIT_TEST(TestRepeatedContentEncoding);
+    UNIT_TEST_SUITE_END();
+
+private:
+    /// Parser under test; created by TestStart() and released by TestFinish().
+    THolder<THttpHeaderParser> httpHeaderParser;
+
+    void TestStart();
+    void TestFinish();
+
+public:
+    // Request header parsing.
+    void TestRequestHeader();
+    void TestSplitRequestHeader();
+    void TestTrailingData();
+    void TestProxyRequestHeader();
+    void TestIncorrectRequestHeader();
+
+    // Response header parsing.
+    void TestLastModified();
+    void TestLastModifiedCorrupted();
+
+    // Feeding the wrong kind of message to a parser.
+    void TestResponseHeaderOnRequest();
+    void TestRequestHeaderOnResponse();
+
+    // X-Robots-Tag handling.
+    void TestXRobotsTagUnknownTags();
+    void TestXRobotsTagMyBot();
+    void TestXRobotsTagOtherBot();
+    void TestXRobotsTagUnavailableAfterAware();
+    void TestXRobotsTagUnavailableAfterWorks();
+    void TestXRobotsTagOverridePriority();
+    void TestXRobotsTagDoesNotBreakCharset();
+    void TestXRobotsTagAllowsMultiline();
+
+    // Link header, Content-Type and Content-Encoding handling.
+    void TestRelCanonical();
+    void TestHreflang();
+    void TestHreflangOnLongInput();
+    void TestMimeType();
+    void TestRepeatedContentEncoding();
+};
+
+// Creates a fresh parser instance for the test that is about to run.
+void THttpHeaderParserTestSuite::TestStart() {
+    httpHeaderParser = MakeHolder<THttpHeaderParser>();
+}
+
+// Releases the parser instance created by TestStart().
+void THttpHeaderParserTestSuite::TestFinish() {
+    httpHeaderParser.Destroy();
+}
+
+// Parses a minimal direct GET request and verifies the parsed fields as well as
+// the default values of the headers that were not sent.
+void THttpHeaderParserTestSuite::TestRequestHeader() {
+    TestStart();
+    THttpRequestHeader httpRequestHeader;
+    // Check the Init() result, as the other request-header tests of this suite do.
+    UNIT_ASSERT(0 == httpHeaderParser->Init(&httpRequestHeader));
+    const char* request = "GET /search?q=hi HTTP/1.1\r\n"
+                          "Host: www.google.ru:8080\r\n\r\n";
+    i32 result = httpHeaderParser->Execute(request, strlen(request));
+    UNIT_ASSERT_EQUAL(result, 2);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.http_method, HTTP_METHOD_GET);
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.host, "www.google.ru:8080"), 0);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.request_uri, "/search?q=hi");
+    UNIT_ASSERT_EQUAL(httpRequestHeader.GetUrl(), "http://www.google.ru:8080/search?q=hi");
+    // The whole input is header, so `lastchar` must be its last byte.
+    UNIT_ASSERT_EQUAL(httpHeaderParser->lastchar - request + 1,
+                      (i32)strlen(request));
+    UNIT_ASSERT_EQUAL(httpRequestHeader.x_yandex_response_timeout,
+                      DEFAULT_RESPONSE_TIMEOUT);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.x_yandex_request_priority,
+                      DEFAULT_REQUEST_PRIORITY);
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_sourcename, ""), 0);
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_requesttype, ""), 0);
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_fetchoptions, ""), 0);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.max_age, DEFAULT_MAX_AGE);
+    // Tear down only after every assertion has been evaluated.
+    TestFinish();
+}
+
+// Feeds the same request to the parser in three pieces, for every possible pair
+// of cut positions, and expects the parse to complete only on the last piece.
+void THttpHeaderParserTestSuite::TestSplitRequestHeader() {
+    TestStart();
+    const char* request =
+        "GET /search?q=hi HTTP/1.1\r\n"
+        "Host: www.google.ru:8080 \r\n"
+        "\r\n";
+    const size_t total = strlen(request);
+
+    for (size_t firstCut = 0; firstCut < total; ++firstCut) {
+        for (size_t secondCut = firstCut; secondCut < total; ++secondCut) {
+            TString head{request, 0, firstCut};
+            TString middle{request, firstCut, secondCut - firstCut};
+            TString tail{request, secondCut, total - secondCut};
+            // Sanity check: the three pieces together are the original request.
+            UNIT_ASSERT_EQUAL(head + middle + tail, request);
+
+            THttpRequestHeader httpRequestHeader;
+            UNIT_ASSERT(0 == httpHeaderParser->Init(&httpRequestHeader));
+
+            i32 status = httpHeaderParser->Execute(head);
+            UNIT_ASSERT_EQUAL(status, 1);
+            status = httpHeaderParser->Execute(middle);
+            UNIT_ASSERT_EQUAL(status, 1);
+            status = httpHeaderParser->Execute(tail);
+            UNIT_ASSERT_EQUAL(status, 2);
+
+            UNIT_ASSERT_EQUAL(httpRequestHeader.http_method, HTTP_METHOD_GET);
+            UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.host, "www.google.ru:8080"), 0);
+            UNIT_ASSERT_EQUAL(httpRequestHeader.request_uri, "/search?q=hi");
+        }
+    }
+
+    TestFinish();
+}
+
+// A request followed by extra bytes: parsing must finish at the end of the
+// header, with `lastchar + 1` pointing at the first trailing byte.
+void THttpHeaderParserTestSuite::TestTrailingData() {
+    TestStart();
+    THttpRequestHeader httpRequestHeader;
+    UNIT_ASSERT(0 == httpHeaderParser->Init(&httpRequestHeader));
+    const char* input =
+        "GET /search?q=hi HTTP/1.1\r\n"
+        "Host: www.google.ru:8080\r\n"
+        "\r\n"
+        "high.ru";
+    const i32 status = httpHeaderParser->Execute(input, strlen(input));
+    UNIT_ASSERT_EQUAL(status, 2);
+
+    // Fields taken from the request line and the Host header.
+    UNIT_ASSERT_EQUAL(httpRequestHeader.http_method, HTTP_METHOD_GET);
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.host, "www.google.ru:8080"), 0);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.request_uri, "/search?q=hi");
+
+    // Everything after the header terminator is left untouched.
+    UNIT_ASSERT_EQUAL(TString(httpHeaderParser->lastchar + 1), "high.ru");
+
+    // Protocol version plus fields that keep their "unset" value.
+    UNIT_ASSERT_EQUAL(httpRequestHeader.http_minor, 1);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.transfer_chunked, -1);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.content_length, -1);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.connection_closed, -1);
+    TestFinish();
+}
+
+// Proxy-style request (absolute URI in the request line, no Host header) that
+// carries the X-Yandex-* headers, Cache-control, If-Modified-Since, User-Agent
+// and From; every one of them must end up in the parsed header.
+void THttpHeaderParserTestSuite::TestProxyRequestHeader() {
+    TestStart();
+    THttpRequestHeader httpRequestHeader;
+    httpHeaderParser->Init(&httpRequestHeader);
+    const char* input =
+        "GET http://www.google.ru:8080/search?q=hi HTTP/1.1\r\n"
+        "X-Yandex-Response-Timeout: 1000\r\n"
+        "X-Yandex-Request-Priority: 2\r\n"
+        "X-Yandex-Sourcename: orange\r\n"
+        "X-Yandex-Requesttype: userproxy\r\n"
+        "X-Yandex-FetchOptions: d;c\r\n"
+        "Cache-control: max-age=100\r\n"
+        "If-Modified-Since: Sat, 29 Oct 1994 19:43:31 GMT\r\n"
+        "User-Agent: Yandex/1.01.001 (compatible; Win16; I)\r\n"
+        "From: webadmin@yandex.ru\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(input, strlen(input));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.http_method, HTTP_METHOD_GET);
+
+    // X-Yandex-* headers.
+    UNIT_ASSERT_EQUAL(httpRequestHeader.x_yandex_response_timeout, 1000);
+    UNIT_ASSERT_EQUAL(httpRequestHeader.x_yandex_request_priority, 2);
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_sourcename, "orange"), 0);
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_requesttype, "userproxy"), 0);
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_fetchoptions, "d;c"), 0);
+
+    // Caching related headers.
+    UNIT_ASSERT_EQUAL(httpRequestHeader.max_age, 100);
+    UNIT_ASSERT_VALUES_EQUAL(httpRequestHeader.if_modified_since,
+                             TInstant::ParseIso8601Deprecated("1994-10-29 19:43:31Z").TimeT());
+
+    // The absolute URI is kept as is and the host stays empty.
+    UNIT_ASSERT_EQUAL(httpRequestHeader.request_uri,
+                      "http://www.google.ru:8080/search?q=hi");
+    UNIT_ASSERT(httpRequestHeader.GetUrl() ==
+                "http://www.google.ru:8080/search?q=hi");
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.host, ""), 0);
+
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.from, "webadmin@yandex.ru"), 0);
+    UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.user_agent,
+                             "Yandex/1.01.001 (compatible; Win16; I)"),
+                      0);
+    UNIT_ASSERT_EQUAL(httpHeaderParser->lastchar - input + 1,
+                      (i32)strlen(input));
+    TestFinish();
+}
+
+// A request line with a misspelled protocol name ("HTP") must not be reported
+// as a successfully parsed header.
+void THttpHeaderParserTestSuite::TestIncorrectRequestHeader() {
+    TestStart();
+    THttpRequestHeader httpRequestHeader;
+    httpHeaderParser->Init(&httpRequestHeader);
+    const char* broken = "GET /search?q=hi HTP/1.1\r\n"
+                         "Host: www.google.ru:8080\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(broken, strlen(broken));
+    UNIT_ASSERT(status != 2);
+    TestFinish();
+}
+
+// A well-formed Last-Modified value must be stored in `http_time` as seconds
+// since the epoch.
+void THttpHeaderParserTestSuite::TestLastModified() {
+    TestStart();
+    THttpHeader h;
+    UNIT_ASSERT(0 == httpHeaderParser->Init(&h));
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "Last-Modified: Thu, 13 Aug 2009 14:27:08 GMT\r\n\r\n";
+    UNIT_ASSERT(2 == httpHeaderParser->Execute(response, strlen(response)));
+    UNIT_ASSERT_VALUES_EQUAL(
+        TInstant::ParseIso8601Deprecated("2009-08-13 14:27:08Z").TimeT(),
+        h.http_time);
+    TestFinish();
+}
+
+// A truncated Last-Modified value must not break header parsing; `http_time`
+// is expected to stay negative.
+void THttpHeaderParserTestSuite::TestLastModifiedCorrupted() {
+    TestStart();
+    THttpHeader h;
+    UNIT_ASSERT(0 == httpHeaderParser->Init(&h));
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "Last-Modified: Thu, 13 Aug 2009 14:\r\n\r\n";
+    UNIT_ASSERT(2 == httpHeaderParser->Execute(response, strlen(response)));
+    // XXX: it is unclear what the proper value is; only the sign is checked.
+    UNIT_ASSERT(h.http_time < 0);
+    TestFinish();
+}
+
+// Unknown or malformed X-Robots-Tag directives are expected to be skipped while
+// the recognised ones (noindex, NOFOLLOW) still take effect.
+void THttpHeaderParserTestSuite::TestXRobotsTagUnknownTags() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "x-robots-tag: asdfasdf asdf asdf,,, , noindex,noodpXXX , NOFOLLOW ,noodpnofollow\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 3);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "00xxx");
+    TestFinish();
+}
+
+// Directives addressed to `yandex` / `yandexbot` are expected to be applied.
+void THttpHeaderParserTestSuite::TestXRobotsTagMyBot() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "x-robots-tag: yandex: noindex, nofollow\r\n"
+        "x-robots-tag: yandexbot: noarchive, noodp\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 15);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "0000x");
+    TestFinish();
+}
+
+// Directives addressed to other bots are expected to leave the robots state
+// untouched.
+void THttpHeaderParserTestSuite::TestXRobotsTagOtherBot() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "x-robots-tag: google: noindex, nofollow\r\n"
+        "x-robots-tag: googlebot: noarchive, noodp\r\n"
+        "x-robots-tag: !still(-other) bot_: foo, noyaca\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 0);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "xxxxx");
+    TestFinish();
+}
+
+// The presence of `unavailable_after` must not disturb the handling of the
+// other directives on the same line.
+void THttpHeaderParserTestSuite::TestXRobotsTagUnavailableAfterAware() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    // we only check that unavailable_after does not break anything
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "x-robots-tag: unavailable_after: 01 Jan 2999 00:00 UTC, noindex, nofollow\r\n"
+        "x-robots-tag: yandex: unavailable_after: 01 Jan 2999 00:00 UTC, noarchive, noodp\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 15);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "0000x");
+    TestFinish();
+}
+
+// Placeholder for real `unavailable_after` support: at the moment only
+// successful parsing is verified, the state assertions stay disabled.
+void THttpHeaderParserTestSuite::TestXRobotsTagUnavailableAfterWorks() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    // not supported yet
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "x-robots-tag: unavailable_after: 01 Jan 2000 00:00 UTC\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    //UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 1);
+    //UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "0xxxx");
+    TestFinish();
+}
+
+// `all` and `none` on the same line: checks the resulting state string and the
+// legacy value of the deprecated bit mask.
+void THttpHeaderParserTestSuite::TestXRobotsTagOverridePriority() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "x-robots-tag: all, none\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "11xxx");
+    // NOTE legacy behavior, should be 0 as `all` overrides
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 3);
+    TestFinish();
+}
+
+// An X-Robots-Tag header that precedes Content-Type must not prevent the mime
+// type and charset from being recognised.
+void THttpHeaderParserTestSuite::TestXRobotsTagDoesNotBreakCharset() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "X-Robots-Tag: noarchive\r\n"
+        "Content-Type: application/json; charset=utf-8\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpHeader.mime_type, static_cast<ui8>(MIME_JSON));
+    UNIT_ASSERT_EQUAL(httpHeader.charset, static_cast<ui8>(CODES_UTF8));
+    TestFinish();
+}
+
+// An X-Robots-Tag header folded over many continuation lines must be parsed,
+// and the Content-Type header that follows it must still be recognised.
+void THttpHeaderParserTestSuite::TestXRobotsTagAllowsMultiline() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "X-Robots-Tag\r\n"
+        " :\r\n"
+        " unavailable_since\r\n"
+        " :\r\n"
+        " ,\r\n"
+        " unavailable_since\r\n"
+        " :\r\n"
+        " 01 Jan 2000\r\n"
+        " 00:00 UTC\r\n"
+        " ,\r\n"
+        " yandexbot\r\n"
+        " :\r\n"
+        " noindex\r\n"
+        " ,\r\n"
+        " garbage\r\n"
+        " ,\r\n"
+        " nofollow\r\n"
+        " ,\r\n"
+        " other\r\n"
+        " bot\r\n"
+        " :\r\n"
+        " noarchive\r\n"
+        " ,\r\n"
+        "Content-Type: application/json; charset=utf-8\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "00xxx");
+    UNIT_ASSERT_EQUAL(httpHeader.mime_type, static_cast<ui8>(MIME_JSON));
+    UNIT_ASSERT_EQUAL(httpHeader.charset, static_cast<ui8>(CODES_UTF8));
+    TestFinish();
+}
+
+// Link headers with rel=alternate and hreflang must be collected into
+// `hreflangs` as tab-separated "<lang> <url>" records.
+void THttpHeaderParserTestSuite::TestHreflang() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "link: <http://www.high.ru/>; rel='alternate'; hreflang='x-default'\r\n"
+        "link: <http://www.high.ru/en.html> ;rel = 'alternate' ;hreflang = en_GB \r\n"
+        "link: <http://www.high.ru/ru.html>;hreflang = ru_RU.KOI8-r ;rel = 'alternate' \r\n"
+        "\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_VALUES_EQUAL(status, 2);
+    // UNIT_ASSERT_VALUES_EQUAL(strcmp(httpHeader.hreflangs, "x-default http://www.high.ru/;"), 0);
+    UNIT_ASSERT_VALUES_EQUAL(httpHeader.hreflangs, "x-default http://www.high.ru/\ten_GB http://www.high.ru/en.html\tru_RU.KOI8-r http://www.high.ru/ru.html");
+    TestFinish();
+}
+
+// Runs the hreflang extraction over the long fixture input and compares the
+// result with the fixture output.
+void THttpHeaderParserTestSuite::TestHreflangOnLongInput() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    TStringBuf fixtureIn(hreflang_ut_in);
+    TStringBuf fixtureOut(hreflang_ut_out);
+    const i32 status = httpHeaderParser->Execute(fixtureIn.data(), fixtureIn.size());
+    UNIT_ASSERT_VALUES_EQUAL(status, 2);
+    UNIT_ASSERT_VALUES_EQUAL(httpHeader.hreflangs, fixtureOut);
+    TestFinish();
+}
+
+// A Link header with rel="canonical" must populate `rel_canonical`.
+void THttpHeaderParserTestSuite::TestRelCanonical() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: text/html\r\n"
+        "Link: <http://yandex.ru>; rel = \"canonical\"\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpHeader.rel_canonical, "http://yandex.ru");
+    TestFinish();
+}
+
+// Feeding request text to a parser initialised for a response header is
+// expected to yield -3.
+void THttpHeaderParserTestSuite::TestResponseHeaderOnRequest() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* input = "GET /search?q=hi HTP/1.1\r\n"
+                        "Host: www.google.ru:8080\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(input, strlen(input));
+    UNIT_ASSERT_EQUAL(status, -3);
+    TestFinish();
+}
+
+// Feeding response text to a parser initialised for a request header is
+// expected to yield -3.
+void THttpHeaderParserTestSuite::TestRequestHeaderOnResponse() {
+    TestStart();
+    THttpRequestHeader httpRequestHeader;
+    httpHeaderParser->Init(&httpRequestHeader);
+    const char* input = "HTTP/1.1 200 OK\r\n"
+                        "Content-Type: text/html\r\n"
+                        "Last-Modified: Thu, 13 Aug 2009 14:\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(input, strlen(input));
+    UNIT_ASSERT_EQUAL(status, -3);
+    TestFinish();
+}
+
+// Content-Type with a charset parameter must set both `mime_type` and
+// `charset`.
+void THttpHeaderParserTestSuite::TestMimeType() {
+    TestStart();
+    THttpHeader httpHeader;
+    httpHeaderParser->Init(&httpHeader);
+    const char* response =
+        "HTTP/1.1 200 OK\r\n"
+        "Content-Type: application/json; charset=utf-8\r\n\r\n";
+    const i32 status = httpHeaderParser->Execute(response, strlen(response));
+    UNIT_ASSERT_EQUAL(status, 2);
+    UNIT_ASSERT_EQUAL(httpHeader.mime_type, static_cast<ui8>(MIME_JSON));
+    UNIT_ASSERT_EQUAL(httpHeader.charset, static_cast<ui8>(CODES_UTF8));
+    TestFinish();
+}
+
+void THttpHeaderParserTestSuite::TestRepeatedContentEncoding() {
+ TestStart();
+ THttpHeader httpHeader;
+ httpHeaderParser->Init(&httpHeader);
+ const char *headers =
+ "HTTP/1.1 200 OK\r\n"
+ "Server: nginx\r\n"
+ "Date: Mon, 15 Oct 2018 10:40:44 GMT\r\n"
+ "Content-Type: text/plain\r\n"
+ "Transfer-Encoding: chunked\r\n"
+ "Connection: keep-alive\r\n"
+ "Last-Modified: Mon, 15 Oct 2018 03:48:54 GMT\r\n"
+ "ETag: W/\"5bc40e26-a956d\"\r\n"
+ "X-Autoru-LB: lb-03-sas.prod.vertis.yandex.net\r\n"
+ "Content-Encoding: gzip\r\n"
+ "Content-Encoding: gzip\r\n"
+ "X-UA-Bot: 1\r\n"
+ "\r\n";
+ i32 result = httpHeaderParser->Execute(headers, strlen(headers));
+ UNIT_ASSERT_EQUAL(result, 2);
+ UNIT_ASSERT_EQUAL(httpHeader.error, 0);
+ UNIT_ASSERT_EQUAL(httpHeader.compression_method, 3);
+ TestFinish();
+}
+
+// Registers the header parser suite with the unittest framework.
+UNIT_TEST_SUITE_REGISTRATION(THttpHeaderParserTestSuite);
+
+// Tests for THttpChunkParser. Input is always fed one byte at a time and the
+// return code of every Execute() call is compared with the expected one.
+Y_UNIT_TEST_SUITE(TestHttpChunkParser) {
+    /// Returns a parser on which Init() has already been called.
+    static THttpChunkParser initParser() {
+        THttpChunkParser parser;
+        parser.Init();
+        return parser;
+    }
+
+    /// Feeds the first `states.size()` bytes of `blob` to a fresh parser, one
+    /// byte per Execute() call, asserting that the n-th call returns
+    /// `states[n]`. Returns the parser so callers can inspect its fields.
+    static THttpChunkParser parseByteByByte(const TStringBuf& blob, const TVector<int>& states) {
+        UNIT_ASSERT(states.size() <= blob.size());
+        THttpChunkParser parser{initParser()};
+        for (size_t n = 0; n < states.size(); n++) {
+            const TStringBuf d{blob, n, 1};
+            int code = parser.Execute(d.data(), d.size());
+            Cout << TString(d).Quote() << " " << code << Endl;
+            UNIT_ASSERT_EQUAL(code, states[n]);
+        }
+        return parser;
+    }
+
+    /// Feeds the whole `blob` byte by byte, expecting code 1 for every byte
+    /// except the last one, which must produce `last_state`.
+    static THttpChunkParser parseBytesWithLastState(const TStringBuf& blob, const int last_state) {
+        // `blob.size() - 1` is computed on an unsigned type and would wrap
+        // around for an empty blob, so reject that input explicitly.
+        UNIT_ASSERT(!blob.empty());
+        TVector<int> states(blob.size() - 1, 1);
+        states.push_back(last_state);
+        return parseByteByByte(blob, states);
+    }
+
+    // The chunk header must start with an end-of-line; input without it fails
+    // on the very first byte.
+    Y_UNIT_TEST(TestWithoutEolHead) {
+        const TStringBuf blob{
+            "4\r\n"
+            "____\r\n"};
+        TVector<int> states{
+            -1, /* 1, -1,
+            1, -1, 1, -1, 1, -1 */};
+        // as soon as error happens parser state should be considered
+        // undefined, state is meaningless after the very first `-1`
+        // moreover, testenv produces `states[1] == -1` for this input and
+        // my local build produces `states[1] == 1`.
+        parseByteByByte(blob, states);
+    }
+
+    // Smallest valid chunk header: the size ends up in both chunk_length and cnt64.
+    Y_UNIT_TEST(TestTrivialChunk) {
+        const TStringBuf blob{
+            "\r\n"
+            "4\r\n"};
+        THttpChunkParser parser(parseBytesWithLastState(blob, 2));
+        UNIT_ASSERT_EQUAL(parser.chunk_length, 4);
+        UNIT_ASSERT_EQUAL(parser.cnt64, 4);
+    }
+
+    // A minus sign is not a hex digit, so the parser fails on it.
+    Y_UNIT_TEST(TestNegative) {
+        const TStringBuf blob{
+            "\r\n"
+            "-1"};
+        TVector<int> states{
+            1, 1,
+            -1,
+            /* 1 */};
+        parseByteByByte(blob, states);
+    }
+
+    // Leading zeros are allowed in front of a non-zero chunk size.
+    Y_UNIT_TEST(TestLeadingZero) {
+        const TStringBuf blob{
+            "\r\n"
+            "042\r\n"};
+        THttpChunkParser parser(parseBytesWithLastState(blob, 2));
+        UNIT_ASSERT_EQUAL(parser.chunk_length, 0x42);
+    }
+
+    // A size that does not fit into int is reported with -2 on the digit that
+    // overflows; chunk_length is left untouched.
+    Y_UNIT_TEST(TestIntOverflow) {
+        const TStringBuf blob{
+            "\r\n"
+            "deadbeef"};
+        THttpChunkParser parser(parseBytesWithLastState(blob, -2));
+        UNIT_ASSERT_EQUAL(parser.chunk_length, 0);
+        UNIT_ASSERT_EQUAL(parser.cnt64, 0xdeadbeef);
+    }
+
+    // Feeding chunk payload after the header has been accepted is an error.
+    Y_UNIT_TEST(TestTrivialChunkWithTail) {
+        const TStringBuf blob{
+            "\r\n"
+            "4\r\n"
+            "_" // first byte of the chunk
+        };
+        TVector<int> states{
+            1, 1,
+            1, 1, 2,
+            -1};
+        parseByteByByte(blob, states);
+    }
+
+    // The last (zero-sized) chunk may carry an extension and trailer headers.
+    Y_UNIT_TEST(TestLastChunk) {
+        // NB: current parser does not permit whitespace before `foo`,
+        // but I've never seen the feature in real-life traffic
+        const TStringBuf blob{
+            "\r\n"
+            "000 ;foo = bar \r\n"
+            "Trailer: bar\r\n"
+            "\r\n"};
+        THttpChunkParser parser(parseBytesWithLastState(blob, 2));
+        UNIT_ASSERT_EQUAL(parser.chunk_length, 0);
+    }
+}
diff --git a/library/cpp/http/fetch/httpheader.cpp b/library/cpp/http/fetch/httpheader.cpp
new file mode 100644
index 0000000000..7d2225b8b7
--- /dev/null
+++ b/library/cpp/http/fetch/httpheader.cpp
@@ -0,0 +1,7 @@
+#include "httpheader.h"
+
+// Definitions of the default ("not set") values declared as extern in
+// httpheader.h; every one of them is -1.
+const i64 DEFAULT_RETRY_AFTER = -1;
+const i64 DEFAULT_IF_MODIFIED_SINCE = -1;
+const i32 DEFAULT_MAX_AGE = -1;
+const i8 DEFAULT_REQUEST_PRIORITY = -1;
+const i32 DEFAULT_RESPONSE_TIMEOUT = -1;
diff --git a/library/cpp/http/fetch/httpheader.h b/library/cpp/http/fetch/httpheader.h
new file mode 100644
index 0000000000..b2810bbd41
--- /dev/null
+++ b/library/cpp/http/fetch/httpheader.h
@@ -0,0 +1,287 @@
+#pragma once
+
+#include "exthttpcodes.h"
+
+#include <library/cpp/mime/types/mime.h>
+
+#include <util/system/defaults.h>
+#include <util/system/compat.h>
+#include <util/generic/string.h>
+#include <util/generic/ylimits.h>
+#include <util/system/maxlen.h>
+
+#include <ctime>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+
+// This is an ugly solution, but doing it the right way would take a lot of work.
+#define FETCHER_URL_MAX 8192
+
+// Default values used by the Init() methods below for fields that have not
+// been set from a header.
+extern const i64 DEFAULT_RETRY_AFTER; /// == -1
+extern const i64 DEFAULT_IF_MODIFIED_SINCE; /// == -1
+extern const i32 DEFAULT_MAX_AGE; /// == -1
+extern const i8 DEFAULT_REQUEST_PRIORITY; /// == -1
+extern const i32 DEFAULT_RESPONSE_TIMEOUT; /// == -1
+
+// Scheme prefix put in front of the host by THttpRequestHeader::GetUrl().
+#define HTTP_PREFIX "http://"
+// Size of the THttpRequestHeader::x_yandex_langregion buffer.
+#define MAX_LANGREGION_LEN 4
+// Size of the fixed char buffers of THttpRequestHeader (from, user_agent and
+// the x_yandex_* string fields).
+#define MAXWORD_LEN 55
+
+/// Values stored in THttpBaseHeader::compression_method. The enumerators are
+/// numbered consecutively starting from zero.
+enum HTTP_COMPRESSION {
+    HTTP_COMPRESSION_UNSET = 0,
+    HTTP_COMPRESSION_ERROR,    // 1
+    HTTP_COMPRESSION_IDENTITY, // 2
+    HTTP_COMPRESSION_GZIP,     // 3
+    HTTP_COMPRESSION_DEFLATE,  // 4
+    HTTP_COMPRESSION_COMPRESS, // 5
+    HTTP_COMPRESSION_MAX       // 6
+};
+
+/// Values stored in THttpRequestHeader::http_method.
+enum HTTP_METHOD {
+    HTTP_METHOD_UNDEFINED = -1,
+    HTTP_METHOD_OPTIONS = 0,
+    HTTP_METHOD_GET = 1,
+    HTTP_METHOD_HEAD = 2,
+    HTTP_METHOD_POST = 3,
+    HTTP_METHOD_PUT = 4,
+    HTTP_METHOD_DELETE = 5,
+    HTTP_METHOD_TRACE = 6,
+    HTTP_METHOD_CONNECT = 7,
+    HTTP_METHOD_EXTENSION = 8
+};
+
+/// Values stored in THttpBaseHeader::connection_closed.
+enum HTTP_CONNECTION {
+    HTTP_CONNECTION_UNDEFINED = -1,
+    HTTP_CONNECTION_KEEP_ALIVE, // 0
+    HTTP_CONNECTION_CLOSE       // 1
+};
+
+/// Header fields shared by HTTP requests and HTTP responses.
+struct THttpBaseHeader {
+    i16 error;
+    i32 header_size;
+    i32 entity_size;
+    i64 content_length;
+    i64 http_time;                   // seconds since epoch
+    i64 content_range_start;         // Content-Range: first-byte-pos
+    i64 content_range_end;           // Content-Range: last-byte-pos
+    i64 content_range_entity_length; // Content-Range: entity-length
+    i8 http_minor;
+    i8 mime_type;
+    i8 charset;
+    i8 compression_method;
+    i8 transfer_chunked;
+    i8 connection_closed;
+    TString base;
+
+    /// Puts every field into its initial state: the error and the sizes are
+    /// zeroed, the remaining numeric fields get their "unset" value and `base`
+    /// is cleared.
+    void Init() {
+        error = 0;
+        header_size = 0;
+        entity_size = 0;
+
+        content_length = -1;
+        http_time = -1;
+        content_range_start = -1;
+        content_range_end = -1;
+        content_range_entity_length = -1;
+
+        http_minor = -1;
+        mime_type = -1;
+        charset = -1;
+        compression_method = HTTP_COMPRESSION_UNSET;
+        transfer_chunked = -1;
+        connection_closed = HTTP_CONNECTION_UNDEFINED;
+
+        base.clear();
+    }
+
+    /// Dumps the fields to stdout, one "name: value" line per field.
+    void Print() const {
+        printf("content_length: %" PRIi64 "\n", content_length);
+        printf("http_time: %" PRIi64 "\n", http_time);
+        printf("http_minor: %" PRIi8 "\n", http_minor);
+        printf("mime_type: %" PRIi8 "\n", mime_type);
+        printf("charset: %" PRIi8 "\n", charset);
+        printf("compression_method: %" PRIi8 "\n", compression_method);
+        printf("transfer_chunked: %" PRIi8 "\n", transfer_chunked);
+        printf("connection_closed: %" PRIi8 "\n", connection_closed);
+        printf("content_range_start: %" PRIi64 "\n", content_range_start);
+        printf("content_range_end: %" PRIi64 "\n", content_range_end);
+        printf("content_range_entity_length: %" PRIi64 "\n", content_range_entity_length);
+        printf("base: \"%s\"\n", base.c_str());
+        printf("error: %" PRIi16 "\n", error);
+    }
+
+    /// Sets `base` from `path`. A path starting with '/' is prefixed with
+    /// "http://" and the first `hostNameLength` characters of `hostNamePtr`;
+    /// any other path is stored unchanged. Returns the current `error` value.
+    int SetBase(const char* path,
+                const char* hostNamePtr = nullptr,
+                int hostNameLength = 0) {
+        if (*path != '/') {
+            base = path;
+            return error;
+        }
+        base = "http://";
+        base += TStringBuf(hostNamePtr, hostNameLength);
+        base += path;
+        return error;
+    }
+};
+
+/// Capacity of the THttpHeader::hreflangs buffer.
+enum { HREFLANG_MAX = FETCHER_URL_MAX * 2 };
+
+/// HTTP response header.
+struct THttpHeader: public THttpBaseHeader {
+    i8 accept_ranges;
+    i8 squid_error;
+    i8 x_robots_tag; // deprecated, use x_robots_state instead
+    i16 http_status;
+    TString location;
+    TString rel_canonical;
+    char hreflangs[HREFLANG_MAX];
+    i64 retry_after;
+    TString x_robots_state; // 'xxxxx' format, see `library/html/zoneconf/parsefunc.cpp`
+
+    /// Resets the base part first and then the response specific fields.
+    void Init() {
+        THttpBaseHeader::Init();
+
+        accept_ranges = -1;
+        squid_error = 0;
+        x_robots_tag = 0;
+        http_status = -1;
+        retry_after = DEFAULT_RETRY_AFTER;
+
+        location.clear();
+        rel_canonical.clear();
+        hreflangs[0] = 0;
+        x_robots_state = "xxxxx";
+    }
+
+    /// Dumps the base fields followed by the status, squid error, accept
+    /// ranges, location and retry-after values to stdout.
+    void Print() const {
+        THttpBaseHeader::Print();
+        printf("http_status: %" PRIi16 "\n", http_status);
+        printf("squid_error: %" PRIi8 "\n", squid_error);
+        printf("accept_ranges: %" PRIi8 "\n", accept_ranges);
+        printf("location: \"%s\"\n", location.c_str());
+        printf("retry_after: %" PRIi64 "\n", retry_after);
+    }
+};
+
+/// HTTP request header.
+struct THttpRequestHeader: public THttpBaseHeader {
+public:
+    TString request_uri;
+    char host[HOST_MAX];
+    char from[MAXWORD_LEN];
+    char user_agent[MAXWORD_LEN];
+    char x_yandex_langregion[MAX_LANGREGION_LEN];
+    char x_yandex_sourcename[MAXWORD_LEN];
+    char x_yandex_requesttype[MAXWORD_LEN];
+    char x_yandex_fetchoptions[MAXWORD_LEN];
+    i8 http_method;
+    i8 x_yandex_request_priority;
+    i32 x_yandex_response_timeout;
+    i32 max_age;
+    i64 if_modified_since;
+
+public:
+    /// Creates a header with every field in its initial state.
+    THttpRequestHeader() {
+        Init();
+    }
+
+    /// Clears all strings and resets the numeric fields to their defaults,
+    /// then resets the base part.
+    void Init() {
+        request_uri.clear();
+        host[0] = 0;
+        from[0] = 0;
+        user_agent[0] = 0;
+        x_yandex_langregion[0] = 0;
+        x_yandex_sourcename[0] = 0;
+        x_yandex_requesttype[0] = 0;
+        x_yandex_fetchoptions[0] = 0;
+        http_method = HTTP_METHOD_UNDEFINED;
+        x_yandex_request_priority = DEFAULT_REQUEST_PRIORITY;
+        x_yandex_response_timeout = DEFAULT_RESPONSE_TIMEOUT;
+        max_age = DEFAULT_MAX_AGE;
+        if_modified_since = DEFAULT_IF_MODIFIED_SINCE;
+        THttpBaseHeader::Init();
+    }
+
+    /// Dumps the base fields followed by a subset of the request fields to
+    /// stdout.
+    void Print() const {
+        THttpBaseHeader::Print();
+        printf("request_uri: \"%s\"\n", request_uri.c_str());
+        printf("host: \"%s\"\n", host);
+        printf("from: \"%s\"\n", from);
+        printf("user_agent: \"%s\"\n", user_agent);
+        printf("http_method: %" PRIi8 "\n", http_method);
+        printf("response_timeout: %" PRIi32 "\n", x_yandex_response_timeout);
+        printf("max_age: %" PRIi32 "\n", max_age);
+        printf("if_modified_since: %" PRIi64 "\n", if_modified_since);
+    }
+
+    /// Builds the URL of the request. When no host was parsed the request URI
+    /// is returned as is (this is the case for proxy requests with an absolute
+    /// URI), otherwise the result is "http://" + host + request URI.
+    /// It doesn't care about errors in request or headers, where
+    /// request_uri equals to '*'.
+    /// The result is returned by value; the caller owns nothing else.
+    TString GetUrl() const {
+        if (host[0] == 0) {
+            return request_uri;
+        }
+        TString url = HTTP_PREFIX;
+        url += host;
+        url += request_uri;
+        return url;
+    }
+
+    /// Same as GetUrl() but writes the URL into `buffer`, which has room for
+    /// `size` bytes; `size` is passed on to strlcpy / snprintf as the limit.
+    /// Returns `buffer`.
+    char* GetUrl(char* buffer, size_t size) const {
+        if (host[0] == 0) {
+            strlcpy(buffer, request_uri.c_str(), size);
+        } else {
+            snprintf(buffer, size, "http://%s%s", host, request_uri.c_str());
+        }
+        return buffer;
+    }
+};
+
+/// Response header extended with the parameters of an HTTP authentication
+/// challenge. `realm`, `nonce` and `opaque` are owned by the object and are
+/// released with free() in the destructor, so they must be allocated with a
+/// malloc-compatible allocator.
+class THttpAuthHeader: public THttpHeader {
+public:
+    char* realm;
+    char* nonce;
+    char* opaque;
+    bool stale;
+    int algorithm;
+    bool qop_auth;
+    bool use_auth;
+
+    //we do not provide auth-int variant as too heavy
+    //bool qop_auth_int;
+
+    THttpAuthHeader()
+        : realm(nullptr)
+        , nonce(nullptr)
+        , opaque(nullptr)
+        , stale(false)
+        , algorithm(0)
+        , qop_auth(false)
+        , use_auth(true)
+    {
+        THttpHeader::Init();
+    }
+
+    /// Deep copy: the implicitly generated copy would share the owned strings
+    /// between both objects and free them twice.
+    THttpAuthHeader(const THttpAuthHeader& other)
+        : THttpHeader(other)
+        , realm(CloneCString(other.realm))
+        , nonce(CloneCString(other.nonce))
+        , opaque(CloneCString(other.opaque))
+        , stale(other.stale)
+        , algorithm(other.algorithm)
+        , qop_auth(other.qop_auth)
+        , use_auth(other.use_auth)
+    {
+    }
+
+    /// Deep-copy assignment; self-assignment is a no-op.
+    THttpAuthHeader& operator=(const THttpAuthHeader& other) {
+        if (this != &other) {
+            // Duplicate first so the old strings stay valid until we are done.
+            char* newRealm = CloneCString(other.realm);
+            char* newNonce = CloneCString(other.nonce);
+            char* newOpaque = CloneCString(other.opaque);
+
+            THttpHeader::operator=(other);
+
+            free(realm);
+            free(nonce);
+            free(opaque);
+            realm = newRealm;
+            nonce = newNonce;
+            opaque = newOpaque;
+
+            stale = other.stale;
+            algorithm = other.algorithm;
+            qop_auth = other.qop_auth;
+            use_auth = other.use_auth;
+        }
+        return *this;
+    }
+
+    ~THttpAuthHeader() {
+        free(realm);
+        free(nonce);
+        free(opaque);
+    }
+
+    /// Dumps the response header and, when `use_auth` is set, the
+    /// authentication parameters that are present.
+    void Print() const {
+        THttpHeader::Print();
+        if (use_auth) {
+            if (realm)
+                printf("realm: \"%s\"\n", realm);
+            if (nonce)
+                printf("nonce: \"%s\"\n", nonce);
+            if (opaque)
+                printf("opaque: \"%s\"\n", opaque);
+            printf("stale: %d\n", stale);
+            printf("algorithm: %d\n", algorithm);
+            printf("qop_auth: %d\n", qop_auth);
+        }
+    }
+
+private:
+    /// Returns a malloc-allocated copy of the NUL-terminated string `src`, or
+    /// nullptr when `src` is null or the allocation fails.
+    static char* CloneCString(const char* src) {
+        if (!src) {
+            return nullptr;
+        }
+        const size_t size = strlen(src) + 1;
+        char* copy = static_cast<char*>(malloc(size));
+        if (copy) {
+            memcpy(copy, src, size);
+        }
+        return copy;
+    }
+};
diff --git a/library/cpp/http/fetch/httpload.cpp b/library/cpp/http/fetch/httpload.cpp
new file mode 100644
index 0000000000..82ea8900b5
--- /dev/null
+++ b/library/cpp/http/fetch/httpload.cpp
@@ -0,0 +1,373 @@
+#include "httpload.h"
+
+/************************************************************/
+/************************************************************/
+// Creates a reader that pulls response bytes from `agent` through a private
+// buffer of `bufSize` bytes and parses them into the embedded auth header.
+// `baseUrl` is handed to the header via SetBase(); if that leaves an error in
+// the header the reader starts in hp_error, otherwise in hp_in_header.
+httpAgentReader::httpAgentReader(httpSpecialAgent& agent,
+                                 const char* baseUrl,
+                                 bool assumeConnectionClosed,
+                                 bool use_auth,
+                                 int bufSize)
+    : Header_()
+    , Agent_(agent)
+    , Buffer_(new char[bufSize])
+    , BufPtr_(Buffer_)
+    , BufSize_(bufSize)
+    , BufRest_(0)
+{
+    Header = &Header_;
+    HeadRequest = false;
+
+    // The two calls differ only in the static pointer type they pass
+    // (THttpAuthHeader* vs THttpHeader*).
+    // NOTE(review): presumably HeaderParser.Init is overloaded on it — confirm.
+    if (!use_auth)
+        HeaderParser.Init(Header);
+    else
+        HeaderParser.Init(&Header_);
+
+    setAssumeConnectionClosed(assumeConnectionClosed ? 1 : 0);
+    Header_.SetBase(baseUrl);
+
+    State = Header_.error ? hp_error : hp_in_header;
+}
+
+/************************************************************/
+// Releases the read buffer that the constructor allocated with new[].
+httpAgentReader::~httpAgentReader() {
+ delete[] Buffer_;
+}
+
+/************************************************************/
+// Refills the internal buffer from the agent. May only be called when the
+// previous contents are fully consumed (BufRest_ == 0).
+// When the read returns nothing or fails, the reader is put into its
+// end-of-input state: BufRest_ == -1 and BufPtr_ == nullptr. That state is
+// sticky: later calls return without touching the agent again.
+void httpAgentReader::readBuf() {
+    assert(BufRest_ == 0);
+    if (BufPtr_ == nullptr) {
+        BufRest_ = -1;
+        return;
+    }
+
+    BufRest_ = Agent_.read(Buffer_, BufSize_);
+    if (BufRest_ > 0) {
+        BufPtr_ = Buffer_;
+        return;
+    }
+
+    BufRest_ = -1;
+    BufPtr_ = nullptr;
+}
+
+/************************************************************/
+// Drives the parser until the response header is complete.
+// Returns the parsed header, or nullptr when the input ends first (the header
+// error becomes HTTP_CONNECTION_LOST), when the parser ends in hp_error, or
+// when the header carries an error code. If parsing ends in hp_eof or
+// hp_error the reader is also switched to its end-of-input state.
+const THttpHeader* httpAgentReader::readHeader() {
+    while (State == hp_in_header) {
+        if (!step()) {
+            Header_.error = HTTP_CONNECTION_LOST;
+            return nullptr;
+        }
+        ParseGeneric(BufPtr_, BufRest_);
+    }
+
+    const bool failed = (State == hp_error);
+    if (failed || State == hp_eof) {
+        BufPtr_ = nullptr;
+        BufRest_ = -1;
+    }
+
+    if (failed || Header_.error)
+        return nullptr;
+    return &Header_;
+}
+
+/************************************************************/
+// Returns the next non-empty piece of the response body.
+// `buf` is set to the current position inside the internal buffer and the
+// return value is the number of bytes reported by the parser; 0 is returned
+// once step() reports that no more input is available.
+// Must not be called while the header is still being parsed (see the assert).
+long httpAgentReader::readPortion(void*& buf) {
+ assert(State != hp_in_header);
+
+ long Chunk = 0;
+ do {
+ // NOTE(review): BufSize_ is the buffer capacity given to the constructor,
+ // so this guard only fires for a zero-sized buffer; BufRest_ may have
+ // been intended — confirm.
+ if (BufSize_ == 0 && !BufPtr_)
+ return 0;
+
+ if (!step())
+ return 0;
+
+ Chunk = ParseGeneric(BufPtr_, BufRest_);
+ buf = BufPtr_;
+
+ // Parser failed after counting more entity bytes than Content-Length
+ // announced: cut the surplus off the chunk, clear the error and treat
+ // the response as complete.
+ if (State == hp_error && Header_.entity_size > Header_.content_length) {
+ Chunk -= (Header_.entity_size - Header_.content_length);
+ BufPtr_ = (char*)BufPtr_ + Chunk;
+ BufRest_ = 0;
+ State = hp_eof;
+ Header_.error = 0;
+ break;
+ }
+
+ // Step over the bytes that are being handed to the caller.
+ BufPtr_ = (char*)BufPtr_ + Chunk;
+ BufRest_ -= Chunk;
+
+ // A finished or failed response puts the reader into its end-of-input
+ // state, so that eof() becomes true.
+ if (State == hp_eof || State == hp_error) {
+ BufRest_ = -1;
+ BufPtr_ = nullptr;
+ }
+ } while (!Chunk);
+ return Chunk;
+}
+
+/************************************************************/
+// Reads and discards body data until the reader reports end of input.
+// Returns true only if the parser finished in hp_eof, i.e. the response was
+// consumed completely rather than ending in an error.
+bool httpAgentReader::skipTheRest() {
+    void* discarded;
+    while (!eof()) {
+        readPortion(discarded);
+    }
+    return State == hp_eof;
+}
+
+/************************************************************/
+/************************************************************/
+// Creates an idle agent: no reader, no extra request headers, no error and no
+// real-host override. `factory` is kept by reference and is later asked for a
+// socket handler; `handleAuthorization` decides whether URLs that carry
+// credentials are accepted.
+httpLoadAgent::httpLoadAgent(bool handleAuthorization,
+ socketHandlerFactory& factory)
+ : Factory_(factory)
+ , HandleAuthorization_(handleAuthorization)
+ , URL_()
+ , PersistentConn_(false)
+ , Reader_(nullptr)
+ , Headers_()
+ , ErrCode_(0)
+ , RealHost_(nullptr)
+{
+}
+
+/************************************************************/
+// Destroys the current reader (if any) and frees the strdup()-ed real host.
+httpLoadAgent::~httpLoadAgent() {
+ delete Reader_;
+ free(RealHost_);
+}
+
+/************************************************************/
+void httpLoadAgent::clearReader() {
+ if (Reader_) {
+ bool opened = false;
+ if (PersistentConn_) {
+ const THttpHeader* H = Reader_->readHeader();
+ if (H && !H->connection_closed) {
+ Reader_->skipTheRest();
+ opened = true;
+ }
+ }
+ if (!opened)
+ Disconnect();
+ delete Reader_;
+ Reader_ = nullptr;
+ }
+ ErrCode_ = 0;
+}
+/************************************************************/
+// Replaces the stored real-host override with a private copy of `hostname`
+// (nullptr removes the override) and resets the error code.
+void httpLoadAgent::setRealHost(const char* hostname) {
+    free(RealHost_);
+    RealHost_ = hostname ? strdup(hostname) : nullptr;
+    ErrCode_ = 0;
+}
+
+/************************************************************/
+// Queues an "If-Modified-Since" request header carrying the given date string.
+// The line is assembled in a TString rather than a fixed-size stack buffer, so
+// a long value is never truncated before its terminating CRLF (a truncated
+// line would corrupt the request). A null date adds nothing.
+void httpLoadAgent::setIMS(const char* ifModifiedSince) {
+    if (!ifModifiedSince)
+        return;
+
+    TString header = "If-Modified-Since: ";
+    header += ifModifiedSince;
+    header += "\r\n";
+    Headers_.push_back(header);
+}
+
+/************************************************************/
+// Appends `instr` to the list of extra header lines passed along with each
+// request. The text is stored as given.
+// NOTE(review): other header lines built in this file end with "\r\n";
+// presumably the caller must include that terminator — confirm.
+void httpLoadAgent::addHeaderInstruction(const char* instr) {
+ Headers_.push_back(instr);
+}
+
+/************************************************************/
+// Forgets every extra header line queued by setIMS()/addHeaderInstruction().
+void httpLoadAgent::dropHeaderInstructions() {
+ Headers_.clear();
+}
+
+/************************************************************/
+// Starts a GET request for an already parsed URL.
+// Returns false without touching the network when the URL is not a valid
+// absolute one, or when it carries credentials while authorization handling
+// is switched off; otherwise returns whether connecting and sending succeeded.
+bool httpLoadAgent::startRequest(const THttpURL& url,
+                                 bool persistent,
+                                 const TAddrList& addrs)
+
+{
+    clearReader();
+    ErrCode_ = 0;
+
+    URL_.Clear();
+    URL_ = url;
+    PersistentConn_ = persistent;
+
+    if (!URL_.IsValidAbs())
+        return false;
+    if (!(HandleAuthorization_ || URL_.IsNull(THttpURL::FlagAuth)))
+        return false;
+
+    if (!doSetHost(addrs))
+        return false;
+    return doStartRequest();
+}
+
+/************************************************************/
+// Starts a GET request for `url`, parsed against the optional base
+// `url_to_merge`. Authorization syntax is accepted by the parser only when
+// authorization handling is enabled. Returns false if the URL does not parse
+// or is not valid as a global URL, or if connecting / sending fails.
+bool httpLoadAgent::startRequest(const char* url,
+                                 const char* url_to_merge,
+                                 bool persistent,
+                                 const TAddrList& addrs) {
+    clearReader();
+
+    URL_.Clear();
+    PersistentConn_ = persistent;
+
+    long parseFlags = THttpURL::FeatureSchemeKnown | THttpURL::FeaturesNormalizeSet;
+    if (HandleAuthorization_)
+        parseFlags |= THttpURL::FeatureAuthSupported;
+
+    if (URL_.Parse(url, parseFlags, url_to_merge))
+        return false;
+    if (!URL_.IsValidGlobal())
+        return false;
+
+    if (!doSetHost(addrs))
+        return false;
+    return doStartRequest();
+}
+
+/************************************************************/
+// IPv4-only variant of startRequest(): same URL handling as the TAddrList
+// overload, but the target address is built from `ip` and the port of the
+// parsed URL.
+bool httpLoadAgent::startRequest(const char* url,
+                                 const char* url_to_merge,
+                                 bool persistent,
+                                 ui32 ip) {
+    clearReader();
+
+    URL_.Clear();
+    PersistentConn_ = persistent;
+
+    long parseFlags = THttpURL::FeatureSchemeKnown | THttpURL::FeaturesNormalizeSet;
+    if (HandleAuthorization_)
+        parseFlags |= THttpURL::FeatureAuthSupported;
+
+    if (URL_.Parse(url, parseFlags, url_to_merge))
+        return false;
+    if (!URL_.IsValidGlobal())
+        return false;
+
+    if (!doSetHost(TAddrList::MakeV4Addr(ip, URL_.GetPort())))
+        return false;
+    return doStartRequest();
+}
+
+/************************************************************/
+// Prepares the connection for the host of URL_.
+// Installs the socket handler chosen by the factory, resolves/sets the host
+// (using `addrs` when it is not empty), applies the real-host override to the
+// Host header and registers URL credentials with the digest handler.
+// Returns false when no handler is available, when SetHost() reports an error
+// (left in ErrCode_), or when the URL has credentials but authorization
+// handling is disabled (ErrCode_ becomes HTTP_UNAUTHORIZED).
+bool httpLoadAgent::doSetHost(const TAddrList& addrs) {
+    socketAbstractHandler* handler = Factory_.chooseHandler(URL_);
+    if (handler == nullptr)
+        return false;
+    Socket.setHandler(handler);
+
+    if (addrs.size() == 0) {
+        ErrCode_ = SetHost(URL_.Get(THttpURL::FieldHost),
+                           URL_.GetPort());
+    } else {
+        ErrCode_ = SetHost(URL_.Get(THttpURL::FieldHost),
+                           URL_.GetPort(), addrs);
+    }
+    if (ErrCode_)
+        return false;
+
+    if (RealHost_) {
+        // Replace the Host header with the override; the 20 spare bytes
+        // cover the "Host: " prefix, the CRLF and the terminator.
+        free(Hostheader);
+        Hostheader = (char*)malloc(strlen(RealHost_) + 20);
+        sprintf(Hostheader, "Host: %s\r\n", RealHost_);
+    }
+
+    if (URL_.IsNull(THttpURL::FlagAuth))
+        return true;
+
+    if (!HandleAuthorization_) {
+        ErrCode_ = HTTP_UNAUTHORIZED;
+        return false;
+    }
+
+    Digest_.setAuthorization(URL_.Get(THttpURL::FieldUsername),
+                             URL_.Get(THttpURL::FieldPassword));
+    return true;
+}
+
+/************************************************************/
+// Selects the host for a series of requests on a persistent connection.
+// `host_url` is parsed with the same flags as in startRequest(); no request
+// is sent here. Returns false if the URL does not parse, is not valid as a
+// global URL, or if doSetHost() fails.
+bool httpLoadAgent::setHost(const char* host_url,
+                            const TAddrList& addrs) {
+    clearReader();
+
+    URL_.Clear();
+    PersistentConn_ = true;
+
+    long parseFlags = THttpURL::FeatureSchemeKnown | THttpURL::FeaturesNormalizeSet;
+    if (HandleAuthorization_)
+        parseFlags |= THttpURL::FeatureAuthSupported;
+
+    if (URL_.Parse(host_url, parseFlags))
+        return false;
+    if (!URL_.IsValidGlobal())
+        return false;
+
+    return doSetHost(addrs);
+}
+
+/************************************************************/
+// Sends a request for `local_url` to the host that URL_ already describes.
+// Only the path and query of `local_url` are taken over into URL_.
+// Note that the validity test is deliberately not negated: a URL that fails
+// to parse, or one that is valid as a *global* URL, is rejected.
+bool httpLoadAgent::startOneRequest(const char* local_url) {
+    clearReader();
+
+    THttpURL localUrl;
+    if (localUrl.Parse(local_url, THttpURL::FeaturesNormalizeSet))
+        return false;
+    if (localUrl.IsValidGlobal())
+        return false;
+
+    URL_.SetInMemory(THttpURL::FieldPath, localUrl.Get(THttpURL::FieldPath));
+    URL_.SetInMemory(THttpURL::FieldQuery, localUrl.Get(THttpURL::FieldQuery));
+    URL_.Rewrite();
+
+    return doStartRequest();
+}
+
+/************************************************************/
+// Sends the GET request for the path/query of URL_ and reads the response
+// header.
+//
+// The request carries every queued extra header plus, when available, the
+// digest authorization header. If the first response is 401 and the digest
+// handler accepts the challenge, the connection is dropped and the request is
+// repeated once with the new credentials.
+//
+// Returns true when a response header was read; the reader then stays alive
+// for body access. Returns false on failure, in which case error() reports
+// the reason: the send error, or the error recorded while reading the header.
+bool httpLoadAgent::doStartRequest() {
+    TString urlStr = URL_.PrintS(THttpURL::FlagPath | THttpURL::FlagQuery);
+    if (!urlStr)
+        urlStr = "/";
+
+    for (int step = 0; step < 10; step++) {
+        const char* digestHeader = Digest_.getHeaderInstruction();
+
+        // Null-terminated array: queued headers, optional digest header, nullptr.
+        unsigned i = (digestHeader) ? 2 : 1;
+        const char** headers =
+            (const char**)(alloca((i + Headers_.size()) * sizeof(char*)));
+
+        for (i = 0; i < Headers_.size(); i++)
+            headers[i] = Headers_[i].c_str();
+        if (digestHeader)
+            headers[i++] = digestHeader;
+        headers[i] = nullptr;
+
+        ErrCode_ = RequestGet(urlStr.c_str(), headers, PersistentConn_);
+
+        if (ErrCode_) {
+            Disconnect();
+            return false;
+        }
+
+        TString urlBaseStr = URL_.PrintS(THttpURL::FlagNoFrag);
+
+        clearReader();
+        Reader_ = new httpAgentReader(*this, urlBaseStr.c_str(),
+                                      !PersistentConn_, !Digest_.empty());
+
+        if (Reader_->readHeader()) {
+            // Only the first attempt (step 0) may be retried with credentials.
+            if (getHeader()->http_status == HTTP_UNAUTHORIZED &&
+                step < 1 &&
+                Digest_.processHeader(getAuthHeader(),
+                                      urlStr.c_str(),
+                                      "GET")) {
+                delete Reader_;
+                Reader_ = nullptr;
+                ErrCode_ = 0;
+                Disconnect();
+                continue;
+            }
+
+            return true;
+        }
+
+        // clearReader() destroys the reader and zeroes ErrCode_, after which
+        // error() could only answer HTTP_BAD_URL. Save the real reason first
+        // and restore it afterwards.
+        const int headerError = Reader_->error();
+        Disconnect();
+        clearReader();
+        ErrCode_ = headerError;
+
+        return false;
+    }
+
+    ErrCode_ = HTTP_UNAUTHORIZED;
+    return false;
+}
+
+/************************************************************/
+/************************************************************/
diff --git a/library/cpp/http/fetch/httpload.h b/library/cpp/http/fetch/httpload.h
new file mode 100644
index 0000000000..e22e4b809e
--- /dev/null
+++ b/library/cpp/http/fetch/httpload.h
@@ -0,0 +1,307 @@
+#pragma once
+
+#include "httpagent.h"
+#include "httpparser.h"
+#include "http_digest.h"
+
+#include <util/system/compat.h>
+#include <util/string/vector.h>
+#include <util/network/ip.h>
+#include <library/cpp/uri/http_url.h>
+#include <library/cpp/http/misc/httpcodes.h>
+
+/********************************************************/
+// Section 1: socket handlers
+/********************************************************/
+// The following classes adapt the template-based
+// THttpAgent to work with sockets through a flexible
+// object-oriented scheme.
+
+/********************************************************/
+// This class is used as a base one for flexible
+// socket handling
+// Abstract socket interface. Every operation is pure virtual, and the
+// protected constructor means instances exist only as part of a derived
+// handler. The virtual destructor allows deleting a handler through a
+// pointer to this base.
+class socketAbstractHandler {
+public:
+ // Reports whether the underlying connection is usable.
+ virtual bool Good() = 0;
+
+ // Connects to one of `addrs` within `Timeout`; returns an error code.
+ // NOTE(review): presumably 0 means success — confirm in implementations.
+ virtual int Connect(const TAddrList& addrs, TDuration Timeout) = 0;
+
+ virtual void Disconnect() = 0;
+
+ virtual void shutdown() = 0;
+
+ // Sends `messlen` bytes of `message`; returns whether it succeeded.
+ virtual bool send(const char* message, ssize_t messlen) = 0;
+
+ virtual bool peek() = 0;
+
+ // Reads up to `buflen` bytes into `buffer`; returns the byte count.
+ // Callers in this file treat a result <= 0 as end of input.
+ virtual ssize_t read(void* buffer, ssize_t buflen) = 0;
+
+ virtual ~socketAbstractHandler() {
+ }
+
+protected:
+ socketAbstractHandler() {
+ }
+};
+
+/********************************************************/
+// This class is used as a proxy between THttpAgent and
+// socketAbstractHandler
+// (it is used by template scheme,
+// so it does not have virtual methods)
+/// Owning proxy that lets THttpAgent talk to a socketAbstractHandler.
+///
+/// Every call is forwarded to the installed handler. With no handler
+/// installed the calls are harmless no-ops that report failure: Good() is
+/// false, Connect() returns 1, send()/peek() return false and read() returns 0.
+/// The proxy deletes its handler in the destructor and when it is replaced.
+/// NOTE(review): no copy operations are declared, so a copy would share the
+/// handler and delete it twice — confirm nothing copies this type.
+class TSocketHandlerPtr {
+protected:
+    socketAbstractHandler* Handler_;
+
+public:
+    TSocketHandlerPtr()
+        : Handler_(nullptr)
+    {
+    }
+
+    virtual ~TSocketHandlerPtr() {
+        delete Handler_;
+    }
+
+    int Good() {
+        return (Handler_ && Handler_->Good());
+    }
+
+    int Connect(const TAddrList& addrs, TDuration Timeout) {
+        return (Handler_) ? Handler_->Connect(addrs, Timeout) : 1;
+    }
+
+    void Disconnect() {
+        if (Handler_)
+            Handler_->Disconnect();
+    }
+
+    void shutdown() {
+        if (Handler_)
+            Handler_->shutdown();
+    }
+
+    bool send(const char* message, ssize_t messlen) {
+        return (Handler_) ? Handler_->send(message, messlen) : false;
+    }
+
+    virtual bool peek() {
+        return (Handler_) ? Handler_->peek() : false;
+    }
+
+    virtual ssize_t read(void* buffer, ssize_t buflen) {
+        return (Handler_) ? Handler_->read(buffer, buflen) : 0;
+    }
+
+    /// Takes ownership of `handler` and destroys the previous one.
+    /// Re-installing the handler that is already held is a no-op; without
+    /// this check it would be deleted and left behind as a dangling pointer.
+    void setHandler(socketAbstractHandler* handler) {
+        if (handler == Handler_)
+            return;
+        delete Handler_;
+        Handler_ = handler;
+    }
+};
+
+/********************************************************/
+// Here is the httpAgent that uses the socketAbstractHandler class
+// and its derivatives
+using httpSpecialAgent = THttpAgent<TSocketHandlerPtr>;
+
+/********************************************************/
+// Regular handler is used as implementation of
+// socketAbstractHandler for work through HTTP protocol
+// socketAbstractHandler implementation backed by a TSimpleSocketHandler.
+// Every operation except shutdown() is forwarded to the wrapped socket.
+class socketRegularHandler: public socketAbstractHandler {
+protected:
+ TSimpleSocketHandler Socket_;
+
+public:
+ socketRegularHandler()
+ : Socket_()
+ {
+ }
+
+ bool Good() override {
+ return Socket_.Good();
+ }
+
+ int Connect(const TAddrList& addrs, TDuration Timeout) override {
+ return Socket_.Connect(addrs, Timeout);
+ }
+
+ void Disconnect() override {
+ Socket_.Disconnect();
+ }
+
+ // Intentionally does nothing: the socket is not shut down.
+ void shutdown() override {
+ //Do not block writing to socket
+ //There are servers that works in a bad way with this
+ //mSocket.shutdown();
+ }
+
+ bool send(const char* message, ssize_t messlen) override {
+ return Socket_.send(message, messlen);
+ }
+
+ bool peek() override {
+ return Socket_.peek();
+ }
+
+ ssize_t read(void* buffer, ssize_t buflen) override {
+ return Socket_.read(buffer, buflen);
+ }
+};
+
+/********************************************************/
+// The base factory that allows to choose an appropriate
+// socketAbstractHandler implementation by url schema
+
+// Factory that picks a socketAbstractHandler for a URL. Derive from it and
+// override chooseHandler() to support other schemes.
+class socketHandlerFactory {
+public:
+ virtual ~socketHandlerFactory() {
+ }
+
+ // Returns the handler to use for `url`. httpLoadAgent::doSetHost() treats
+ // a null result as failure and hands a non-null result to
+ // TSocketHandlerPtr::setHandler(), which takes ownership and later deletes it.
+ // NOTE(review): the implementation is not visible here; presumably it
+ // returns a freshly allocated handler for HTTP URLs — confirm.
+ //returns mHandler_HTTP for correct HTTP-based url
+ virtual socketAbstractHandler* chooseHandler(const THttpURL& url);
+
+ // Shared default factory; used as the default argument of httpLoadAgent.
+ static socketHandlerFactory sInstance;
+};
+
+/********************************************************/
+// Section 2: the configurable tool for parsing an HTTP response
+/********************************************************/
+
+/// Pull-style reader of one HTTP response.
+///
+/// Bytes are fetched from the agent into a private buffer and fed to the
+/// generic parser; the parsed header lives inside the reader.
+/// Buffer bookkeeping: BufPtr_ is the current position inside Buffer_ and
+/// BufRest_ the number of unread bytes there. BufRest_ < 0 marks end of input.
+class httpAgentReader: public THttpParserGeneric<1> {
+protected:
+    THttpAuthHeader Header_;
+    httpSpecialAgent& Agent_;
+
+    char* Buffer_;
+    void* BufPtr_;
+    int BufSize_;
+    long BufRest_;
+
+    void readBuf();
+
+    /// Makes sure there is input to parse, refilling the buffer when it is
+    /// exhausted. Returns false once the end of input has been reached.
+    bool step() {
+        if (BufRest_ == 0)
+            readBuf();
+        return !eof();
+    }
+
+public:
+    httpAgentReader(httpSpecialAgent& agent,
+                    const char* baseUrl,
+                    bool assumeConnectionClosed,
+                    bool use_auth = false,
+                    int bufSize = 0x1000);
+
+    ~httpAgentReader();
+
+    /// True once no more input can be read.
+    bool eof() {
+        return BufRest_ < 0;
+    }
+
+    /// Error code stored in the parsed header (0 when there is none).
+    int error() {
+        return Header_.error;
+    }
+
+    void setError(int errCode) {
+        Header_.error = errCode;
+    }
+
+    const THttpAuthHeader* getAuthHeader() {
+        return &Header_;
+    }
+
+    const THttpHeader* readHeader();
+    long readPortion(void*& buf);
+    bool skipTheRest();
+};
+
+/********************************************************/
+// Section 3: the main class
+/********************************************************/
+/// HTTP GET client built on httpSpecialAgent.
+///
+/// A request is started with one of the startRequest() overloads, or with
+/// setHost() followed by startOneRequest(). The response is then available
+/// through getHeader() / readPortion() until the next request is started.
+/// The agent owns its reader and the strdup()-ed real-host string.
+class httpLoadAgent: public httpSpecialAgent {
+protected:
+    socketHandlerFactory& Factory_;
+    bool HandleAuthorization_;
+    THttpURL URL_;
+    bool PersistentConn_;
+    httpAgentReader* Reader_;
+    TVector<TString> Headers_;
+    int ErrCode_;
+    char* RealHost_;
+    httpDigestHandler Digest_;
+
+    void clearReader();
+    bool doSetHost(const TAddrList& addrs);
+    bool doStartRequest();
+
+public:
+    httpLoadAgent(bool handleAuthorization = false,
+                  socketHandlerFactory& factory = socketHandlerFactory::sInstance);
+    ~httpLoadAgent();
+
+    void setRealHost(const char* host);
+    void setIMS(const char* ifModifiedSince);
+    void addHeaderInstruction(const char* instr);
+    void dropHeaderInstructions();
+
+    bool startRequest(const char* url,
+                      const char* url_to_merge = nullptr,
+                      bool persistent = false,
+                      const TAddrList& addrs = TAddrList());
+
+    // deprecated v4-only
+    bool startRequest(const char* url,
+                      const char* url_to_merge,
+                      bool persistent,
+                      ui32 ip);
+
+    bool startRequest(const THttpURL& url,
+                      bool persistent = false,
+                      const TAddrList& addrs = TAddrList());
+
+    bool setHost(const char* host_url,
+                 const TAddrList& addrs = TAddrList());
+
+    bool startOneRequest(const char* local_url);
+
+    /// Header with authentication fields, or nullptr when there is no reader
+    /// or its header has use_auth cleared.
+    const THttpAuthHeader* getAuthHeader() {
+        if (!Reader_)
+            return nullptr;
+        const THttpAuthHeader* authHeader = Reader_->getAuthHeader();
+        return authHeader->use_auth ? Reader_->getAuthHeader() : nullptr;
+    }
+
+    /// Header of the current response, or nullptr when there is no reader.
+    const THttpHeader* getHeader() {
+        return Reader_ ? Reader_->getAuthHeader() : nullptr;
+    }
+
+    const THttpURL& getURL() {
+        return URL_;
+    }
+
+    /// True when there is no reader or the reader has reached end of input.
+    bool eof() {
+        return Reader_ ? Reader_->eof() : true;
+    }
+
+    /// The agent's own error code if set, else the reader's header error;
+    /// HTTP_BAD_URL when neither an error code nor a reader exists.
+    int error() {
+        if (ErrCode_)
+            return ErrCode_;
+        return Reader_ ? Reader_->error() : HTTP_BAD_URL;
+    }
+
+    /// Next piece of the response body, or -1 when there is no reader.
+    long readPortion(void*& buf) {
+        return Reader_ ? Reader_->readPortion(buf) : -1;
+    }
+};
+
+/********************************************************/
diff --git a/library/cpp/http/fetch/httpparser.h b/library/cpp/http/fetch/httpparser.h
new file mode 100644
index 0000000000..769828e4ae
--- /dev/null
+++ b/library/cpp/http/fetch/httpparser.h
@@ -0,0 +1,372 @@
+#pragma once
+
+#include "httpfsm.h"
+#include "httpheader.h"
+
+#include <library/cpp/mime/types/mime.h>
+#include <util/system/yassert.h>
+#include <library/cpp/http/misc/httpcodes.h>
+
+// Default no-op policy for THttpParserGeneric: the hooks do nothing and the
+// size limits are the template arguments (100 KiB header, 1 MiB body by default).
+template <size_t headermax = 100 << 10, size_t bodymax = 1 << 20>
+struct TFakeCheck {
+ // Always returns false.
+ bool Check(THttpHeader* /*header*/) {
+ return false;
+ }
+ void CheckDocPart(void* /*buf*/, size_t /*len*/, THttpHeader* /*header*/) {
+ } //for every part of DocumentBody will be called
+ void CheckEndDoc(THttpHeader* /*header*/) {
+ }
+ size_t GetMaxHeaderSize() {
+ return headermax;
+ }
+ size_t GetMaxBodySize(THttpHeader*) {
+ return bodymax;
+ }
+};
+
+// State and header validation shared by the HTTP response parsers.
+class THttpParserBase {
+public:
+ // Parser states; the hp_read_* and *chunk* states describe how the body
+ // is delimited.
+ enum States {
+ hp_error,
+ hp_eof,
+ hp_in_header,
+ hp_read_alive,
+ hp_read_closed,
+ hp_begin_chunk_header,
+ hp_chunk_header,
+ hp_read_chunk
+ };
+
+ States GetState() {
+ return State;
+ }
+
+ // Non-zero makes CheckHeaders() treat a response without an explicit
+ // connection setting as "connection closed".
+ void setAssumeConnectionClosed(int value) {
+ AssumeConnectionClosed = value;
+ }
+
+ THttpHeader* GetHttpHeader() const {
+ return Header;
+ }
+
+protected:
+ // Validates and normalizes the freshly parsed header.
+ // Returns 0 when parsing may go on, or 1 after storing an error code in
+ // Header->error.
+ int CheckHeaders() {
+ // Statuses below HTTP_OK, "no content" and "not modified" are treated
+ // as having no body.
+ if (Header->http_status < HTTP_OK || Header->http_status == HTTP_NO_CONTENT || Header->http_status == HTTP_NOT_MODIFIED) {
+ Header->content_length = 0;
+ Header->transfer_chunked = 0;
+ }
+ // -1 is normalized to "not chunked"; anything lower is an error.
+ if (Header->transfer_chunked < -1) {
+ Header->error = HTTP_BAD_ENCODING;
+ return 1;
+ } else if (Header->transfer_chunked == -1) {
+ Header->transfer_chunked = 0;
+ }
+ if (!Header->transfer_chunked && Header->content_length < -1) {
+ Header->error = HTTP_BAD_CONTENT_LENGTH;
+ return 1;
+ }
+ // Only unset, identity, gzip and deflate are accepted for a 200 response.
+ if (Header->http_status == HTTP_OK) {
+ if (Header->compression_method != HTTP_COMPRESSION_UNSET &&
+ Header->compression_method != HTTP_COMPRESSION_IDENTITY &&
+ Header->compression_method != HTTP_COMPRESSION_GZIP &&
+ Header->compression_method != HTTP_COMPRESSION_DEFLATE)
+ {
+ Header->error = HTTP_BAD_CONTENT_ENCODING;
+ return 1;
+ }
+ }
+ // Unspecified connection state: closed for HTTP/x.0 or when assumed so.
+ if (Header->connection_closed == -1)
+ Header->connection_closed = (Header->http_minor == 0 ||
+ AssumeConnectionClosed);
+ // A kept-alive, non-chunked response without a length has no
+ // detectable end (unless it answers a HEAD request).
+ if (!Header->transfer_chunked && !Header->connection_closed && Header->content_length < 0 && !HeadRequest) {
+ Header->error = HTTP_LENGTH_UNKNOWN;
+ return 1;
+ }
+ if (Header->http_time < 0)
+ Header->http_time = 0;
+ if (Header->mime_type < 0)
+ Header->mime_type = MIME_UNKNOWN;
+ return 0;
+ }
+
+ THttpHeaderParser HeaderParser;
+ THttpChunkParser ChunkParser;
+ States State;
+ // Bytes of the current chunk that are still to be read.
+ long ChunkSize;
+ THttpHeader* Header;
+ int AssumeConnectionClosed;
+ bool HeadRequest;
+};
+
+// The HTTP response state machine.
+//
+// isReader == 0: ParseGeneric() consumes the whole input in one call and
+// reports body data only through TCheck::CheckDocPart().
+// isReader != 0: ParseGeneric() returns as soon as the header is complete or
+// a piece of body data is available, so that the caller can hand it out.
+template <int isReader, typename TCheck = TFakeCheck<>>
+class THttpParserGeneric: public THttpParserBase, public TCheck {
+protected:
+ // Feeds `size` bytes starting at `buf` to the parser.
+ // `buf` and `size` are updated in place while header and chunk-header
+ // bytes are consumed. In reader mode the return value is the length of
+ // the body piece that starts at `buf` (or, right after the header, the
+ // number of bytes left); 0 means nothing to hand out.
+ // Calling with size == 0 signals end of input.
+ long ParseGeneric(void*& buf, long& size) {
+ if (!size) {
+ // End of input: only a body delimited by connection close may end
+ // here normally; elsewhere it is an error unless this was a HEAD request.
+ switch (State) {
+ case hp_error:
+ case hp_eof:
+ break;
+ case hp_read_closed:
+ State = hp_eof;
+ break;
+ case hp_in_header:
+ Header->error = HTTP_HEADER_EOF;
+ State = hp_error;
+ break;
+ case hp_read_alive:
+ case hp_read_chunk:
+ if (HeadRequest)
+ State = hp_eof;
+ else {
+ Header->error = HTTP_MESSAGE_EOF;
+ State = hp_error;
+ }
+ break;
+ case hp_begin_chunk_header:
+ case hp_chunk_header:
+ if (HeadRequest)
+ State = hp_eof;
+ else {
+ Header->error = HTTP_CHUNK_EOF;
+ State = hp_error;
+ }
+ break;
+ }
+ return 0;
+ }
+ while (size) {
+ int ret;
+
+ switch (State) {
+ case hp_error:
+ return 0;
+
+ case hp_eof:
+ return 0;
+
+ case hp_in_header:
+ if ((ret = HeaderParser.Execute(buf, size)) < 0) {
+ Header->error = HTTP_BAD_HEADER_STRING;
+ State = hp_error;
+ return 0;
+ } else if (ret == 2) {
+ // Header complete: step over it and choose the body state
+ // from the validated header fields.
+ Header->header_size += i32(HeaderParser.lastchar - (char*)buf + 1);
+ size -= long(HeaderParser.lastchar - (char*)buf + 1);
+ buf = HeaderParser.lastchar + 1;
+ State = CheckHeaders() ? hp_error
+ : Header->transfer_chunked ? hp_begin_chunk_header
+ : Header->content_length == 0 ? hp_eof
+ : Header->content_length > 0 ? hp_read_alive
+ : hp_read_closed;
+ if (State == hp_begin_chunk_header) {
+ // unget \n for chunk reader
+ buf = (char*)buf - 1;
+ size++;
+ }
+ if (isReader)
+ return size;
+ } else {
+ // Header continues in the next input block.
+ Header->header_size += size;
+ size = 0;
+ }
+ break;
+
+ case hp_read_alive:
+ // Note: the whole remaining input is counted and reported as
+ // entity data, even when it goes beyond content_length.
+ Header->entity_size += size;
+ if (Header->entity_size >= Header->content_length) {
+ State = hp_eof;
+ }
+
+ TCheck::CheckDocPart(buf, size, Header);
+ if (isReader)
+ return size;
+ size = 0;
+ break;
+
+ case hp_read_closed:
+ // Body runs until end of input; see the size == 0 handling above.
+ Header->entity_size += size;
+ TCheck::CheckDocPart(buf, size, Header);
+ if (isReader)
+ return size;
+ size = 0;
+ break;
+
+ case hp_begin_chunk_header:
+ ChunkParser.Init();
+ State = hp_chunk_header;
+ [[fallthrough]];
+
+ case hp_chunk_header:
+ if ((ret = ChunkParser.Execute(buf, size)) < 0) {
+ Header->error = i16(ret == -2 ? HTTP_CHUNK_TOO_LARGE : HTTP_BAD_CHUNK);
+ State = hp_error;
+ return 0;
+ } else if (ret == 2) {
+ // Chunk header complete; its bytes count towards entity_size.
+ // A zero-length chunk ends the body.
+ Header->entity_size += i32(ChunkParser.lastchar - (char*)buf + 1);
+ size -= long(ChunkParser.lastchar - (char*)buf + 1);
+ buf = ChunkParser.lastchar + 1;
+ ChunkSize = ChunkParser.chunk_length;
+ Y_ASSERT(ChunkSize >= 0);
+ State = ChunkSize ? hp_read_chunk : hp_eof;
+ } else {
+ Header->entity_size += size;
+ size = 0;
+ }
+ break;
+
+ case hp_read_chunk:
+ if (size >= ChunkSize) {
+ // The rest of the chunk is in this input block.
+ Header->entity_size += ChunkSize;
+ State = hp_begin_chunk_header;
+ TCheck::CheckDocPart(buf, ChunkSize, Header);
+ if (isReader)
+ return ChunkSize;
+ size -= ChunkSize;
+ buf = (char*)buf + ChunkSize;
+ } else {
+ // The chunk continues in the next input block.
+ Header->entity_size += size;
+ ChunkSize -= size;
+ TCheck::CheckDocPart(buf, size, Header);
+ if (isReader)
+ return size;
+ size = 0;
+ }
+ break;
+ }
+ }
+ return size;
+ }
+};
+
+/// Push-style HTTP response parser: the caller feeds input blocks with
+/// Parse(), and body data is delivered through TCheck::CheckDocPart().
+template <class TCheck = TFakeCheck<>>
+class THttpParser: public THttpParserGeneric<0, TCheck> {
+    // Alias for the dependent base; its members must be named through it.
+    using TBaseT = THttpParserGeneric<0, TCheck>;
+
+public:
+    /// Binds the parser to header `H` and resets it to the start of a
+    /// response. `head_request` marks the response as an answer to HEAD.
+    void Init(THttpHeader* H, bool head_request = false) {
+        TBaseT::Header = H;
+        TBaseT::HeaderParser.Init(TBaseT::Header);
+        TBaseT::HeadRequest = head_request;
+        TBaseT::AssumeConnectionClosed = 0;
+        TBaseT::State = TBaseT::hp_in_header;
+    }
+
+    /// Feeds the next `size` bytes of the response; size 0 means end of input.
+    void Parse(void* buf, long size) {
+        TBaseT::ParseGeneric(buf, size);
+    }
+};
+
+/// Reader over a single in-memory block, for use with THttpReader.
+/// Init() must be called before Read(); the members are not set otherwise.
+class TMemoReader {
+public:
+    /// Remembers the block to serve. Always returns 0.
+    int Init(void* buf, long bufsize) {
+        Bufsize = bufsize;
+        Buf = buf;
+        return 0;
+    }
+
+    /// The first call hands out the whole block and returns its size.
+    /// The next call returns 0 (end of data) and marks the reader as
+    /// finished; the assertion rejects any call after that.
+    long Read(void*& buf) {
+        Y_ASSERT(Bufsize >= 0);
+        if (Bufsize == 0) {
+            Bufsize = -1;
+            return 0;
+        }
+        const long available = Bufsize;
+        Bufsize = 0;
+        buf = Buf;
+        return available;
+    }
+
+protected:
+    long Bufsize;
+    void* Buf;
+};
+
+// Pull-style HTTP response reader: input blocks come from Reader::Read() and
+// are run through the generic parser in reader mode.
+// Eoferr holds the final result once known: 1 = still reading, 0 = body
+// finished, -1 = failed.
+template <class Reader>
+class THttpReader: public THttpParserGeneric<1>, public Reader {
+ typedef THttpParserGeneric<1> TBaseT;
+
+public:
+ using TBaseT::AssumeConnectionClosed;
+ using TBaseT::Header;
+ using TBaseT::ParseGeneric;
+ using TBaseT::State;
+
+ // Binds the reader to header `H`, then either parses the header from the
+ // input (parsHeader != 0) or skips H->header_size bytes of an
+ // already-known header. Returns 0 on success and -1 on failure.
+ int Init(THttpHeader* H, int parsHeader, int assumeConnectionClosed = 0, bool headRequest = false) {
+ Header = H;
+ Eoferr = 1;
+ Size = 0;
+ AssumeConnectionClosed = assumeConnectionClosed;
+ HeadRequest = headRequest;
+ return parsHeader ? ParseHeader() : SkipHeader();
+ }
+
+ // Returns the next non-empty piece of body data through `buf` together
+ // with its length; 0 after the end of the body and -1 on error.
+ long Read(void*& buf) {
+ long Chunk;
+ do {
+ if (!Size) {
+ // Input block exhausted: report the final result if there is
+ // one, otherwise fetch the next block.
+ if (Eoferr != 1)
+ return Eoferr;
+ else if ((Size = (long)Reader::Read(Ptr)) < 0) {
+ Header->error = HTTP_CONNECTION_LOST;
+ return Eoferr = -1;
+ }
+ }
+ Chunk = ParseGeneric(Ptr, Size);
+ buf = Ptr;
+ Ptr = (char*)Ptr + Chunk;
+ Size -= Chunk;
+ if (State == hp_eof) {
+ // Whatever follows the body in this block is dropped.
+ Size = 0;
+ Eoferr = 0;
+ } else if (State == hp_error)
+ return Eoferr = -1;
+ } while (!Chunk);
+ return Chunk;
+ }
+
+protected:
+ // Reads input blocks until the header is parsed completely.
+ int ParseHeader() {
+ HeaderParser.Init(Header);
+ State = hp_in_header;
+ while (State == hp_in_header) {
+ if ((Size = (long)Reader::Read(Ptr)) < 0)
+ return Eoferr = -1;
+ ParseGeneric(Ptr, Size);
+ }
+ if (State == hp_error)
+ return Eoferr = -1;
+ if (State == hp_eof)
+ Eoferr = 0;
+ return 0;
+ }
+
+ // Skips Header->header_size bytes of input and sets the body state from
+ // the header fields that are already filled in.
+ // NOTE(review): with header_size == 0 the loop never runs, so Ptr is
+ // never assigned before the chunked "unget" below — confirm callers
+ // always pass a non-zero header_size.
+ int SkipHeader() {
+ long hdrsize = Header->header_size;
+ while (hdrsize) {
+ if ((Size = (long)Reader::Read(Ptr)) <= 0)
+ return Eoferr = -1;
+ if (Size >= hdrsize) {
+ Size -= hdrsize;
+ Ptr = (char*)Ptr + hdrsize;
+ break;
+ }
+ hdrsize -= Size;
+ }
+ State = Header->transfer_chunked ? hp_begin_chunk_header
+ : Header->content_length == 0 ? hp_eof
+ : Header->content_length > 0 ? hp_read_alive
+ : hp_read_closed;
+ Header->entity_size = 0;
+ if (State == hp_eof)
+ Eoferr = 0;
+ else if (State == hp_begin_chunk_header) {
+ // unget \n for chunk reader
+ Ptr = (char*)Ptr - 1;
+ ++Size;
+ }
+ return 0;
+ }
+
+ // Current position in the input block and the bytes left in it.
+ void* Ptr;
+ long Size;
+ int Eoferr;
+};
diff --git a/library/cpp/http/fetch/httpparser_ut.cpp b/library/cpp/http/fetch/httpparser_ut.cpp
new file mode 100644
index 0000000000..3b3b938e7a
--- /dev/null
+++ b/library/cpp/http/fetch/httpparser_ut.cpp
@@ -0,0 +1,231 @@
+#include "httpparser.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+// Expands to a `case` label that prints the enumerator's own name and returns.
+// Expects a `type` alias and an `out` stream to be in scope where it is used.
+#define ENUM_OUT(arg) \
+ case type ::arg: { \
+ out << #arg; \
+ return; \
+ }
+
+// Stream-output specialization that prints THttpParserBase::States values by
+// name, so that the tests can log the parser state.
+template <>
+void Out<THttpParserBase::States>(IOutputStream& out, THttpParserBase::States st) {
+ using type = THttpParserBase::States;
+ switch (st) {
+ ENUM_OUT(hp_error)
+ ENUM_OUT(hp_eof)
+ ENUM_OUT(hp_in_header)
+ ENUM_OUT(hp_read_alive)
+ ENUM_OUT(hp_read_closed)
+ ENUM_OUT(hp_begin_chunk_header)
+ ENUM_OUT(hp_chunk_header)
+ ENUM_OUT(hp_read_chunk)
+ }
+}
+
+namespace {
+ class TSomethingLikeFakeCheck;
+
+ using TTestHttpParser = THttpParser<TSomethingLikeFakeCheck>;
+
+ // Check policy for the tests: collects every reported body part into
+ // Body_ and logs it together with the parser state.
+ class TSomethingLikeFakeCheck {
+ TString Body_;
+
+ public:
+ // Concatenation of all body parts reported so far.
+ const TString& Body() const {
+ return Body_;
+ }
+
+ // other functions are not really called by THttpParser
+ void CheckDocPart(const void* buf, size_t len, THttpHeader* /* header */) {
+ TString s(static_cast<const char*>(buf), len);
+ // The downcast is valid only because this class is used as a base
+ // of TTestHttpParser.
+ Cout << "State = " << static_cast<TTestHttpParser*>(this)->GetState() << ", CheckDocPart(" << s.Quote() << ")\n";
+ Body_ += s;
+ }
+ };
+
+}
+
+// Tests of THttpParser on plain, chunked and pipelined responses.
+// Several of them pin down current quirks of the parser (marked XXX) rather
+// than the behavior one might expect.
+Y_UNIT_TEST_SUITE(TestHttpParser) {
+ // A request line is not a valid response, so the parser must fail.
+ Y_UNIT_TEST(TestTrivialRequest) {
+ const TString blob{
+ "GET /search?q=hi HTTP/1.1\r\n"
+ "Host: www.google.ru:8080 \r\n"
+ "\r\n"};
+ THttpHeader hdr;
+ THttpParser<> parser;
+ parser.Init(&hdr);
+ parser.Parse((void*)blob.data(), blob.size());
+ UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_error); // can't parse request as response
+ }
+
+ // XXX: `entity_size` is i32 and `content_length` is i64!
+ Y_UNIT_TEST(TestTrivialResponse) {
+ const TString blob{
+ "HTTP/1.1 200 Ok\r\n"
+ "Content-Length: 2\r\n"
+ "\r\n"
+ "OK"};
+ THttpHeader hdr;
+ TTestHttpParser parser;
+ parser.Init(&hdr);
+ parser.Parse((void*)blob.data(), blob.size());
+ UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof);
+ UNIT_ASSERT_EQUAL(parser.Body(), "OK");
+ UNIT_ASSERT_EQUAL(hdr.header_size, strlen(
+ "HTTP/1.1 200 Ok\r\n"
+ "Content-Length: 2\r\n"
+ "\r\n"));
+ UNIT_ASSERT_EQUAL(hdr.entity_size, strlen("OK"));
+ }
+
+ // XXX: `entity_size` is off by one in TE:chunked case.
+ Y_UNIT_TEST(TestChunkedResponse) {
+ const TString blob{
+ "HTTP/1.1 200 OK\r\n"
+ "Transfer-Encoding: chunked\r\n"
+ "\r\n"
+ "2\r\n"
+ "Ok\r\n"
+ "8\r\n"
+ "AllRight\r\n"
+ "0\r\n"
+ "\r\n"};
+ THttpHeader hdr;
+ TTestHttpParser parser;
+ parser.Init(&hdr);
+ parser.Parse((void*)blob.data(), blob.size());
+ UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof);
+ UNIT_ASSERT_EQUAL(parser.Body(), "OkAllRight");
+ UNIT_ASSERT_EQUAL(hdr.header_size, strlen(
+ "HTTP/1.1 200 OK\r\n"
+ "Transfer-Encoding: chunked\r\n"
+ "\r\n"));
+ const int off_by_one_err = -1; // XXX: it really looks so
+ UNIT_ASSERT_EQUAL(hdr.entity_size + off_by_one_err, strlen(
+ "2\r\n"
+ "Ok\r\n"
+ "8\r\n"
+ "AllRight\r\n"
+ "0\r\n"
+ "\r\n"));
+ }
+
+ // Two pipelined responses with Content-Length.
+ static const TString PipelineClenBlob_{
+ "HTTP/1.1 200 Ok\r\n"
+ "Content-Length: 4\r\n"
+ "\r\n"
+ "OK\r\n"
+ "HTTP/1.1 200 Zz\r\n"
+ "Content-Length: 4\r\n"
+ "\r\n"
+ "ZZ\r\n"};
+
+ // Checks common to both Content-Length pipeline tests: only the first
+ // response's header is parsed.
+ void AssertPipelineClen(TTestHttpParser & parser, const THttpHeader& hdr) {
+ UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof);
+ UNIT_ASSERT_EQUAL(4, hdr.content_length);
+ UNIT_ASSERT_EQUAL(hdr.header_size, strlen(
+ "HTTP/1.1 200 Ok\r\n"
+ "Content-Length: 4\r\n"
+ "\r\n"));
+ }
+
+ // Fed one byte at a time, the parser stops exactly at Content-Length.
+ Y_UNIT_TEST(TestPipelineClenByteByByte) {
+ const TString& blob = PipelineClenBlob_;
+ THttpHeader hdr;
+ TTestHttpParser parser;
+ parser.Init(&hdr);
+ for (size_t i = 0; i < blob.size(); ++i) {
+ const TStringBuf d{blob, i, 1};
+ parser.Parse((void*)d.data(), d.size());
+ Cout << TString(d).Quote() << " -> " << parser.GetState() << Endl;
+ }
+ AssertPipelineClen(parser, hdr);
+ UNIT_ASSERT_EQUAL(parser.Body(), "OK\r\n");
+ UNIT_ASSERT_EQUAL(hdr.entity_size, hdr.content_length);
+ }
+
+ // XXX: Content-Length is ignored, Body() looks unexpected!
+ Y_UNIT_TEST(TestPipelineClenOneChunk) {
+ const TString& blob = PipelineClenBlob_;
+ THttpHeader hdr;
+ TTestHttpParser parser;
+ parser.Init(&hdr);
+ parser.Parse((void*)blob.data(), blob.size());
+ AssertPipelineClen(parser, hdr);
+ UNIT_ASSERT_EQUAL(parser.Body(),
+ "OK\r\n"
+ "HTTP/1.1 200 Zz\r\n"
+ "Content-Length: 4\r\n"
+ "\r\n"
+ "ZZ\r\n");
+ UNIT_ASSERT_EQUAL(hdr.entity_size, strlen(
+ "OK\r\n"
+ "HTTP/1.1 200 Zz\r\n"
+ "Content-Length: 4\r\n"
+ "\r\n"
+ "ZZ\r\n"));
+ }
+
+ // Two pipelined chunked responses.
+ static const TString PipelineChunkedBlob_{
+ "HTTP/1.1 200 OK\r\n"
+ "Transfer-Encoding: chunked\r\n"
+ "\r\n"
+ "2\r\n"
+ "Ok\r\n"
+ "8\r\n"
+ "AllRight\r\n"
+ "0\r\n"
+ "\r\n"
+ "HTTP/1.1 200 OK\r\n"
+ "Transfer-Encoding: chunked\r\n"
+ "\r\n"
+ "2\r\n"
+ "Yo\r\n"
+ "8\r\n"
+ "uWin!Iam\r\n"
+ "0\r\n"
+ "\r\n"};
+
+ // Checks common to both chunked pipeline tests: only the first response
+ // is consumed.
+ void AssertPipelineChunked(TTestHttpParser & parser, const THttpHeader& hdr) {
+ UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof);
+ UNIT_ASSERT_EQUAL(parser.Body(), "OkAllRight");
+ UNIT_ASSERT_EQUAL(-1, hdr.content_length);
+ UNIT_ASSERT_EQUAL(hdr.header_size, strlen(
+ "HTTP/1.1 200 OK\r\n"
+ "Transfer-Encoding: chunked\r\n"
+ "\r\n"));
+ const int off_by_one_err = -1;
+ UNIT_ASSERT_EQUAL(hdr.entity_size + off_by_one_err, strlen(
+ "2\r\n"
+ "Ok\r\n"
+ "8\r\n"
+ "AllRight\r\n"
+ "0\r\n"
+ "\r\n"));
+ }
+
+ Y_UNIT_TEST(TestPipelineChunkedByteByByte) {
+ const TString& blob = PipelineChunkedBlob_;
+ THttpHeader hdr;
+ TTestHttpParser parser;
+ parser.Init(&hdr);
+ for (size_t i = 0; i < blob.size(); ++i) {
+ const TStringBuf d{blob, i, 1};
+ parser.Parse((void*)d.data(), d.size());
+ Cout << TString(d).Quote() << " -> " << parser.GetState() << Endl;
+ // The blob is two equal halves; from the last byte of the first
+ // half onwards the parser must stay in hp_eof.
+ if (blob.size() / 2 - 1 <= i) // last \n sets EOF
+ UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof);
+ }
+ AssertPipelineChunked(parser, hdr);
+ }
+
+ Y_UNIT_TEST(TestPipelineChunkedOneChunk) {
+ const TString& blob = PipelineChunkedBlob_;
+ THttpHeader hdr;
+ TTestHttpParser parser;
+ parser.Init(&hdr);
+ parser.Parse((void*)blob.data(), blob.size());
+ AssertPipelineChunked(parser, hdr);
+ }
+}
diff --git a/library/cpp/http/fetch/httpzreader.h b/library/cpp/http/fetch/httpzreader.h
new file mode 100644
index 0000000000..68eb00853d
--- /dev/null
+++ b/library/cpp/http/fetch/httpzreader.h
@@ -0,0 +1,295 @@
+#pragma once
+
+#include "httpheader.h"
+#include "httpparser.h"
+#include "exthttpcodes.h"
+
+#include <util/system/defaults.h>
+#include <util/generic/yexception.h>
+
+#include <contrib/libs/zlib/zlib.h>
+
+#include <errno.h>
+
+#ifndef ENOTSUP
+#define ENOTSUP 45
+#endif
+
+// HTTP body reader layered on THttpReader<Reader> that inflates gzip / raw-deflate
+// encoded content with zlib and enforces an upper bound on the content size.
+//
+// Failures are reported with a negative return value; ZError() then holds an errno-style
+// code and ZMsg() the zlib message, if zlib provided one.
+//
+// NOTE: the object owns a malloc()'ed output buffer and a zlib stream but declares no copy
+// operations, so the implicitly generated ones would free Buf twice. Do not copy instances.
+template <class Reader>
+class TCompressedHttpReader: public THttpReader<Reader> {
+    typedef THttpReader<Reader> TBase;
+
+public:
+    using TBase::AssumeConnectionClosed;
+    using TBase::Header;
+    using TBase::ParseGeneric;
+    using TBase::State;
+
+    static constexpr size_t DefaultBufSize = 64 << 10;
+    static constexpr unsigned int DefaultWinSize = 15;
+
+    TCompressedHttpReader()
+        : CompressedInput(false)
+        , BufSize(0)
+        , CurContSize(0)
+        , MaxContSize(0)
+        , Buf(nullptr)
+        , ZErr(0)
+        , ConnectionClosed(0)
+        , IgnoreTrailingGarbage(true)
+    {
+        memset(&Stream, 0, sizeof(Stream));
+    }
+
+    ~TCompressedHttpReader() {
+        ClearStream();
+
+        if (Buf) {
+            free(Buf);
+            Buf = nullptr;
+        }
+    }
+
+    // Value forwarded to TBase::Init() by the next Init() call.
+    void SetConnectionClosed(int cc) {
+        ConnectionClosed = cc;
+    }
+
+    // When false, input left over after the end of the compressed stream is reported by
+    // Read() as an error instead of being ignored.
+    void SetIgnoreTrailingGarbage(bool ignore) {
+        IgnoreTrailingGarbage = ignore;
+    }
+
+    // Resets the error and size counters, initialises the base reader and then configures
+    // decompression from H->compression_method.
+    //
+    // maxContSize - maximum allowed content size (decompressed size for compressed input);
+    // bufSize     - size of the internal output buffer used for decompression;
+    // winSize     - zlib window bits (adjusted internally for gzip / raw deflate).
+    //
+    // Returns 0 on success, otherwise the non-zero result of TBase::Init() or -1 with
+    // ZError() set.
+    int Init(
+        THttpHeader* H,
+        int parsHeader,
+        const size_t maxContSize = Max<size_t>(),
+        const size_t bufSize = DefaultBufSize,
+        const unsigned int winSize = DefaultWinSize,
+        bool headRequest = false)
+    {
+        ZErr = 0;
+        CurContSize = 0;
+        MaxContSize = maxContSize;
+
+        int ret = TBase::Init(H, parsHeader, ConnectionClosed, headRequest);
+        if (ret)
+            return ret;
+
+        ret = SetCompression(H->compression_method, bufSize, winSize);
+        return ret;
+    }
+
+    // Reads the next portion of content.
+    //
+    // Uncompressed input: returns whatever TBase::Read() returns and accounts the bytes
+    // against the size limit. Compressed input: points buf at the internal buffer and
+    // returns the number of decompressed bytes placed there; a result <= 0 from
+    // TBase::Read() is passed through unchanged.
+    // Returns -1 with ZError() set on a size-limit violation or a zlib failure.
+    long Read(void*& buf) {
+        if (!CompressedInput) {
+            long res = TBase::Read(buf);
+            if (res > 0) {
+                CurContSize += (size_t)res;
+                if (CurContSize > MaxContSize) {
+                    ZErr = E2BIG;
+                    return -1;
+                }
+            }
+            return res;
+        }
+
+        while (true) {
+            // Refill the zlib input only once the previous portion is fully consumed.
+            if (Stream.avail_in == 0) {
+                void* tmpin = Stream.next_in;
+                long res = TBase::Read(tmpin);
+                Stream.next_in = (Bytef*)tmpin;
+                if (res <= 0)
+                    return res;
+                Stream.avail_in = (uInt)res;
+            }
+
+            // Every inflate() call starts writing at the beginning of the buffer.
+            Stream.next_out = Buf;
+            Stream.avail_out = (uInt)BufSize;
+            buf = Buf;
+
+            int err = inflate(&Stream, Z_SYNC_FLUSH);
+
+            //Y_ASSERT(Stream.avail_in == 0);
+
+            switch (err) {
+                case Z_OK:
+                    // there is no data in next_out yet
+                    if (BufSize == Stream.avail_out)
+                        continue;
+                    [[fallthrough]]; // don't break or return; continue with Z_STREAM_END case
+
+                case Z_STREAM_END:
+                    if (Stream.total_out > MaxContSize) {
+                        ZErr = E2BIG;
+                        return -1;
+                    }
+                    // Nothing was produced but input is still pending: trailing garbage.
+                    if (!IgnoreTrailingGarbage && BufSize == Stream.avail_out && Stream.avail_in > 0) {
+                        Header->error = EXT_HTTP_GZIPERROR;
+                        ZErr = EFAULT;
+                        Stream.msg = (char*)"trailing garbage";
+                        return -1;
+                    }
+                    return long(BufSize - Stream.avail_out);
+
+                case Z_NEED_DICT:
+                case Z_DATA_ERROR:
+                    Header->error = EXT_HTTP_GZIPERROR;
+                    ZErr = EFAULT;
+                    return -1;
+
+                case Z_MEM_ERROR:
+                    ZErr = ENOMEM;
+                    return -1;
+
+                default:
+                    ZErr = EINVAL;
+                    return -1;
+            }
+        }
+
+        return -1; // not reached: the loop above only exits via return
+    }
+
+    // Message stored in the zlib stream (Stream.msg); may be null.
+    const char* ZMsg() const {
+        return Stream.msg;
+    }
+
+    // errno-style code of the last failure, 0 if none since the last Init().
+    int ZError() const {
+        return ZErr;
+    }
+
+    // Content size so far: decompressed bytes for compressed input, raw bytes otherwise.
+    size_t GetCurContSize() const {
+        return CompressedInput ? Stream.total_out : CurContSize;
+    }
+
+protected:
+    // Configures the reader for the given HTTP_COMPRESSION value: allocates the output
+    // buffer and initialises the zlib stream for gzip / deflate, or switches to
+    // pass-through mode for unset / identity.
+    // Returns 0 on success and -1 with ZErr set otherwise. After a failure the reader is
+    // left in pass-through mode and never records a buffer size it does not own.
+    int SetCompression(const int compression, const size_t bufSize,
+                       const unsigned int winSize) {
+        // Releases a previously initialised inflate state and leaves CompressedInput
+        // false; it is switched on again only after inflateInit2() succeeds below.
+        ClearStream();
+
+        int winsize = winSize;
+        switch ((enum HTTP_COMPRESSION)compression) {
+            case HTTP_COMPRESSION_UNSET:
+            case HTTP_COMPRESSION_IDENTITY:
+                return 0;
+            case HTTP_COMPRESSION_GZIP:
+                winsize += 16; // 16 indicates gzip, see zlib.h
+                break;
+            case HTTP_COMPRESSION_DEFLATE:
+                winsize = -winsize; // negative indicates raw deflate stream, see zlib.h
+                break;
+            case HTTP_COMPRESSION_COMPRESS:
+            case HTTP_COMPRESSION_ERROR:
+            default:
+                ZErr = ENOTSUP;
+                return -1;
+        }
+
+        if (bufSize != BufSize) {
+            free(Buf); // free(nullptr) is a no-op
+            // Forget the old buffer before allocating: if malloc() fails, a stale BufSize
+            // would let a later call with the same size skip the allocation and make
+            // inflate() write through a null pointer.
+            Buf = nullptr;
+            BufSize = 0;
+
+            Buf = (ui8*)malloc(bufSize);
+            if (!Buf) {
+                ZErr = ENOMEM;
+                return -1;
+            }
+            BufSize = bufSize;
+        }
+
+        // zlib requires next_in / avail_in to be initialised before inflateInit2().
+        Stream.next_in = Z_NULL;
+        Stream.avail_in = 0;
+
+        int err = inflateInit2(&Stream, winsize);
+        switch (err) {
+            case Z_OK:
+                Stream.total_in = 0;
+                Stream.total_out = 0;
+                Stream.avail_in = 0;
+                CompressedInput = true;
+                return 0;
+
+            case Z_DATA_ERROR: // never happens, see zlib.h
+                ZErr = EFAULT;
+                return -1;
+
+            case Z_MEM_ERROR:
+                ZErr = ENOMEM;
+                return -1;
+
+            default:
+                ZErr = EINVAL;
+                return -1;
+        }
+    }
+
+    // Releases the inflate state if one is active and switches back to pass-through mode.
+    void ClearStream() {
+        if (CompressedInput) {
+            inflateEnd(&Stream);
+            CompressedInput = false;
+        }
+    }
+
+    z_stream Stream;
+    bool CompressedInput; // true only while Stream holds an initialised inflate state
+    size_t BufSize;       // size of the allocation behind Buf, 0 when Buf is null
+    size_t CurContSize, MaxContSize;
+    ui8* Buf;             // malloc()'ed output buffer for decompressed data
+    int ZErr;
+    int ConnectionClosed;
+    bool IgnoreTrailingGarbage;
+};
+
+// Exception type thrown by SCompressedHttpReader::HandleRetValue() for the EFAULT and
+// E2BIG error codes; the other error codes are reported with a plain yexception.
+class zlib_exception: public yexception {
+};
+
+// Throwing flavour of TCompressedHttpReader: Init() and Read() forward to the base class
+// and every non-zero ZError() is converted into an exception.
+template <class Reader>
+class SCompressedHttpReader: public TCompressedHttpReader<Reader> {
+    using TBase = TCompressedHttpReader<Reader>;
+
+public:
+    using TBase::ZError;
+    using TBase::ZMsg;
+
+    SCompressedHttpReader() {
+    }
+
+    // Same parameters as TCompressedHttpReader::Init(); throws instead of leaving an error
+    // code in ZError().
+    int Init(
+        THttpHeader* H,
+        int parsHeader,
+        const size_t maxContSize = Max<size_t>(),
+        const size_t bufSize = TBase::DefaultBufSize,
+        const unsigned int winSize = TBase::DefaultWinSize,
+        bool headRequest = false)
+    {
+        const long status = TBase::Init(H, parsHeader, maxContSize, bufSize, winSize, headRequest);
+        return static_cast<int>(HandleRetValue(status));
+    }
+
+    // Same contract as TCompressedHttpReader::Read(), with errors turned into exceptions.
+    long Read(void*& buf) {
+        return HandleRetValue(TBase::Read(buf));
+    }
+
+protected:
+    // Returns ret unchanged while ZError() is 0; otherwise throws the exception that
+    // corresponds to the stored error code.
+    long HandleRetValue(long ret) {
+        const int code = ZError();
+        if (code == 0) {
+            return ret;
+        }
+        if (code == ENOMEM) {
+            ythrow yexception() << "SCompressedHttpReader: not enough memory";
+        }
+        if (code == EINVAL) {
+            ythrow yexception() << "SCompressedHttpReader: zlib error: " << ZMsg();
+        }
+        if (code == ENOTSUP) {
+            ythrow yexception() << "SCompressedHttpReader: unsupported compression method";
+        }
+        if (code == EFAULT) {
+            ythrow zlib_exception() << "SCompressedHttpReader: " << ZMsg();
+        }
+        if (code == E2BIG) {
+            ythrow zlib_exception() << "SCompressedHttpReader: Content exceeds maximum length";
+        }
+        ythrow yexception() << "SCompressedHttpReader: unknown error";
+    }
+};
diff --git a/library/cpp/http/fetch/library-htfetch_ut_hreflang_in.h b/library/cpp/http/fetch/library-htfetch_ut_hreflang_in.h
new file mode 100644
index 0000000000..0df89bdc79
--- /dev/null
+++ b/library/cpp/http/fetch/library-htfetch_ut_hreflang_in.h
@@ -0,0 +1,155 @@
+#pragma once
+
+char hreflang_ut_in[] = "HTTP/1.1 200 OK\n"
+ "Date: Thu, 15 Nov 2012 22:38:28 GMT\n"
+ "Server: Apache/2\n"
+ "X-Powered-By: PHP/5.2.17\n"
+ "Set-Cookie: PHPSESSID=6d69474d1cc019d7d82714c9472bc6d6; path=/\n"
+ "Expires: Thu, 19 Nov 1981 08:52:00 GMT\n"
+ "Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0\n"
+ "Pragma: no-cache\n"
+ "Link: <http://www.forexticket.cn.com/zh/currency/converter-EEK-XAG>; rel='alternate'; hreflang='zh-CN'\n"
+ "Link: <http://www.forexticket.tw/zh/currency/converter-EEK-XAG>; rel='alternate'; hreflang='zh-TW'\n"
+ "Link: <http://www.forexticket.hk/zh/currency/converter-EEK-XAG>; rel='alternate'; hreflang='zh-HK'\n"
+ "Link: <http://www.forexticket.sg/zh/currency/converter-EEK-XAG>; rel='alternate'; hreflang='zh-SG'\n"
+ "Link: <http://www.forexticket.in/hi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='hi-IN'\n"
+ "Link: <http://www.forexticket.com.fj/hi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='hi-FJ'\n"
+ "Link: <http://www.forexticket.in/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-IN'\n"
+ "Link: <http://www.forexticket.us/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-US'\n"
+ "Link: <http://www.forexticket.com.pk/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-PK'\n"
+ "Link: <http://www.forexticket-bd.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-BD'\n"
+ "Link: <http://www.forexticket-ng.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-NG'\n"
+ "Link: <http://www.forexticket.co.uk/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-GB'\n"
+ "Link: <http://www.forexticket.co.za/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-ZA'\n"
+ "Link: <http://www.forexticket.co.ke/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-KE'\n"
+ "Link: <http://www.forexticket.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-CA'\n"
+ "Link: <http://www.forexticket-gh.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-GH'\n"
+ "Link: <http://www.forexticket.biz/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-AU'\n"
+ "Link: <http://www.forexticket.cm/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-CM'\n"
+ "Link: <http://www.forexticket-kh.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-KH'\n"
+ "Link: <http://www.forexticket.hk/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-HK'\n"
+ "Link: <http://www.forexticket.la/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-LA'\n"
+ "Link: <http://www.forexticket.sg/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-SG'\n"
+ "Link: <http://www.forexticket.co.nz/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-NZ'\n"
+ "Link: <http://www.forexticket.com.pr/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-PR'\n"
+ "Link: <http://www.forexticket.com.fj/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-FJ'\n"
+ "Link: <http://www.forexticket.us/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-US'\n"
+ "Link: <http://www.forexticket.mx/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-MX'\n"
+ "Link: <http://www.forexticket.co/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-CO'\n"
+ "Link: <http://www.forexticket.com.ar/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-AR'\n"
+ "Link: <http://www.forexticket-pe.com/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-PE'\n"
+ "Link: <http://www.forexticket.co.ve/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-VE'\n"
+ "Link: <http://www.forexticket.cl/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-CL'\n"
+ "Link: <http://www.forexticket.ec/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-EC'\n"
+ "Link: <http://www.forexticket.com.gt/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-GT'\n"
+ "Link: <http://www.forexticket.bo/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-BO'\n"
+ "Link: <http://www.forexticket.hn/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-HN'\n"
+ "Link: <http://www.forexticket.com.py/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-PY'\n"
+ "Link: <http://www.forexticket.es/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-ES'\n"
+ "Link: <http://www.forexticket.com.sv/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-SV'\n"
+ "Link: <http://www.forexticket.com.ni/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-NI'\n"
+ "Link: <http://www.forexticket.co.cr/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-CR'\n"
+ "Link: <http://www.forexticket.com.pr/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-PR'\n"
+ "Link: <http://www.forexticket.com.uy/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-UY'\n"
+ "Link: <http://www.forexticket.com.pa/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-PA'\n"
+ "Link: <http://www.forexticket.asia.com/id/currency/converter-EEK-XAG>; rel='alternate'; hreflang='id-ID'\n"
+ "Link: <http://www.forexticket.com.br/pt/moeda/conversor-EEK-XAG>; rel='alternate'; hreflang='pt-BR'\n"
+ "Link: <http://www.forexticket-mz.com/pt/moeda/conversor-EEK-XAG>; rel='alternate'; hreflang='pt-MZ'\n"
+ "Link: <http://www.forexticket.com.pt/pt/moeda/conversor-EEK-XAG>; rel='alternate'; hreflang='pt-PT'\n"
+ "Link: <http://www.forexticket.tl/pt/moeda/conversor-EEK-XAG>; rel='alternate'; hreflang='pt-TL'\n"
+ "Link: <http://www.forexticket.ru/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-RU'\n"
+ "Link: <http://www.forexticket-kz.com/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-KZ'\n"
+ "Link: <http://www.forexticket-tj.com/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-TJ'\n"
+ "Link: <http://www.forexticket-kg.com/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-KG'\n"
+ "Link: <http://www.forexticket-ge.com/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-GE'\n"
+ "Link: <http://www.forexticket.mn/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-MN'\n"
+ "Link: <http://www.forexticket.jp/ja/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ja-JP'\n"
+ "Link: <http://www.forexticket-ph.com/tl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='tl-PH'\n"
+ "Link: <http://www.forexticket.vn/vi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='vi-VN'\n"
+ "Link: <http://www.forexticket.de/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-DE'\n"
+ "Link: <http://www.forexticket.be/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-BE'\n"
+ "Link: <http://www.forexticket.at/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-AT'\n"
+ "Link: <http://www.forexticket.ch/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-CH'\n"
+ "Link: <http://www.forexticket.lu/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-LU'\n"
+ "Link: <http://www.forexticket.li/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-LI'\n"
+ "Link: <http://www.forexticket.de/de/waehrungsumrechner/devisen-EEK-XAG>; rel='canonical'\n"
+ "Link: <http://www.forexticket-eg.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-EG'\n"
+ "Link: <http://www.forexticket-dz.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-DZ'\n"
+ "Link: <http://www.forexticket-ma.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-MA'\n"
+ "Link: <http://www.forexticket-iq.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-IQ'\n"
+ "Link: <http://www.forexticket-sa.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-SA'\n"
+ "Link: <http://www.forexticket-sy.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-SY'\n"
+ "Link: <http://www.forexticket-tn.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-TN'\n"
+ "Link: <http://www.forexticket-td.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-TD'\n"
+ "Link: <http://www.forexticket-so.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-SO'\n"
+ "Link: <http://www.forexticket.co.il/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-IL'\n"
+ "Link: <http://www.forexticket-jo.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-JO'\n"
+ "Link: <http://www.forexticket.ae/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-AE'\n"
+ "Link: <http://www.forexticket-lb.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-LB'\n"
+ "Link: <http://www.forexticket-om.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-OM'\n"
+ "Link: <http://www.forexticket-kw.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-KW'\n"
+ "Link: <http://www.forexticket-tr.com/tr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='tr-TR'\n"
+ "Link: <http://www.forexticket-bg.com/tr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='tr-BG'\n"
+ "Link: <http://www.forexticket-cy.com/tr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='tr-CY'\n"
+ "Link: <http://www.forexticket.ir/fa/currency/converter-EEK-XAG>; rel='alternate'; hreflang='fa-IR'\n"
+ "Link: <http://www.forexticket.af/fa/currency/converter-EEK-XAG>; rel='alternate'; hreflang='fa-AF'\n"
+ "Link: <http://www.forexticket.cd/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-CD'\n"
+ "Link: <http://www.forexticket.fr/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-FR'\n"
+ "Link: <http://www.forexticket.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-CA'\n"
+ "Link: <http://www.forexticket.mg/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-MG'\n"
+ "Link: <http://www.forexticket.cm/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-CM'\n"
+ "Link: <http://www.forexticket-kh.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-KH'\n"
+ "Link: <http://www.forexticket-ml.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-ML'\n"
+ "Link: <http://www.forexticket-sn.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-SN'\n"
+ "Link: <http://www.forexticket-tn.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-TN'\n"
+ "Link: <http://www.forexticket-td.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-TD'\n"
+ "Link: <http://www.forexticket.be/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-BE'\n"
+ "Link: <http://www.forexticket-gn.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-GN'\n"
+ "Link: <http://www.forexticket.ht/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-HT'\n"
+ "Link: <http://www.forexticket.ch/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-CH'\n"
+ "Link: <http://www.forexticket.la/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-LA'\n"
+ "Link: <http://www.forexticket.lu/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-LU'\n"
+ "Link: <http://www.forexticket-th.com/th/currency/converter-EEK-XAG>; rel='alternate'; hreflang='th-TH'\n"
+ "Link: <http://www.forexticket.co.uk/cy/currency/converter-EEK-XAG>; rel='alternate'; hreflang='cy-GB'\n"
+ "Link: <http://www.forexticket.co.uk/ga/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ga-GB'\n"
+ "Link: <http://www.forexticket.it/it/convertitore/valuta-EEK-XAG>; rel='alternate'; hreflang='it-IT'\n"
+ "Link: <http://www.forexticket.ch/it/convertitore/valuta-EEK-XAG>; rel='alternate'; hreflang='it-CH'\n"
+ "Link: <http://www.forexticket.co.za/af/currency/converter-EEK-XAG>; rel='alternate'; hreflang='af-ZA'\n"
+ "Link: <http://www.forexticket.kr/ko/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ko-KR'\n"
+ "Link: <http://www.forexticket-ua.com/uk/currency/converter-EEK-XAG>; rel='alternate'; hreflang='uk-UA'\n"
+ "Link: <http://www.forexticket-tz.com/sw/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sw-TZ'\n"
+ "Link: <http://www.forexticket.co.ke/sw/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sw-KE'\n"
+ "Link: <http://www.forexticket.pl/pl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='pl-PL'\n"
+ "Link: <http://www.forexticket.com.my/ms/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ms-MY'\n"
+ "Link: <http://www.forexticket.sg/ms/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ms-SG'\n"
+ "Link: <http://www.forexticket.ro/ro/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ro-RO'\n"
+ "Link: <http://www.forexticket.nl/nl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='nl-NL'\n"
+ "Link: <http://www.forexticket.be/nl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='nl-BE'\n"
+ "Link: <http://www.forexticket.gr/el/currency/converter-EEK-XAG>; rel='alternate'; hreflang='el-GR'\n"
+ "Link: <http://www.forexticket-al.com/el/currency/converter-EEK-XAG>; rel='alternate'; hreflang='el-AL'\n"
+ "Link: <http://www.forexticket-cy.com/el/currency/converter-EEK-XAG>; rel='alternate'; hreflang='el-CY'\n"
+ "Link: <http://www.forexticket.cz/cs/currency/converter-EEK-XAG>; rel='alternate'; hreflang='cs-CZ'\n"
+ "Link: <http://www.forexticket.hu/hu/currency/converter-EEK-XAG>; rel='alternate'; hreflang='hu-HU'\n"
+ "Link: <http://www.forexticket.se/sv/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sv-SE'\n"
+ "Link: <http://www.forexticket.eu/sv/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sv-FI'\n"
+ "Link: <http://www.forexticket.co.il/iw/currency/converter-EEK-XAG>; rel='alternate'; hreflang='iw-IL'\n"
+ "Link: <http://www.forexticket.co.il/yi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='yi-IL'\n"
+ "Link: <http://www.forexticket-bg.com/bg/currency/converter-EEK-XAG>; rel='alternate'; hreflang='bg-BG'\n"
+ "Link: <http://www.forexticket.es/ca/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ca-ES'\n"
+ "Link: <http://www.forexticket.es/gl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='gl-ES'\n"
+ "Link: <http://www.forexticket.dk/da/currency/converter-EEK-XAG>; rel='alternate'; hreflang='da-DK'\n"
+ "Link: <http://www.forexticket.eu/fi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='fi-FI'\n"
+ "Link: <http://www.forexticket-hr.com/hr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='hr-HR'\n"
+ "Link: <http://www.forexticket-hr.com/sr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sr-HR'\n"
+ "Link: <http://www.forexticket.me/sr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sr-ME'\n"
+ "Link: <http://www.forexticket.lt/lt/currency/converter-EEK-XAG>; rel='alternate'; hreflang='lt-LT'\n"
+ "Link: <http://www.forexticket-al.com/sq/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sq-AL'\n"
+ "Link: <http://www.forexticket.lv/lv/currency/converter-EEK-XAG>; rel='alternate'; hreflang='lv-LV'\n"
+ "Link: <http://www.forexticket.co.ee/et/currency/converter-EEK-XAG>; rel='alternate'; hreflang='et-EE'\n"
+ "Vary: Accept-Encoding,User-Agent\n"
+ "Content-Encoding: gzip\n"
+ "Keep-Alive: timeout=1, max=100\n"
+ "Connection: Keep-Alive\n"
+ "Transfer-Encoding: chunked\n"
+ "Content-Type: text/html\n"
+ "\n";
diff --git a/library/cpp/http/fetch/library-htfetch_ut_hreflang_out.h b/library/cpp/http/fetch/library-htfetch_ut_hreflang_out.h
new file mode 100644
index 0000000000..bef8bacff5
--- /dev/null
+++ b/library/cpp/http/fetch/library-htfetch_ut_hreflang_out.h
@@ -0,0 +1,3 @@
+#pragma once
+
+char hreflang_ut_out[] = "zh-CN http://www.forexticket.cn.com/zh/currency/converter-EEK-XAG\tzh-TW http://www.forexticket.tw/zh/currency/converter-EEK-XAG\tzh-HK http://www.forexticket.hk/zh/currency/converter-EEK-XAG\tzh-SG http://www.forexticket.sg/zh/currency/converter-EEK-XAG\thi-IN http://www.forexticket.in/hi/currency/converter-EEK-XAG\thi-FJ http://www.forexticket.com.fj/hi/currency/converter-EEK-XAG\ten-IN http://www.forexticket.in/en/currency/converter-EEK-XAG\ten-US http://www.forexticket.us/en/currency/converter-EEK-XAG\ten-PK http://www.forexticket.com.pk/en/currency/converter-EEK-XAG\ten-BD http://www.forexticket-bd.com/en/currency/converter-EEK-XAG\ten-NG http://www.forexticket-ng.com/en/currency/converter-EEK-XAG\ten-GB http://www.forexticket.co.uk/en/currency/converter-EEK-XAG\ten-ZA http://www.forexticket.co.za/en/currency/converter-EEK-XAG\ten-KE http://www.forexticket.co.ke/en/currency/converter-EEK-XAG\ten-CA http://www.forexticket.com/en/currency/converter-EEK-XAG\ten-GH http://www.forexticket-gh.com/en/currency/converter-EEK-XAG\ten-AU http://www.forexticket.biz/en/currency/converter-EEK-XAG\ten-CM http://www.forexticket.cm/en/currency/converter-EEK-XAG\ten-KH http://www.forexticket-kh.com/en/currency/converter-EEK-XAG\ten-HK http://www.forexticket.hk/en/currency/converter-EEK-XAG\ten-LA http://www.forexticket.la/en/currency/converter-EEK-XAG\ten-SG http://www.forexticket.sg/en/currency/converter-EEK-XAG\ten-NZ http://www.forexticket.co.nz/en/currency/converter-EEK-XAG\ten-PR http://www.forexticket.com.pr/en/currency/converter-EEK-XAG\ten-FJ http://www.forexticket.com.fj/en/currency/converter-EEK-XAG\tes-US http://www.forexticket.us/es/cambio/divisas-EEK-XAG\tes-MX http://www.forexticket.mx/es/cambio/divisas-EEK-XAG\tes-CO http://www.forexticket.co/es/cambio/divisas-EEK-XAG\tes-AR http://www.forexticket.com.ar/es/cambio/divisas-EEK-XAG\tes-PE http://www.forexticket-pe.com/es/cambio/divisas-EEK-XAG\tes-VE 
http://www.forexticket.co.ve/es/cambio/divisas-EEK-XAG\tes-CL http://www.forexticket.cl/es/cambio/divisas-EEK-XAG\tes-EC http://www.forexticket.ec/es/cambio/divisas-EEK-XAG\tes-GT http://www.forexticket.com.gt/es/cambio/divisas-EEK-XAG\tes-BO http://www.forexticket.bo/es/cambio/divisas-EEK-XAG\tes-HN http://www.forexticket.hn/es/cambio/divisas-EEK-XAG\tes-PY http://www.forexticket.com.py/es/cambio/divisas-EEK-XAG\tes-ES http://www.forexticket.es/es/cambio/divisas-EEK-XAG\tes-SV http://www.forexticket.com.sv/es/cambio/divisas-EEK-XAG\tes-NI http://www.forexticket.com.ni/es/cambio/divisas-EEK-XAG\tes-CR http://www.forexticket.co.cr/es/cambio/divisas-EEK-XAG\tes-PR http://www.forexticket.com.pr/es/cambio/divisas-EEK-XAG\tes-UY http://www.forexticket.com.uy/es/cambio/divisas-EEK-XAG\tes-PA http://www.forexticket.com.pa/es/cambio/divisas-EEK-XAG\tid-ID http://www.forexticket.asia.com/id/currency/converter-EEK-XAG\tpt-BR http://www.forexticket.com.br/pt/moeda/conversor-EEK-XAG\tpt-MZ http://www.forexticket-mz.com/pt/moeda/conversor-EEK-XAG\tpt-PT http://www.forexticket.com.pt/pt/moeda/conversor-EEK-XAG\tpt-TL http://www.forexticket.tl/pt/moeda/conversor-EEK-XAG\tru-RU http://www.forexticket.ru/ru/currency/converter-EEK-XAG\tru-KZ http://www.forexticket-kz.com/ru/currency/converter-EEK-XAG\tru-TJ http://www.forexticket-tj.com/ru/currency/converter-EEK-XAG\tru-KG http://www.forexticket-kg.com/ru/currency/converter-EEK-XAG\tru-GE http://www.forexticket-ge.com/ru/currency/converter-EEK-XAG\tru-MN http://www.forexticket.mn/ru/currency/converter-EEK-XAG\tja-JP http://www.forexticket.jp/ja/currency/converter-EEK-XAG\ttl-PH http://www.forexticket-ph.com/tl/currency/converter-EEK-XAG\tvi-VN http://www.forexticket.vn/vi/currency/converter-EEK-XAG\tde-DE http://www.forexticket.de/de/waehrungsumrechner/devisen-EEK-XAG\tde-BE http://www.forexticket.be/de/waehrungsumrechner/devisen-EEK-XAG\tde-AT http://www.forexticket.at/de/waehrungsumrechner/devisen-EEK-XAG\tde-CH 
http://www.forexticket.ch/de/waehrungsumrechner/devisen-EEK-XAG\tde-LU http://www.forexticket.lu/de/waehrungsumrechner/devisen-EEK-XAG\tde-LI http://www.forexticket.li/de/waehrungsumrechner/devisen-EEK-XAG\tar-EG http://www.forexticket-eg.com/ar/currency/converter-EEK-XAG\tar-DZ http://www.forexticket-dz.com/ar/currency/converter-EEK-XAG\tar-MA http://www.forexticket-ma.com/ar/currency/converter-EEK-XAG\tar-IQ http://www.forexticket-iq.com/ar/currency/converter-EEK-XAG\tar-SA http://www.forexticket-sa.com/ar/currency/converter-EEK-XAG\tar-SY http://www.forexticket-sy.com/ar/currency/converter-EEK-XAG\tar-TN http://www.forexticket-tn.com/ar/currency/converter-EEK-XAG\tar-TD http://www.forexticket-td.com/ar/currency/converter-EEK-XAG\tar-SO http://www.forexticket-so.com/ar/currency/converter-EEK-XAG\tar-IL http://www.forexticket.co.il/ar/currency/converter-EEK-XAG\tar-JO http://www.forexticket-jo.com/ar/currency/converter-EEK-XAG\tar-AE http://www.forexticket.ae/ar/currency/converter-EEK-XAG\tar-LB http://www.forexticket-lb.com/ar/currency/converter-EEK-XAG\tar-OM http://www.forexticket-om.com/ar/currency/converter-EEK-XAG\tar-KW http://www.forexticket-kw.com/ar/currency/converter-EEK-XAG\ttr-TR http://www.forexticket-tr.com/tr/currency/converter-EEK-XAG\ttr-BG http://www.forexticket-bg.com/tr/currency/converter-EEK-XAG\ttr-CY http://www.forexticket-cy.com/tr/currency/converter-EEK-XAG\tfa-IR http://www.forexticket.ir/fa/currency/converter-EEK-XAG\tfa-AF http://www.forexticket.af/fa/currency/converter-EEK-XAG\tfr-CD http://www.forexticket.cd/fr/conversion/monnaie-EEK-XAG\tfr-FR http://www.forexticket.fr/fr/conversion/monnaie-EEK-XAG\tfr-CA http://www.forexticket.com/fr/conversion/monnaie-EEK-XAG\tfr-MG http://www.forexticket.mg/fr/conversion/monnaie-EEK-XAG\tfr-CM http://www.forexticket.cm/fr/conversion/monnaie-EEK-XAG\tfr-KH http://www.forexticket-kh.com/fr/conversion/monnaie-EEK-XAG\tfr-ML http://www.forexticket-ml.com/fr/conversion/monnaie-EEK-XAG\tfr-SN 
http://www.forexticket-sn.com/fr/conversion/monnaie-EEK-XAG\tfr-TN http://www.forexticket-tn.com/fr/conversion/monnaie-EEK-XAG\tfr-TD http://www.forexticket-td.com/fr/conversion/monnaie-EEK-XAG\tfr-BE http://www.forexticket.be/fr/conversion/monnaie-EEK-XAG\tfr-GN http://www.forexticket-gn.com/fr/conversion/monnaie-EEK-XAG\tfr-HT http://www.forexticket.ht/fr/conversion/monnaie-EEK-XAG\tfr-CH http://www.forexticket.ch/fr/conversion/monnaie-EEK-XAG\tfr-LA http://www.forexticket.la/fr/conversion/monnaie-EEK-XAG\tfr-LU http://www.forexticket.lu/fr/conversion/monnaie-EEK-XAG\tth-TH http://www.forexticket-th.com/th/currency/converter-EEK-XAG\tcy-GB http://www.forexticket.co.uk/cy/currency/converter-EEK-XAG\tga-GB http://www.forexticket.co.uk/ga/currency/converter-EEK-XAG\tit-IT http://www.forexticket.it/it/convertitore/valuta-EEK-XAG\tit-CH http://www.forexticket.ch/it/convertitore/valuta-EEK-XAG\taf-ZA http://www.forexticket.co.za/af/currency/converter-EEK-XAG\tko-KR http://www.forexticket.kr/ko/currency/converter-EEK-XAG\tuk-UA http://www.forexticket-ua.com/uk/currency/converter-EEK-XAG\tsw-TZ http://www.forexticket-tz.com/sw/currency/converter-EEK-XAG\tsw-KE http://www.forexticket.co.ke/sw/currency/converter-EEK-XAG\tpl-PL http://www.forexticket.pl/pl/currency/converter-EEK-XAG\tms-MY http://www.forexticket.com.my/ms/currency/converter-EEK-XAG\tms-SG http://www.forexticket.sg/ms/currency/converter-EEK-XAG\tro-RO http://www.forexticket.ro/ro/currency/converter-EEK-XAG\tnl-NL http://www.forexticket.nl/nl/currency/converter-EEK-XAG\tnl-BE http://www.forexticket.be/nl/currency/converter-EEK-XAG\tel-GR http://www.forexticket.gr/el/currency/converter-EEK-XAG\tel-AL http://www.forexticket-al.com/el/currency/converter-EEK-XAG\tel-CY http://www.forexticket-cy.com/el/currency/converter-EEK-XAG\tcs-CZ http://www.forexticket.cz/cs/currency/converter-EEK-XAG\thu-HU http://www.forexticket.hu/hu/currency/converter-EEK-XAG\tsv-SE 
http://www.forexticket.se/sv/currency/converter-EEK-XAG\tsv-FI http://www.forexticket.eu/sv/currency/converter-EEK-XAG\tiw-IL http://www.forexticket.co.il/iw/currency/converter-EEK-XAG\tyi-IL http://www.forexticket.co.il/yi/currency/converter-EEK-XAG\tbg-BG http://www.forexticket-bg.com/bg/currency/converter-EEK-XAG\tca-ES http://www.forexticket.es/ca/currency/converter-EEK-XAG\tgl-ES http://www.forexticket.es/gl/currency/converter-EEK-XAG\tda-DK http://www.forexticket.dk/da/currency/converter-EEK-XAG\tfi-FI http://www.forexticket.eu/fi/currency/converter-EEK-XAG\thr-HR http://www.forexticket-hr.com/hr/currency/converter-EEK-XAG\tsr-HR http://www.forexticket-hr.com/sr/currency/converter-EEK-XAG\tsr-ME http://www.forexticket.me/sr/currency/converter-EEK-XAG\tlt-LT http://www.forexticket.lt/lt/currency/converter-EEK-XAG\tsq-AL http://www.forexticket-al.com/sq/currency/converter-EEK-XAG\tlv-LV http://www.forexticket.lv/lv/currency/converter-EEK-XAG\tet-EE http://www.forexticket.co.ee/et/currency/converter-EEK-XAG";
diff --git a/library/cpp/http/fetch/sockhandler.h b/library/cpp/http/fetch/sockhandler.h
new file mode 100644
index 0000000000..e18149f657
--- /dev/null
+++ b/library/cpp/http/fetch/sockhandler.h
@@ -0,0 +1,130 @@
+#pragma once
+
+#include <library/cpp/logger/all.h>
+
+#include <util/generic/buffer.h>
+#include <util/generic/map.h>
+#include <util/generic/vector.h>
+#include <util/network/address.h>
+#include <util/network/ip.h>
+#include <util/network/socket.h>
+#include <util/system/mutex.h>
+#include <util/system/yassert.h>
+
+#include <cerrno>
+#include <util/generic/noncopyable.h>
+
+// A list of remote addresses: a TVector of NAddr::IRemoteAddrRef with helpers
+// for building a one-element IPv4 list and for reading the first IPv4 entry
+// back as an (ip, port) pair.
+class TAddrList: public TVector<NAddr::IRemoteAddrRef> {
+private:
+    using TBase = TVector<NAddr::IRemoteAddrRef>;
+
+public:
+    // The base class constructors are forwarded by hand because
+    // msvc doesn't support base class constructor inheritance.
+    TAddrList() = default;
+
+    // Builds the list from an explicit set of address references.
+    TAddrList(std::initializer_list<NAddr::IRemoteAddrRef> list)
+        : TBase(list)
+    {
+    }
+
+    // Forwards a single argument to the matching TVector constructor.
+    template <typename TArg>
+    TAddrList(TArg&& arg)
+        : TBase(std::forward<TArg>(arg))
+    {
+    }
+
+    // Forwards two arguments to the matching TVector constructor.
+    template <typename TFirst, typename TSecond>
+    TAddrList(TFirst&& arg1, TSecond&& arg2)
+        : TBase(std::forward<TFirst>(arg1), std::forward<TSecond>(arg2))
+    {
+    }
+
+    // Creates a list with exactly one IPv4 entry. `ip` and `port` are passed
+    // through htonl()/htons() before being handed to TIpAddress.
+    static TAddrList MakeV4Addr(ui32 ip, TIpPort port) {
+        const TIpAddress endpoint(htonl(ip), htons(port));
+        return TAddrList({new NAddr::TIPv4Addr(endpoint)});
+    }
+
+    // Returns (ip, port) of the first AF_INET entry, each converted with
+    // ntohl()/ntohs(); returns (0, 0) when the list holds no AF_INET entry.
+    std::pair<ui32, TIpPort> GetV4Addr() const {
+        for (auto it = this->begin(); it != this->end(); ++it) {
+            const sockaddr* raw = (*it)->Addr();
+            if (raw->sa_family != AF_INET) {
+                continue;
+            }
+            const sockaddr_in* v4 = reinterpret_cast<const sockaddr_in*>(raw);
+            return std::make_pair(ntohl(v4->sin_addr.s_addr), ntohs(v4->sin_port));
+        }
+        return std::make_pair(0, 0);
+    }
+};
+
+// Thin wrapper around an owned TSocket: connects a SOCK_STREAM socket to the
+// first reachable address of a TAddrList and exposes small send/peek/read/
+// shutdown helpers on top of it.
+class TSimpleSocketHandler {
+public:
+    TSimpleSocketHandler() = default;
+
+    // Returns non-zero when a socket is currently held, zero otherwise.
+    int Good() const {
+        return static_cast<bool>(Socket);
+    }
+
+    // Tries the addresses in `addrs` in order and keeps the first one that
+    // connects.
+    //
+    // On success the new socket gets `timeout` as its socket timeout, zero
+    // linger and keep-alive enabled, replaces any previously held socket, and
+    // 0 is returned.
+    //
+    // On failure a non-zero errno-style code is returned: the error of the
+    // last failed socket()/connect() call, or EBADF when none was recorded
+    // (e.g. `addrs` is empty) or an exception was thrown. In that case the
+    // handler's socket is left exactly as it was before the call, so Good()
+    // never reports a half-configured socket.
+    int Connect(const TAddrList& addrs, TDuration timeout) {
+        int lastError = EBADF;
+        try {
+            for (const auto& item : addrs) {
+                const sockaddr* sa = item->Addr();
+                TSocketHolder s(socket(sa->sa_family, SOCK_STREAM, 0));
+                if (s.Closed()) {
+                    if (errno) {
+                        lastError = errno;
+                    }
+                    continue;
+                }
+
+#ifndef WIN32
+                // Close-on-exec: do not let the descriptor be inherited across exec().
+                // A failure here aborts the whole attempt instead of trying the next address.
+                if (fcntl(s, F_SETFD, FD_CLOEXEC))
+                    return errno ? errno : EBADF;
+#endif
+                if (connect(s, sa, item->Len())) {
+                    // Remember the reason before Close() gets a chance to touch errno.
+                    if (errno) {
+                        lastError = errno;
+                    }
+                    s.Close();
+                    continue;
+                }
+
+                // Configure the socket in a local holder first: if any setter throws,
+                // the member is not touched and the new socket is destroyed with the holder.
+                THolder<TSocket> connected(new TSocket(s.Release()));
+                connected->SetSocketTimeout(timeout.Seconds(), timeout.MilliSecondsOfSecond());
+                connected->SetZeroLinger();
+                connected->SetKeepAlive(true);
+                Socket = std::move(connected);
+                return 0;
+            }
+        } catch (...) {
+            return EBADF;
+        }
+        return lastError;
+    }
+
+    // Shuts the held socket down in both directions and destroys it.
+    // Does nothing when no socket is held.
+    void Disconnect() {
+        if (!Socket)
+            return;
+        Socket->ShutDown(SHUT_RDWR);
+        Socket.Destroy();
+    }
+
+    // Replaces the held socket with a TSocket wrapping the descriptor `fd`.
+    void SetSocket(SOCKET fd) {
+        Socket.Reset(new TSocket(fd));
+    }
+
+    // Shuts down the write direction of the held socket.
+    // Requires a held socket: Socket is dereferenced without a null check.
+    void shutdown() {
+        Socket->ShutDown(SHUT_WR);
+    }
+
+    // Sends `messlen` bytes starting at `message`. Returns 1 when TSocket::Send
+    // reports exactly `messlen`, 0 otherwise.
+    // Requires a held socket: Socket is dereferenced without a null check.
+    int send(const void* message, size_t messlen) {
+        return (static_cast<ssize_t>(messlen) == Socket->Send(message, messlen));
+    }
+
+    // Returns 1 when recv() with MSG_PEEK yields one byte (the byte is not
+    // consumed), 0 otherwise.
+    // Requires a held socket: Socket is dereferenced without a null check.
+    int peek() {
+        char buf[1];
+        return (1 == recv(*Socket, buf, 1, MSG_PEEK));
+    }
+
+    // Reads up to `buflen` bytes into `buffer` and returns the result of
+    // TSocket::Recv unchanged.
+    // Requires a held socket: Socket is dereferenced without a null check.
+    ssize_t read(void* buffer, size_t buflen) {
+        return Socket->Recv(buffer, buflen);
+    }
+
+    // Hands ownership of the held socket (possibly none) over to the caller.
+    THolder<TSocket> PickOutSocket() {
+        return std::move(Socket);
+    }
+
+protected:
+    THolder<TSocket> Socket;
+};
diff --git a/library/cpp/http/fetch/ut/ya.make b/library/cpp/http/fetch/ut/ya.make
new file mode 100644
index 0000000000..7486986b36
--- /dev/null
+++ b/library/cpp/http/fetch/ut/ya.make
@@ -0,0 +1,12 @@
+# Unit-test module for the library/cpp/http/fetch library.
+UNITTEST_FOR(library/cpp/http/fetch)
+
+OWNER(
+ g:zora
+)
+
+# Test sources compiled into this module.
+SRCS(
+ httpfsm_ut.cpp
+ httpparser_ut.cpp
+)
+
+END()
diff --git a/library/cpp/http/fetch/ya.make b/library/cpp/http/fetch/ya.make
new file mode 100644
index 0000000000..7737127463
--- /dev/null
+++ b/library/cpp/http/fetch/ya.make
@@ -0,0 +1,38 @@
+# Build description of the library/cpp/http/fetch library.
+LIBRARY()
+
+OWNER(
+ g:zora
+)
+
+# Other modules this library is built against.
+PEERDIR(
+ contrib/libs/zlib
+ library/cpp/charset
+ library/cpp/digest/md5
+ library/cpp/http/misc
+ library/cpp/logger
+ library/cpp/mime/types
+ library/cpp/uri
+)
+
+# Sources and headers that make up the library. httpfsm.rl6 is presumably a
+# Ragel 6 source (see RAGEL6_FLAGS below) - NOTE(review): confirm.
+SRCS(
+ http_digest.cpp
+ http_socket.cpp
+ httpheader.cpp
+ httpload.cpp
+ exthttpcodes.cpp
+ httpfsm.rl6
+ httpagent.h
+ httpfetcher.h
+ httpheader.h
+ httpparser.h
+ httpzreader.h
+ sockhandler.h
+)
+
+# Presumably generates string (de)serialization code for the enums declared in
+# httpheader.h - NOTE(review): confirm against the ya.make macro reference.
+GENERATE_ENUM_SERIALIZATION(httpheader.h)
+
+# Flags presumably handed to Ragel 6 when processing the .rl6 source
+# - NOTE(review): confirm the meaning of -CF1 against the Ragel 6 guide.
+SET(RAGEL6_FLAGS -CF1)
+
+END()
+
+# Presumably pulls in the ut/ subdirectory when tests are built - confirm.
+RECURSE_FOR_TESTS(ut)