diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/http/fetch | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/http/fetch')
22 files changed, 4942 insertions, 0 deletions
diff --git a/library/cpp/http/fetch/exthttpcodes.cpp b/library/cpp/http/fetch/exthttpcodes.cpp new file mode 100644 index 0000000000..acc05650c8 --- /dev/null +++ b/library/cpp/http/fetch/exthttpcodes.cpp @@ -0,0 +1,266 @@ +#include "exthttpcodes.h" + +#include <cstring> + +const ui16 CrazyServer = ShouldDelete | MarkSuspect; + +struct http_flag { + ui16 http; + ui16 flag; +}; +static http_flag HTTP_FLAG[] = { + {HTTP_CONTINUE, MarkSuspect}, // 100 + {HTTP_SWITCHING_PROTOCOLS, CrazyServer}, // 101 + {HTTP_PROCESSING, CrazyServer}, // 102 + + {HTTP_OK, ShouldReindex}, // 200 + {HTTP_CREATED, CrazyServer}, // 201 + {HTTP_ACCEPTED, ShouldDelete}, // 202 + {HTTP_NON_AUTHORITATIVE_INFORMATION, ShouldReindex}, // 203 + {HTTP_NO_CONTENT, ShouldDelete}, // 204 + {HTTP_RESET_CONTENT, ShouldDelete}, // 205 + {HTTP_PARTIAL_CONTENT, ShouldReindex}, // 206 + {HTTP_MULTI_STATUS, CrazyServer}, // 207 + {HTTP_ALREADY_REPORTED, CrazyServer}, // 208 + {HTTP_IM_USED, CrazyServer}, // 226 + + {HTTP_MULTIPLE_CHOICES, CheckLinks | ShouldDelete}, // 300 + {HTTP_MOVED_PERMANENTLY, CheckLocation | ShouldDelete | MoveRedir}, // 301 + {HTTP_FOUND, CheckLocation | ShouldDelete | MoveRedir}, // 302 + {HTTP_SEE_OTHER, CheckLocation | ShouldDelete | MoveRedir}, // 303 + {HTTP_NOT_MODIFIED, 0}, // 304 + {HTTP_USE_PROXY, ShouldDelete}, // 305 + {HTTP_TEMPORARY_REDIRECT, CheckLocation | ShouldDelete | MoveRedir}, // 307 + {HTTP_PERMANENT_REDIRECT, CheckLocation | ShouldDelete | MoveRedir}, // 308 + + {HTTP_BAD_REQUEST, CrazyServer}, // 400 + {HTTP_UNAUTHORIZED, ShouldDelete}, // 401 + {HTTP_PAYMENT_REQUIRED, ShouldDelete}, // 402 + {HTTP_FORBIDDEN, ShouldDelete}, // 403 + {HTTP_NOT_FOUND, ShouldDelete}, // 404 + {HTTP_METHOD_NOT_ALLOWED, ShouldDelete}, // 405 + {HTTP_NOT_ACCEPTABLE, ShouldDelete}, // 406 + {HTTP_PROXY_AUTHENTICATION_REQUIRED, CrazyServer}, // 407 + {HTTP_REQUEST_TIME_OUT, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 408 + {HTTP_CONFLICT, MarkSuspect}, // 409 + {HTTP_GONE, 
ShouldDelete}, // 410 + {HTTP_LENGTH_REQUIRED, CrazyServer}, // 411 + {HTTP_PRECONDITION_FAILED, CrazyServer}, // 412 + {HTTP_REQUEST_ENTITY_TOO_LARGE, CrazyServer}, // 413 + {HTTP_REQUEST_URI_TOO_LARGE, ShouldDelete}, // 414 + {HTTP_UNSUPPORTED_MEDIA_TYPE, CrazyServer}, // 415 + {HTTP_REQUESTED_RANGE_NOT_SATISFIABLE, CrazyServer}, // 416 + {HTTP_EXPECTATION_FAILED, ShouldDelete}, // 417 + {HTTP_I_AM_A_TEAPOT, CrazyServer}, // 418 + {HTTP_AUTHENTICATION_TIMEOUT, ShouldDelete}, // 419 + + {HTTP_MISDIRECTED_REQUEST, CrazyServer}, // 421 + {HTTP_UNPROCESSABLE_ENTITY, CrazyServer}, // 422 + {HTTP_LOCKED, ShouldDelete}, // 423 + {HTTP_FAILED_DEPENDENCY, CrazyServer}, // 424 + {HTTP_UPGRADE_REQUIRED, ShouldDelete}, // 426 + {HTTP_PRECONDITION_REQUIRED, ShouldDelete}, // 428 + {HTTP_TOO_MANY_REQUESTS, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 429 + {HTTP_UNAVAILABLE_FOR_LEGAL_REASONS, ShouldDelete}, // 451 + + {HTTP_INTERNAL_SERVER_ERROR, MarkSuspect}, // 500 + {HTTP_NOT_IMPLEMENTED, ShouldDelete | ShouldDisconnect}, // 501 + {HTTP_BAD_GATEWAY, MarkSuspect}, // 502 + {HTTP_SERVICE_UNAVAILABLE, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 503 + {HTTP_GATEWAY_TIME_OUT, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 504 + {HTTP_HTTP_VERSION_NOT_SUPPORTED, CrazyServer | ShouldDisconnect}, // 505 + + {HTTP_VARIANT_ALSO_NEGOTIATES, CrazyServer | ShouldDisconnect}, // 506 + {HTTP_INSUFFICIENT_STORAGE, CrazyServer | ShouldDisconnect}, // 507 + {HTTP_LOOP_DETECTED, CrazyServer | ShouldDisconnect}, // 508 + {HTTP_BANDWIDTH_LIMIT_EXCEEDED, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 509 + {HTTP_NOT_EXTENDED, ShouldDelete}, // 510 + {HTTP_NETWORK_AUTHENTICATION_REQUIRED, ShouldDelete}, // 511 + + // custom + {HTTP_BAD_RESPONSE_HEADER, CrazyServer}, // 1000 + {HTTP_CONNECTION_LOST, ShouldRetry}, // 1001 + {HTTP_BODY_TOO_LARGE, ShouldDelete | CanBeFake}, // 1002 + {HTTP_ROBOTS_TXT_DISALLOW, ShouldDelete}, // 1003 + {HTTP_BAD_URL, ShouldDelete}, // 1004 + 
{HTTP_BAD_MIME, ShouldDelete}, // 1005 + {HTTP_DNS_FAILURE, ShouldDisconnect | MarkSuspect}, // 1006 + {HTTP_BAD_STATUS_CODE, CrazyServer}, // 1007 + {HTTP_BAD_HEADER_STRING, CrazyServer}, // 1008 + {HTTP_BAD_CHUNK, CrazyServer}, // 1009 + {HTTP_CONNECT_FAILED, ShouldDisconnect | ShouldRetry | MarkSuspect}, // 1010 + {HTTP_FILTER_DISALLOW, ShouldDelete}, // 1011 + {HTTP_LOCAL_EIO, ShouldRetry}, // 1012 + {HTTP_BAD_CONTENT_LENGTH, ShouldDelete}, // 1013 + {HTTP_BAD_ENCODING, ShouldDelete}, // 1014 + {HTTP_LENGTH_UNKNOWN, ShouldDelete}, // 1015 + {HTTP_HEADER_EOF, ShouldRetry | CanBeFake}, // 1016 + {HTTP_MESSAGE_EOF, ShouldRetry | CanBeFake}, // 1017 + {HTTP_CHUNK_EOF, ShouldRetry | CanBeFake}, // 1018 + {HTTP_PAST_EOF, ShouldRetry | ShouldDelete | CanBeFake}, // 1019 + {HTTP_HEADER_TOO_LARGE, ShouldDelete}, // 1020 + {HTTP_URL_TOO_LARGE, ShouldDelete}, // 1021 + {HTTP_INTERRUPTED, 0}, // 1022 + {HTTP_CUSTOM_NOT_MODIFIED, 0}, // 1023 + {HTTP_BAD_CONTENT_ENCODING, ShouldDelete}, // 1024 + {HTTP_PROXY_UNKNOWN, 0}, // 1030 + {HTTP_PROXY_REQUEST_TIME_OUT, 0}, // 1031 + {HTTP_PROXY_INTERNAL_ERROR, 0}, // 1032 + {HTTP_PROXY_CONNECT_FAILED, 0}, // 1033 + {HTTP_PROXY_CONNECTION_LOST, 0}, // 1034 + {HTTP_PROXY_NO_PROXY, 0}, // 1035 + {HTTP_PROXY_ERROR, 0}, // 1036 + {HTTP_SSL_ERROR, 0}, // 1037 + {HTTP_CACHED_COPY_NOT_FOUND, 0}, // 1038 + {HTTP_TIMEDOUT_WHILE_BYTES_RECEIVING, ShouldRetry}, // 1039 + {HTTP_FETCHER_BAD_RESPONSE, 0}, // 1040 + {HTTP_FETCHER_MB_ERROR, 0}, // 1041 + {HTTP_SSL_CERT_ERROR, 0}, // 1042 + + // Custom (replace HTTP 200/304) + {EXT_HTTP_MIRRMOVE, 0}, // 2000 + {EXT_HTTP_MANUAL_DELETE, ShouldDelete}, // 2001 + {EXT_HTTP_NOTUSED2, ShouldDelete}, // 2002 + {EXT_HTTP_NOTUSED3, ShouldDelete}, // 2003 + {EXT_HTTP_REFRESH, ShouldDelete | CheckLinks | MoveRedir}, // 2004 + {EXT_HTTP_NOINDEX, ShouldDelete | CheckLinks}, // 2005 + {EXT_HTTP_BADCODES, ShouldDelete}, // 2006 + {EXT_HTTP_SITESTAT, ShouldDelete}, // 2007 + {EXT_HTTP_IOERROR, ShouldDelete}, // 2008 + 
{EXT_HTTP_BASEERROR, ShouldDelete}, // 2009 + {EXT_HTTP_PARSERROR, ShouldDelete | CanBeFake}, // 2010 + {EXT_HTTP_BAD_CHARSET, ShouldDelete | CheckLinks}, // 2011 + {EXT_HTTP_BAD_LANGUAGE, ShouldDelete | CheckLinks}, // 2012 + {EXT_HTTP_NUMERERROR, ShouldDelete}, // 2013 + {EXT_HTTP_EMPTYDOC, ShouldDelete | CheckLinks}, // 2014 + {EXT_HTTP_HUGEDOC, ShouldDelete}, // 2015 + {EXT_HTTP_LINKGARBAGE, ShouldDelete}, // 2016 + {EXT_HTTP_PARSERFAIL, ShouldDelete}, // 2019 + {EXT_HTTP_GZIPERROR, ShouldDelete}, // 2020 + {EXT_HTTP_MANUAL_DELETE_URL, ShouldDelete}, // 2022 + {EXT_HTTP_CUSTOM_PARTIAL_CONTENT, ShouldReindex}, // 2023 + {EXT_HTTP_EMPTY_RESPONSE, ShouldDelete}, // 2024 + {EXT_HTTP_REL_CANONICAL, ShouldDelete | CheckLinks | MoveRedir}, // 2025 + {0, 0}}; + +static ui16* prepare_flags(http_flag* arg) { + static ui16 flags[EXT_HTTP_CODE_MAX]; + http_flag* ptr; + size_t i; + + // устанавливаем значение по умолчанию для кодов не перечисленных в таблице выше + for (i = 0; i < EXT_HTTP_CODE_MAX; ++i) + flags[i] = CrazyServer; + + // устанавливаем флаги для перечисленных кодов + for (ptr = arg; ptr->http; ++ptr) + flags[ptr->http & (EXT_HTTP_CODE_MAX - 1)] = ptr->flag; + + // для стандартных кодов ошибок берем флаги из первого кода каждой группы и проставляем их + // всем кодам не перечисленным в таблице выше + for (size_t group = 0; group < 1000; group += 100) + for (size_t j = group + 1; j < group + 100; ++j) + flags[j] = flags[group]; + + // предыдущий цикл затер некоторые флаги перечисленные в таблице выше + // восстанавливаем их + for (ptr = arg; ptr->http; ++ptr) + flags[ptr->http & (EXT_HTTP_CODE_MAX - 1)] = ptr->flag; + + return flags; +} + +ui16* http2status = prepare_flags(HTTP_FLAG); + +TStringBuf ExtHttpCodeStr(int code) noexcept { + if (code < HTTP_CODE_MAX) { + return HttpCodeStr(code); + } + switch (code) { + case HTTP_BAD_RESPONSE_HEADER: + return TStringBuf("Bad response header"); + case HTTP_CONNECTION_LOST: + return TStringBuf("Connection lost"); + 
case HTTP_BODY_TOO_LARGE: + return TStringBuf("Body too large"); + case HTTP_ROBOTS_TXT_DISALLOW: + return TStringBuf("robots.txt disallow"); + case HTTP_BAD_URL: + return TStringBuf("Bad url"); + case HTTP_BAD_MIME: + return TStringBuf("Bad mime type"); + case HTTP_DNS_FAILURE: + return TStringBuf("Dns failure"); + case HTTP_BAD_STATUS_CODE: + return TStringBuf("Bad status code"); + case HTTP_BAD_HEADER_STRING: + return TStringBuf("Bad header string"); + case HTTP_BAD_CHUNK: + return TStringBuf("Bad chunk"); + case HTTP_CONNECT_FAILED: + return TStringBuf("Connect failed"); + case HTTP_FILTER_DISALLOW: + return TStringBuf("Filter disallow"); + case HTTP_LOCAL_EIO: + return TStringBuf("Local eio"); + case HTTP_BAD_CONTENT_LENGTH: + return TStringBuf("Bad content length"); + case HTTP_BAD_ENCODING: + return TStringBuf("Bad encoding"); + case HTTP_LENGTH_UNKNOWN: + return TStringBuf("Length unknown"); + case HTTP_HEADER_EOF: + return TStringBuf("Header EOF"); + case HTTP_MESSAGE_EOF: + return TStringBuf("Message EOF"); + case HTTP_CHUNK_EOF: + return TStringBuf("Chunk EOF"); + case HTTP_PAST_EOF: + return TStringBuf("Past EOF"); + case HTTP_HEADER_TOO_LARGE: + return TStringBuf("Header is too large"); + case HTTP_URL_TOO_LARGE: + return TStringBuf("Url is too large"); + case HTTP_INTERRUPTED: + return TStringBuf("Interrupted"); + case HTTP_CUSTOM_NOT_MODIFIED: + return TStringBuf("Signature detector thinks that doc is not modified"); + case HTTP_BAD_CONTENT_ENCODING: + return TStringBuf("Bad content encoding"); + case HTTP_NO_RESOURCES: + return TStringBuf("No resources"); + case HTTP_FETCHER_SHUTDOWN: + return TStringBuf("Fetcher shutdown"); + case HTTP_CHUNK_TOO_LARGE: + return TStringBuf("Chunk size is too big"); + case HTTP_SERVER_BUSY: + return TStringBuf("Server is busy"); + case HTTP_SERVICE_UNKNOWN: + return TStringBuf("Service is unknown"); + case HTTP_PROXY_UNKNOWN: + return TStringBuf("Zora: unknown error"); + case HTTP_PROXY_REQUEST_TIME_OUT: + return 
TStringBuf("Zora: request time out"); + case HTTP_PROXY_INTERNAL_ERROR: + return TStringBuf("Zora: internal server error"); + case HTTP_PROXY_CONNECT_FAILED: + return TStringBuf("Spider proxy connect failed"); + case HTTP_PROXY_CONNECTION_LOST: + return TStringBuf("Spider proxy connection lost"); + case HTTP_PROXY_NO_PROXY: + return TStringBuf("Spider proxy no proxy alive in region"); + case HTTP_PROXY_ERROR: + return TStringBuf("Spider proxy returned custom error"); + case HTTP_SSL_ERROR: + return TStringBuf("Ssl library returned error"); + case HTTP_CACHED_COPY_NOT_FOUND: + return TStringBuf("Cached copy for the url is not available"); + case HTTP_TIMEDOUT_WHILE_BYTES_RECEIVING: + return TStringBuf("Timed out while bytes receiving"); + + // TODO: messages for >2000 codes + + default: + return TStringBuf("Unknown HTTP code"); + } +} diff --git a/library/cpp/http/fetch/exthttpcodes.h b/library/cpp/http/fetch/exthttpcodes.h new file mode 100644 index 0000000000..6b525052cd --- /dev/null +++ b/library/cpp/http/fetch/exthttpcodes.h @@ -0,0 +1,141 @@ +#pragma once + +#include <util/system/defaults.h> +#include <library/cpp/http/misc/httpcodes.h> + +enum ExtHttpCodes { + // Custom + HTTP_EXTENDED = 1000, + HTTP_BAD_RESPONSE_HEADER = 1000, + HTTP_CONNECTION_LOST = 1001, + HTTP_BODY_TOO_LARGE = 1002, + HTTP_ROBOTS_TXT_DISALLOW = 1003, + HTTP_BAD_URL = 1004, + HTTP_BAD_MIME = 1005, + HTTP_DNS_FAILURE = 1006, + HTTP_BAD_STATUS_CODE = 1007, + HTTP_BAD_HEADER_STRING = 1008, + HTTP_BAD_CHUNK = 1009, + HTTP_CONNECT_FAILED = 1010, + HTTP_FILTER_DISALLOW = 1011, + HTTP_LOCAL_EIO = 1012, + HTTP_BAD_CONTENT_LENGTH = 1013, + HTTP_BAD_ENCODING = 1014, + HTTP_LENGTH_UNKNOWN = 1015, + HTTP_HEADER_EOF = 1016, + HTTP_MESSAGE_EOF = 1017, + HTTP_CHUNK_EOF = 1018, + HTTP_PAST_EOF = 1019, + HTTP_HEADER_TOO_LARGE = 1020, + HTTP_URL_TOO_LARGE = 1021, + HTTP_INTERRUPTED = 1022, + HTTP_CUSTOM_NOT_MODIFIED = 1023, + HTTP_BAD_CONTENT_ENCODING = 1024, + HTTP_NO_RESOURCES = 1025, + 
HTTP_FETCHER_SHUTDOWN = 1026, + HTTP_CHUNK_TOO_LARGE = 1027, + HTTP_SERVER_BUSY = 1028, + HTTP_SERVICE_UNKNOWN = 1029, + HTTP_PROXY_UNKNOWN = 1030, + HTTP_PROXY_REQUEST_TIME_OUT = 1031, + HTTP_PROXY_INTERNAL_ERROR = 1032, + HTTP_PROXY_CONNECT_FAILED = 1033, + HTTP_PROXY_CONNECTION_LOST = 1034, + HTTP_PROXY_NO_PROXY = 1035, + HTTP_PROXY_ERROR = 1036, + HTTP_SSL_ERROR = 1037, + HTTP_CACHED_COPY_NOT_FOUND = 1038, + HTTP_TIMEDOUT_WHILE_BYTES_RECEIVING = 1039, + HTTP_FETCHER_BAD_RESPONSE = 1040, + HTTP_FETCHER_MB_ERROR = 1041, + HTTP_SSL_CERT_ERROR = 1042, + HTTP_PROXY_REQUEST_CANCELED = 1051, + + // Custom (replace HTTP 200/304) + EXT_HTTP_EXT_SUCCESS_BEGIN = 2000, // to check if code variable is in success interval + EXT_HTTP_MIRRMOVE = 2000, + EXT_HTTP_MANUAL_DELETE = 2001, + EXT_HTTP_NOTUSED2 = 2002, + EXT_HTTP_NOTUSED3 = 2003, + EXT_HTTP_REFRESH = 2004, + EXT_HTTP_NOINDEX = 2005, + EXT_HTTP_BADCODES = 2006, + EXT_HTTP_SITESTAT = 2007, + EXT_HTTP_IOERROR = 2008, + EXT_HTTP_BASEERROR = 2009, + EXT_HTTP_PARSERROR = 2010, + EXT_HTTP_BAD_CHARSET = 2011, + EXT_HTTP_BAD_LANGUAGE = 2012, + EXT_HTTP_NUMERERROR = 2013, + EXT_HTTP_EMPTYDOC = 2014, + EXT_HTTP_HUGEDOC = 2015, + EXT_HTTP_LINKGARBAGE = 2016, + EXT_HTTP_EXDUPLICATE = 2017, + EXT_HTTP_FILTERED = 2018, + EXT_HTTP_PARSERFAIL = 2019, // parser crashed (in this case image spider will redownload such document) + EXT_HTTP_GZIPERROR = 2020, + EXT_HTTP_CLEANPARAM = 2021, + EXT_HTTP_MANUAL_DELETE_URL = 2022, + EXT_HTTP_CUSTOM_PARTIAL_CONTENT = 2023, + EXT_HTTP_EMPTY_RESPONSE = 2024, + EXT_HTTP_REL_CANONICAL = 2025, + + EXT_HTTP_EXT_SUCCESS_END = 3000, // to check if code variable is in success interval + EXT_HTTP_HOSTFILTER = 3001, + EXT_HTTP_URLFILTER = 3002, + EXT_HTTP_SUFFIXFILTER = 3003, + EXT_HTTP_DOMAINFILTER = 3004, + EXT_HTTP_EXTDOMAINFILTER = 3005, + EXT_HTTP_PORTFILTER = 3006, + EXT_HTTP_MIRROR = 3007, + EXT_HTTP_DEEPDIR = 3008, + EXT_HTTP_DUPDIRS = 3009, + EXT_HTTP_REGEXP = 3010, + EXT_HTTP_OLDDELETED = 3012, + 
EXT_HTTP_PENALTY = 3013, + EXT_HTTP_POLICY = 3015, + EXT_HTTP_TOOOLD = 3016, + EXT_HTTP_GARBAGE = 3017, + EXT_HTTP_FOREIGN = 3018, + EXT_HTTP_EXT_REGEXP = 3019, + EXT_HTTP_HOPS = 3020, + EXT_HTTP_SELRANK = 3021, + EXT_HTTP_NOLINKS = 3022, + EXT_HTTP_WRONGMULTILANG = 3023, + EXT_HTTP_SOFTMIRRORS = 3024, + EXT_HTTP_BIGLEVEL = 3025, + + // fast robot codes + + EXT_HTTP_FASTHOPS = 4000, + EXT_HTTP_NODOC = 4001, + + EXT_HTTP_MAX +}; + +enum HttpFlags { + // connection + ShouldDisconnect = 1, + ShouldRetry = 2, + // UNUSED 4 + + // indexer + ShouldReindex = 8, + ShouldDelete = 16, + CheckLocation = 32, + CheckLinks = 64, + MarkSuspect = 128, + // UNUSED 256 + // UNUSED 512 + MoveRedir = 1024, + CanBeFake = 2048, +}; + +const size_t EXT_HTTP_CODE_MAX = 1 << 12; + +static inline int Http2Status(int code) { + extern ui16* http2status; + return http2status[code & (EXT_HTTP_CODE_MAX - 1)]; +} + +TStringBuf ExtHttpCodeStr(int code) noexcept; diff --git a/library/cpp/http/fetch/http_digest.cpp b/library/cpp/http/fetch/http_digest.cpp new file mode 100644 index 0000000000..1eaa02b7f2 --- /dev/null +++ b/library/cpp/http/fetch/http_digest.cpp @@ -0,0 +1,206 @@ +#include "http_digest.h" + +#include <library/cpp/digest/md5/md5.h> +#include <util/stream/output.h> +#include <util/stream/str.h> + +/************************************************************/ +/************************************************************/ +static const char* WWW_PREFIX = "Authorization: Digest "; + +/************************************************************/ +httpDigestHandler::httpDigestHandler() + : User_(nullptr) + , Password_(nullptr) + , Nonce_(nullptr) + , NonceCount_(0) + , HeaderInstruction_(nullptr) +{ +} + +/************************************************************/ +httpDigestHandler::~httpDigestHandler() { + clear(); +} + +/************************************************************/ +void httpDigestHandler::clear() { + free(Nonce_); + free(HeaderInstruction_); + User_ = Password_ = 
nullptr; + Nonce_ = HeaderInstruction_ = nullptr; + NonceCount_ = 0; +} + +/************************************************************/ +void httpDigestHandler::setAuthorization(const char* user, const char* password) { + clear(); + if (user && password) { + User_ = user; + Password_ = password; + } +} + +/************************************************************/ +const char* httpDigestHandler::getHeaderInstruction() const { + return HeaderInstruction_; +} + +/************************************************************/ +void httpDigestHandler::generateCNonce(char* outCNonce) { + if (!*outCNonce) + sprintf(outCNonce, "%ld", (long)time(nullptr)); +} + +/************************************************************/ +inline void addMD5(MD5& ctx, const char* value) { + ctx.Update((const unsigned char*)(value), strlen(value)); +} + +inline void addMD5(MD5& ctx, const char* value, int len) { + ctx.Update((const unsigned char*)(value), len); +} + +inline void addMD5Sep(MD5& ctx) { + addMD5(ctx, ":", 1); +} + +/************************************************************/ +/* calculate H(A1) as per spec */ +void httpDigestHandler::digestCalcHA1(const THttpAuthHeader& hd, + char* outSessionKey, + char* outCNonce) { + MD5 ctx; + ctx.Init(); + addMD5(ctx, User_); + addMD5Sep(ctx); + addMD5(ctx, hd.realm); + addMD5Sep(ctx); + addMD5(ctx, Password_); + + if (hd.algorithm == 1) { //MD5-sess + unsigned char digest[16]; + ctx.Final(digest); + + generateCNonce(outCNonce); + + ctx.Init(); + ctx.Update(digest, 16); + addMD5Sep(ctx); + addMD5(ctx, hd.nonce); + addMD5Sep(ctx); + addMD5(ctx, outCNonce); + ctx.End(outSessionKey); + } + + ctx.End(outSessionKey); +}; + +/************************************************************/ +/* calculate request-digest/response-digest as per HTTP Digest spec */ +void httpDigestHandler::digestCalcResponse(const THttpAuthHeader& hd, + const char* path, + const char* method, + const char* nonceCount, + char* outResponse, + char* outCNonce) { + 
char HA1[33]; + digestCalcHA1(hd, HA1, outCNonce); + + char HA2[33]; + MD5 ctx; + ctx.Init(); + addMD5(ctx, method); + addMD5Sep(ctx); + addMD5(ctx, path); + //ignore auth-int + ctx.End(HA2); + + ctx.Init(); + addMD5(ctx, HA1, 32); + addMD5Sep(ctx); + addMD5(ctx, Nonce_); + addMD5Sep(ctx); + + if (hd.qop_auth) { + if (!*outCNonce) + generateCNonce(outCNonce); + + addMD5(ctx, nonceCount, 8); + addMD5Sep(ctx); + addMD5(ctx, outCNonce); + addMD5Sep(ctx); + addMD5(ctx, "auth", 4); + addMD5Sep(ctx); + } + addMD5(ctx, HA2, 32); + ctx.End(outResponse); +} + +/************************************************************/ +bool httpDigestHandler::processHeader(const THttpAuthHeader* header, + const char* path, + const char* method, + const char* cnonce) { + if (!User_ || !header || !header->use_auth || !header->realm || !header->nonce) + return false; + + if (Nonce_) { + if (strcmp(Nonce_, header->nonce)) { + free(Nonce_); + Nonce_ = nullptr; + NonceCount_ = 0; + } + } + if (!Nonce_) { + Nonce_ = strdup(header->nonce); + NonceCount_ = 0; + } + free(HeaderInstruction_); + HeaderInstruction_ = nullptr; + NonceCount_++; + + char nonceCount[20]; + sprintf(nonceCount, "%08d", NonceCount_); + + char CNonce[50]; + if (cnonce) + strcpy(CNonce, cnonce); + else + CNonce[0] = 0; + + char response[33]; + digestCalcResponse(*header, path, method, nonceCount, response, CNonce); + + //digest-response = 1#( username | realm | nonce | digest-uri + // | response | [ algorithm ] | [cnonce] | + // [opaque] | [message-qop] | + // [nonce-count] | [auth-param] ) + + TStringStream out; + out << WWW_PREFIX << "username=\"" << User_ << "\""; + out << ", realm=\"" << header->realm << "\""; + out << ", nonce=\"" << header->nonce << "\""; + out << ", uri=\"" << path << "\""; + if (header->algorithm == 1) + out << ", algorithm=MD5-sess"; + else + out << ", algorithm=MD5"; + if (header->qop_auth) + out << ", qop=auth"; + out << ", nc=" << nonceCount; + if (CNonce[0]) + out << ", cnonce=\"" << CNonce << 
"\""; + out << ", response=\"" << response << "\""; + if (header->opaque) + out << ", opaque=\"" << header->opaque << "\""; + out << "\r\n"; + + TString s_out = out.Str(); + HeaderInstruction_ = strdup(s_out.c_str()); + + return true; +} + +/************************************************************/ +/************************************************************/ diff --git a/library/cpp/http/fetch/http_digest.h b/library/cpp/http/fetch/http_digest.h new file mode 100644 index 0000000000..3b1872d70b --- /dev/null +++ b/library/cpp/http/fetch/http_digest.h @@ -0,0 +1,47 @@ +#pragma once + +#include "httpheader.h" + +#include <util/system/compat.h> +#include <library/cpp/http/misc/httpcodes.h> + +class httpDigestHandler { +protected: + const char* User_; + const char* Password_; + char* Nonce_; + int NonceCount_; + char* HeaderInstruction_; + + void clear(); + + void generateCNonce(char* outCNonce); + + void digestCalcHA1(const THttpAuthHeader& hd, + char* outSessionKey, + char* outCNonce); + + void digestCalcResponse(const THttpAuthHeader& hd, + const char* method, + const char* path, + const char* nonceCount, + char* outResponse, + char* outCNonce); + +public: + httpDigestHandler(); + ~httpDigestHandler(); + + void setAuthorization(const char* user, + const char* password); + bool processHeader(const THttpAuthHeader* header, + const char* path, + const char* method, + const char* cnonce = nullptr); + + bool empty() const { + return (!User_); + } + + const char* getHeaderInstruction() const; +}; diff --git a/library/cpp/http/fetch/http_socket.cpp b/library/cpp/http/fetch/http_socket.cpp new file mode 100644 index 0000000000..1524ef04a8 --- /dev/null +++ b/library/cpp/http/fetch/http_socket.cpp @@ -0,0 +1,206 @@ +#include "httpload.h" +#include "http_digest.h" + +/************************************************************/ + +#ifdef USE_GNUTLS + +#include <gcrypt.h> +#include <gnutls/gnutls.h> +#include <util/network/init.h> +#include <util/network/socket.h> 
+#include <util/system/mutex.h> + +/********************************************************/ +// HTTPS handler is used as implementation of +// socketAbstractHandler for work through HTTPS protocol + +class socketSecureHandler: public socketRegularHandler { +protected: + bool IsValid_; + gnutls_session Session_; + gnutls_certificate_credentials Credits_; + +public: + socketSecureHandler(); + virtual ~socketSecureHandler(); + + virtual bool Good(); + virtual int Connect(const TAddrList& addrs, TDuration Timeout); + virtual void Disconnect(); + virtual void shutdown(); + virtual bool send(const char* message, ssize_t messlen); + virtual bool peek(); + virtual ssize_t read(void* buffer, ssize_t buflen); +}; + +/********************************************************/ +/********************************************************/ +static int gcry_pthread_mutex_init(void** priv) { + int err = 0; + + try { + TMutex* lock = new TMutex; + *priv = lock; + } catch (...) { + err = -1; + } + + return err; +} + +static int gcry_pthread_mutex_destroy(void** lock) { + delete static_cast<TMutex*>(*lock); + + return 0; +} + +static int gcry_pthread_mutex_lock(void** lock) { + static_cast<TMutex*>(*lock)->Acquire(); + + return 0; +} + +static int gcry_pthread_mutex_unlock(void** lock) { + static_cast<TMutex*>(*lock)->Release(); + + return 0; +} + +static struct gcry_thread_cbs gcry_threads_pthread = + { + GCRY_THREAD_OPTION_PTHREAD, NULL, + gcry_pthread_mutex_init, gcry_pthread_mutex_destroy, + gcry_pthread_mutex_lock, gcry_pthread_mutex_unlock, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL}; + +/********************************************************/ +struct https_initor { + https_initor() { + gcry_control(GCRYCTL_SET_THREAD_CBS, &gcry_threads_pthread); + gnutls_global_init(); + InitNetworkSubSystem(); + } + + ~https_initor() { + gnutls_global_deinit(); + } +}; + +static https_initor _initor; + +/********************************************************/ 
+socketSecureHandler::socketSecureHandler() + : socketRegularHandler() + , IsValid_(false) + , Session_() + , Credits_() +{ +} + +/********************************************************/ +socketSecureHandler::~socketSecureHandler() { + if (IsValid_) + Disconnect(); +} + +/********************************************************/ +bool socketSecureHandler::Good() { + return Socket_.Good() && IsValid_; +} + +/********************************************************/ +int socketSecureHandler::Connect(const TAddrList& addrs, TDuration Timeout) { + IsValid_ = false; + + int ret = socketRegularHandler::Connect(addrs, Timeout); + if (ret) + return ret; + + gnutls_certificate_allocate_credentials(&Credits_); + gnutls_init(&Session_, GNUTLS_CLIENT); + gnutls_set_default_priority(Session_); + gnutls_credentials_set(Session_, GNUTLS_CRD_CERTIFICATE, Credits_); + + SOCKET fd = Socket_; + gnutls_transport_set_ptr(Session_, (gnutls_transport_ptr)fd); + + ret = gnutls_handshake(Session_); + + if (ret < 0) { + fprintf(stderr, "*** Handshake failed\n"); + gnutls_perror(ret); + + gnutls_deinit(Session_); + if (Credits_) { + gnutls_certificate_free_credentials(Credits_); + Credits_ = 0; + } + return 1; + } + + IsValid_ = true; + return !IsValid_; +} + +/********************************************************/ +void socketSecureHandler::Disconnect() { + if (IsValid_) { + gnutls_bye(Session_, GNUTLS_SHUT_RDWR); + IsValid_ = false; + gnutls_deinit(Session_); + } + + if (Credits_) { + gnutls_certificate_free_credentials(Credits_); + Credits_ = 0; + } + + socketRegularHandler::Disconnect(); +} + +/********************************************************/ +void socketSecureHandler::shutdown() { +} + +/********************************************************/ +bool socketSecureHandler::send(const char* message, ssize_t messlen) { + if (!IsValid_) + return false; + ssize_t rv = gnutls_record_send(Session_, message, messlen); + return rv >= 0; +} + 
+/********************************************************/ +bool socketSecureHandler::peek() { + //ssize_t rv = gnutls_record_check_pending(mSession); + //return rv>0; + return true; +} + +/********************************************************/ +ssize_t socketSecureHandler::read(void* buffer, ssize_t buflen) { + if (!IsValid_) + return false; + return gnutls_record_recv(Session_, (char*)buffer, buflen); +} + +#endif + +/************************************************************/ +socketAbstractHandler* socketHandlerFactory::chooseHandler(const THttpURL& url) { + if (url.IsValidGlobal() && url.GetScheme() == THttpURL::SchemeHTTP) + return new socketRegularHandler; + +#ifdef USE_GNUTLS + if (url.IsValidGlobal() && url.GetScheme() == THttpURL::SchemeHTTPS) + return new socketSecureHandler; +#endif + + return nullptr; +} + +/************************************************************/ +socketHandlerFactory socketHandlerFactory::sInstance; +/************************************************************/ diff --git a/library/cpp/http/fetch/httpagent.h b/library/cpp/http/fetch/httpagent.h new file mode 100644 index 0000000000..96475cc05d --- /dev/null +++ b/library/cpp/http/fetch/httpagent.h @@ -0,0 +1,316 @@ +#pragma once + +#include <cstdio> +#include <cstring> +#include <cstdlib> + +#include <library/cpp/uri/http_url.h> +#include <util/datetime/base.h> +#include <util/network/hostip.h> +#include <util/network/ip.h> +#include <util/network/sock.h> +#include <util/generic/scope.h> +#include <util/generic/utility.h> +#include <util/string/cast.h> + +#include "exthttpcodes.h" +#include "sockhandler.h" + +class TIpResolver { +public: + TAddrList Resolve(const char* host, TIpPort port) const { + try { + TAddrList result; + TNetworkAddress na(host, port); + for (auto i = na.Begin(); i != na.End(); ++i) { + const struct addrinfo& ai = *i; + switch (ai.ai_family) { + case AF_INET: + result.push_back(new NAddr::TIPv4Addr(*(sockaddr_in*)ai.ai_addr)); + break; + case 
AF_INET6: + result.push_back(new NAddr::TIPv6Addr(*(sockaddr_in6*)ai.ai_addr)); + break; + } + } + return result; + } catch (const TNetworkResolutionError&) { + } + return TAddrList(); + } +}; + +namespace NResolverHelpers { + Y_HAS_MEMBER(Resolve); + + template <typename TResolver> + std::enable_if_t<TClassHasResolve<TResolver>::value, TAddrList> Resolve(const TResolver& r, const char* host, TIpPort port) { + return r.Resolve(host, port); + } + + template <typename TResolver> + std::enable_if_t<!TClassHasResolve<TResolver>::value, TAddrList> Resolve(const TResolver& r, const char* host, TIpPort port) { + ui32 ip = 0; + if (r.GetHostIP(host, &ip)) { + // error + return TAddrList(); + } + if (!ip) { + return TAddrList(); + } + + return TAddrList::MakeV4Addr(ip, port); + } +} + +template <typename TBase> +class TIpResolverWrapper { +private: + TBase Base; + +public: + TIpResolverWrapper() = default; + + template <typename T> + TIpResolverWrapper(T&& base) + : Base(std::forward(base)) + { + } + + TAddrList Resolve(const char* host, TIpPort port) const { + return NResolverHelpers::Resolve(Base, host, port); + } +}; + +template <class TSocketHandler = TSimpleSocketHandler, class TDnsClient = TIpResolver> +class THttpAgent { +public: + THttpAgent() + : Persistent(0) + , Timeout(TDuration::MicroSeconds(150)) + , Hostheader(nullptr) + , Footer(nullptr) + , AltFooter(nullptr) + , PostData(nullptr) + , PostDataLen(0) + , Method(nullptr) + , MethodLen(0) + , HostheaderLen(0) + { + SetIdentification("YandexSomething/1.0", "webadmin@yandex.ru"); + } + + ~THttpAgent() { + Disconnect(); + free(Hostheader); + free(Footer); + } + + void SetIdentification(const char* user_agent, const char* http_from) { + free(Footer); + size_t len = user_agent ? strlen(user_agent) + 15 : 0; + len += http_from ? 
strlen(http_from) + 9 : 0; + len += 3; + Footer = (char*)malloc(len); + if (user_agent) + strcat(strcat(strcpy(Footer, "User-Agent: "), user_agent), "\r\n"); + if (http_from) + strcat(strcat(strcat(Footer, "From: "), http_from), "\r\n"); + } + + void SetUserAgentFooter(const char* altFooter) { + AltFooter = altFooter; + } + + void SetPostData(const char* postData, size_t postDataLen) { + PostData = postData; + PostDataLen = postDataLen; + } + + void SetMethod(const char* method, size_t methodLen) { + Method = method; + MethodLen = methodLen; + } + + // deprecated + ui32 GetIp() const { + return Addrs.GetV4Addr().first; + } + + int GetScheme() const { + return THttpURL::SchemeHTTP; + } + void SetTimeout(TDuration tim) { + Timeout = tim; + } + + void SetConnectTimeout(TDuration timeout) { + ConnectTimeout = timeout; + } + + int Disconnected() { + return !Persistent || !Socket.Good(); + } + + int SetHost(const char* hostname, TIpPort port) { + Disconnect(); + TAddrList addrs = DnsClient.Resolve(hostname, port); + if (!addrs.size()) { + return 1; + } + + SetHost(hostname, port, addrs); + return 0; + } + + int SetHost(const char* hostname, TIpPort port, const TAddrList& addrs) { + Disconnect(); + Addrs = addrs; + size_t reqHostheaderLen = strlen(hostname) + 20; + if (HostheaderLen < reqHostheaderLen) { + free(Hostheader); + Hostheader = (char*)malloc((HostheaderLen = reqHostheaderLen)); + } + if (port == 80) + sprintf(Hostheader, "Host: %s\r\n", hostname); + else + sprintf(Hostheader, "Host: %s:%u\r\n", hostname, port); + pHostBeg = strchr(Hostheader, ' ') + 1; + pHostEnd = strchr(pHostBeg, '\r'); + // convert hostname to lower case since some web server don't like + // uppper case (Task ROBOT-562) + for (char* p = pHostBeg; p < pHostEnd; p++) + *p = tolower(*p); + return 0; + } + + // deprecated v4-only + int SetHost(const char* hostname, TIpPort port, ui32 ip) { + return SetHost(hostname, port, TAddrList::MakeV4Addr(ip, port)); + } + + void SetHostHeader(const char* 
host) { + size_t reqHostheaderLen = strlen(host) + 20; + if (HostheaderLen < reqHostheaderLen) { + delete[] Hostheader; + Hostheader = new char[(HostheaderLen = reqHostheaderLen)]; + } + sprintf(Hostheader, "Host: %s\r\n", host); + } + + void SetSocket(SOCKET fd) { + Socket.SetSocket(fd); + } + + SOCKET PickOutSocket() { + return Socket.PickOutSocket(); + } + + void Disconnect() { + Socket.Disconnect(); + } + + ssize_t read(void* buffer, size_t buflen) { + return Socket.read(buffer, buflen); + } + + int RequestGet(const char* url, const char* const* headers, int persistent = 1, bool head_request = false) { + if (!Addrs.size()) + return HTTP_DNS_FAILURE; + char message[MessageMax]; + ssize_t messlen = 0; + if (Method) { + strncpy(message, Method, MethodLen); + message[MethodLen] = ' '; + messlen = MethodLen + 1; + } else if (PostData) { + strcpy(message, "POST "); + messlen = 5; + } else if (head_request) { + strcpy(message, "HEAD "); + messlen = 5; + } else { + strcpy(message, "GET "); + messlen = 4; + } +#define _AppendMessage(mes) messlen += Min(MessageMax - messlen, \ + (ssize_t)strlcpy(message + messlen, (mes), MessageMax - messlen)) + _AppendMessage(url); + _AppendMessage(" HTTP/1.1\r\n"); + if (*url == '/') //if not then Host is a proxy + _AppendMessage(Hostheader); + _AppendMessage("Connection: "); + _AppendMessage(persistent ? "Keep-Alive\r\n" : "Close\r\n"); + while (headers && *headers) + _AppendMessage(*headers++); + if (AltFooter) + _AppendMessage(AltFooter); + else + _AppendMessage(Footer); + _AppendMessage("\r\n"); +#undef _AppendMessage + if (messlen >= MessageMax) + return HTTP_HEADER_TOO_LARGE; + + if (!Persistent) + Disconnect(); + Persistent = persistent; + int connected = Socket.Good(); + for (int attempt = !connected; attempt < 2; attempt++) { + const auto connectTimeout = ConnectTimeout ? 
ConnectTimeout : Timeout; + if (!Socket.Good() && Socket.Connect(Addrs, connectTimeout)) + return HTTP_CONNECT_FAILED; + + int sendOk = Socket.send(message, messlen); + if (sendOk && PostData && PostDataLen) + sendOk = Socket.send(PostData, PostDataLen); + if (!sendOk) { + int err = errno; + Disconnect(); + errno = err; + continue; + } + + if (!Socket.peek()) { + int err = errno; + Disconnect(); + if (err == EINTR) { + errno = err; + return HTTP_INTERRUPTED; + } + } else { + if (!persistent) + Socket.shutdown(); + return 0; + } + } + return connected ? HTTP_CONNECTION_LOST : HTTP_CONNECT_FAILED; + } + +protected: + TSocketHandler Socket; + TIpResolverWrapper<TDnsClient> DnsClient; + TAddrList Addrs; + int Persistent; + TDuration Timeout; + TDuration ConnectTimeout; + char *Hostheader, *Footer, *pHostBeg, *pHostEnd; + const char* AltFooter; // alternative footer can be set by the caller + const char* PostData; + size_t PostDataLen; + const char* Method; + size_t MethodLen; + unsigned short HostheaderLen; + static const ssize_t MessageMax = 32768; +}; + +struct TNoTimer { + inline void OnBeforeSend() { + } + inline void OnAfterSend() { + } + inline void OnBeforeRecv() { + } + inline void OnAfterRecv() { + } +}; diff --git a/library/cpp/http/fetch/httpfetcher.h b/library/cpp/http/fetch/httpfetcher.h new file mode 100644 index 0000000000..7fc251afd2 --- /dev/null +++ b/library/cpp/http/fetch/httpfetcher.h @@ -0,0 +1,171 @@ +#pragma once + +#ifdef _MSC_VER +#include <io.h> +#endif + +#include <library/cpp/http/misc/httpdate.h> + +#include "httpagent.h" +#include "httpparser.h" + +struct TFakeBackup { + int Write(void* /*buf*/, size_t /*size*/) { + return 0; + } +}; + +template <size_t bufsize = 5000> +struct TFakeAlloc { + void Shrink(void* /*buf*/, size_t /*size*/) { + } + void* Grab(size_t /*min*/, size_t* real) { + *real = bufsize; + return buf; + } + char buf[bufsize]; +}; + +template <typename TAlloc = TFakeAlloc<>, + typename TCheck = TFakeCheck<>, + typename 
// Fetches one HTTP document by combining four policies through inheritance:
//   TAlloc  - hands out receive buffers (Grab) and is told how much was used (Shrink);
//   TCheck  - size limits and header/document checks;
//   TWriter - receives every filled buffer (Write);
//   TAgent  - sends the request and reads from the socket.
template <typename TAlloc = TFakeAlloc<>,
          typename TCheck = TFakeCheck<>,
          typename TWriter = TFakeBackup,
          typename TAgent = THttpAgent<>>
class THttpFetcher: public THttpParser<TCheck>, public TAlloc, public TWriter, public TAgent {
public:
    // A new buffer is requested once the current one has less than this many free bytes.
    static const size_t TCP_MIN = 1500;
    // When non-zero, Fetch() stops reading and reports HTTP_INTERRUPTED.
    static int TerminateNow;

    THttpFetcher()
        : THttpParser<TCheck>()
        , TAlloc()
        , TWriter()
        , TAgent()
    {
    }

    virtual ~THttpFetcher() {
    }

    // Sends a request for `path` and reads the response, feeding it to the parser.
    //
    // header       - receives parse results; HTTP-level failures go to header->error.
    // path         - request target; when it starts with '/', header->base is built
    //                as scheme + "://" + host + path, otherwise path is used as is.
    // headers      - extra header lines, passed through to TAgent::RequestGet.
    // persistent   - passed through to TAgent::RequestGet.
    // head_request - passed to both the parser and TAgent::RequestGet.
    //
    // Returns 0, EIO (TWriter::Write failed) or ENOMEM (TAlloc::Grab failed).
    // Request and protocol errors return 0 with header->error set.
    // errno is overwritten with the saved read error (or 0) before returning.
    int Fetch(THttpHeader* header, const char* path, const char* const* headers, int persistent, bool head_request = false) {
        int ret = 0;
        int fetcherr = 0;

        THttpParser<TCheck>::Init(header, head_request);
        const char* scheme = HttpUrlSchemeKindToString((THttpURL::TSchemeKind)TAgent::GetScheme());
        size_t schemelen = strlen(scheme);
        if (*path == '/') {
            // NOTE(review): pHostBeg/pHostEnd are read before RequestGet() has
            // verified that a host was set - confirm callers always call SetHost first.
            header->base = TStringBuf(scheme, schemelen);
            header->base += TStringBuf("://", 3);
            header->base += TStringBuf(TAgent::pHostBeg, TAgent::pHostEnd - TAgent::pHostBeg);
            header->base += path;
        } else {
            if (strlen(path) >= FETCHER_URL_MAX) {
                header->error = HTTP_URL_TOO_LARGE;
                return 0;
            }
            header->base = path;
        }

        if ((ret = TAgent::RequestGet(path, headers, persistent, head_request))) {
            header->error = (i16)ret;
            return 0;
        }

        bool inheader = 1;
        // buf    - start of the current buffer, bufptr - write position inside it,
        // buflen - its size, buffree - bytes still free, bufsize - total bytes received.
        void *bufptr = nullptr, *buf = nullptr, *parsebuf = nullptr;
        ssize_t got;
        size_t buffree = 0, bufsize = 0, buflen = 0;
        size_t maxsize = TCheck::GetMaxHeaderSize();
        do {
            if (buffree < TCP_MIN) {
                // Hand the filled part of the old buffer over before grabbing a new one.
                if (buf) {
                    TAlloc::Shrink(buf, buflen - buffree);
                    if (TWriter::Write(buf, buflen - buffree) < 0) {
                        buf = nullptr; // already shrunk; skip the final Shrink/Write below
                        ret = EIO;
                        break;
                    }
                }
                if (!(buf = TAlloc::Grab(TCP_MIN, &buflen))) {
                    ret = ENOMEM;
                    break;
                }
                bufptr = buf;
                buffree = buflen;
            }
            if ((got = TAgent::read(bufptr, buffree)) < 0) {
                // Save errno now; later calls may overwrite it.
                fetcherr = errno;
                if (errno == EINTR)
                    header->error = HTTP_INTERRUPTED;
                else if (errno == ETIMEDOUT)
                    header->error = HTTP_TIMEDOUT_WHILE_BYTES_RECEIVING;
                else
                    header->error = HTTP_CONNECTION_LOST;

                break;
            }

            parsebuf = bufptr;
            bufptr = (char*)bufptr + got;
            bufsize += got;
            buffree -= got;

            THttpParser<TCheck>::Parse(parsebuf, got);

            if (header->error)
                break; // stop on ANY error; otherwise the stream position would be unpredictable until the size limit is reached

            // First pass after the parser left the header state: run the header checks once.
            if (inheader && THttpParser<TCheck>::GetState() != THttpParser<TCheck>::hp_in_header) {
                inheader = 0;
                if (TCheck::Check(header))
                    break;
                if (header->header_size > (long)maxsize) {
                    header->error = HTTP_HEADER_TOO_LARGE;
                    break;
                }
            }
            if (!inheader) {
                // From here on the limit applies to the body.
                maxsize = TCheck::GetMaxBodySize(header);
            }
            if (header->http_status >= HTTP_EXTENDED)
                break;
            if (bufsize > maxsize) {
                header->error = inheader ? HTTP_HEADER_TOO_LARGE : HTTP_BODY_TOO_LARGE;
                break;
            }
            if (TerminateNow) {
                header->error = HTTP_INTERRUPTED;
                break;
            }
        } while (THttpParser<TCheck>::GetState() > THttpParser<TCheck>::hp_eof);

        // Non-positive correction applied to the final Shrink size.
        // Presumably it trims bytes received past the end of the message - TODO confirm.
        i64 Adjustment = 0;
        if (!header->error) {
            if (header->transfer_chunked) {
                Adjustment = header->header_size + header->entity_size - bufsize - 1;
            } else if (header->content_length >= 0) {
                Adjustment = header->header_size + header->content_length - bufsize;
            }
            if (Adjustment > 0)
                Adjustment = 0;
        }

        if (buf) {
            TAlloc::Shrink(buf, buflen - buffree + Adjustment);

            // NOTE(review): Write() gets the unadjusted length while Shrink() gets
            // the adjusted one - confirm this asymmetry is intended.
            if (TWriter::Write(buf, buflen - buffree) < 0)
                ret = EIO;
        }
        TCheck::CheckEndDoc(header);
        if (ret || header->error || header->http_status >= HTTP_EXTENDED || header->connection_closed) {
            TAgent::Disconnect();
            if (!fetcherr)
                fetcherr = errno;
        }
        errno = fetcherr;
        return ret;
    }
};

// Definition of the static stop flag; zero means "keep running".
template <typename TAlloc, typename TCheck, typename TWriter, typename TAgent>
int THttpFetcher<TAlloc, TCheck, TWriter, TAgent>::TerminateNow = 0;
// Incremental parser for an HTTP request or response header block.
// The state machine itself (execute/init) is generated from httpfsm.rl6.
//
// Usage: call one of the Init() overloads with the header structure to fill,
// then feed input with Execute() until it returns something other than NeedMore.
struct THttpHeaderParser {
    // Result codes of Execute(); they mirror the values returned by the
    // generated execute() and by the grammar actions.
    static constexpr int ErrFirstlineTypeMismatch = -3; // status line seen while parsing a request, or request line while parsing a response
    static constexpr int ErrHeader = -2;                // a grammar action rejected a header
    static constexpr int Err = -1;                      // the state machine reached its error state
    static constexpr int Final = 0;
    static constexpr int NeedMore = 1;                  // input consumed, header not finished yet
    static constexpr int Accepted = 2;                  // end of header reached; lastchar is set

    // Feeds the next `len` bytes of input. The length is narrowed to int.
    int Execute(const void* inBuf, size_t len) {
        return execute((unsigned char*)inBuf, (int)len);
    }

    int Execute(TStringBuf str) {
        return Execute(str.data(), str.size());
    }

    // Prepares for parsing a response into `h`; also resets `h` and the
    // hreflang output cursor.
    int Init(THttpHeader* h) {
        int ret = Init((THttpBaseHeader*)(h));
        hd = h;
        hd->Init();
        hreflangpos = hd->hreflangs;
        hreflangspace = HREFLANG_MAX;
        return ret;
    }

    // Same as the response variant, additionally collecting authentication data.
    int Init(THttpAuthHeader* h) {
        int ret = Init((THttpHeader*)(h));
        auth_hd = h;
        return ret;
    }
    // Prepares for parsing a request into `h`; also resets `h`.
    int Init(THttpRequestHeader* h) {
        int ret = Init((THttpBaseHeader*)(h));
        request_hd = h;
        request_hd->Init();
        return ret;
    }

    THttpHeader* hd;   // response target; null when parsing a request
    long I;            // scratch integer used by grammar actions
    int Dc;
    TDateTimeFieldsDeprecated DateTimeFields;
    char buf[FETCHER_URL_MAX]; // scratch buffer for the value being collected
    size_t buflen;             // number of bytes currently in buf
    char* lastchar;            // set by the `accepted` action to the last consumed input byte

    const unsigned char* langstart; // start of the current hreflang language token in the input
    size_t langlen;

    char* hreflangpos;     // write cursor inside hd->hreflangs
    size_t hreflangspace;  // bytes still available there

    bool AcceptingXRobots;

    THttpAuthHeader* auth_hd;       // null unless Init(THttpAuthHeader*) was used
    THttpRequestHeader* request_hd; // null unless Init(THttpRequestHeader*) was used

private:
    THttpBaseHeader* base_hd;
    int cs; // current state of the generated machine

private:
    // Common part of all Init() overloads: clears every target pointer and
    // resets the state machine. Always returns 0.
    int Init(THttpBaseHeader* header) {
        base_hd = header;
        auth_hd = nullptr;
        request_hd = nullptr;
        hd = nullptr;
        init();
        return 0;
    }

    int execute(unsigned char* inBuf, int len);
    void init();
};

// Incremental parser for one chunk-size line of a chunked transfer encoding.
// The state machine itself (execute/init) is generated from httpfsm.rl6.
struct THttpChunkParser {
    // Feeds the next `len` bytes of input and returns the machine's result code.
    int Execute(const void* inBuf, int len) {
        return execute((unsigned char*)inBuf, len);
    }

    // Resets the parser. Always returns 0.
    int Init() {
        init();
        return 0;
    }

    int chunk_length; // size of the parsed chunk; reset to 0 by init()
    char* lastchar;   // set by the `accepted` action to the last consumed input byte
    long I;
    int Dc;
    i64 cnt64;        // accumulator for the hexadecimal chunk size

private:
    int cs; // current state of the generated machine
    int execute(unsigned char* inBuf, int len);
    void init();
};
// Numeric value of a hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F').
static inline int X(unsigned char c) {
    if (c < 'A') {
        return c - '0';
    }
    // Clearing bit 0x20 folds lower case onto upper case.
    return (c & 0xdf) - 'A' + 10;
}

// Marks a header field as "being parsed again" while remembering what it held:
//   -1 (unset)  -> -3
//   v >= 0      -> -4 - v
//   anything else (including the conflict marker -2) -> -2
template <typename x>
static inline void guard(x &val) {
    if (val >= -1) {
        val = -4 - val;
    } else {
        val = -2;
    }
}

// Completes what guard() started: stores `cnt` if the field was unset before
// or held the same value; otherwise stores the conflict marker -2.
template <typename x>
static inline void setguarded(x &val, long cnt) {
    const bool wasUnset = (val == -3);
    const bool sameAsBefore = (cnt == -4 - val);
    if (wasUnset || sameAsBefore) {
        val = cnt;
    } else {
        val = -2;
    }
}
ws; +separator = [()<>@,;:\\"/\[\]?={}]; +token_char = [!-~] - separator; # http tokens chars +url_char = [!-~] - ["<>\[\]\\^`{}|]; # uric chars +text_char = ws | 33..126 | 128..255; +any_text_char = any - [\r\n]; + +lws = lw*; +eoh = lws eol; +token = token_char+; +ex_token = (token_char | ws)* token_char; +text = (text_char | lw)*; +any_text = (any_text_char | lw)*; +def = lws ':' lws; + +action clear_buf { buflen = 0; } +action update_buf { if (buflen < sizeof(buf)) buf[buflen++] = fc; } + +################################################### +############ response status line ################# +action set_minor { base_hd->http_minor = I; } +action set_status { + if (hd) { + hd->http_status = I; + } + if (request_hd) { + return -3; + } +} + +status_code = int3; +http_major = int; +http_minor = int; +reason_phrase = ws+ text_char*; +http_version = "http/"i http_major '.' http_minor %set_minor; +response_status_line = http_version ws+ status_code reason_phrase? eol %set_status; + +############ request status line ################# +action set_request_uri { + if (request_hd && buflen < FETCHER_URL_MAX) { + if (!request_hd->request_uri.empty()) { + return -2; + } + request_hd->request_uri =TStringBuf(buf, buflen); + } +} +action set_http_method { + if (request_hd) { + request_hd->http_method = I; + } + if (hd) { + return -3; + } +} + +http_extension_method = token; +http_method = ("options"i %{c(0)} @1 + | "get"i %{c(1)} @1 + | "head"i %{c(2)} @1 + | "post"i %{c(3)} @1 + | "put"i %{c(4)} @1 + | "delete"i %{c(5)} @1 + | "trace"i %{c(6)} @1 + | "connect"i %{c(7)} @1 + | http_extension_method %{c(8)} $0) + %set_http_method; +request_uri = (token_char | separator)+ >clear_buf $update_buf + %set_request_uri; +request_status_line = http_method ws+ request_uri ws+ http_version eoh; + +################# connection ###################### +action beg_connection { guard(base_hd->connection_closed); I = -1; } +action set_connection { setguarded(base_hd->connection_closed, I); } 
+ +c_token = "close"i %{m(1)} + | "keep-alive"i %{m(0)}; +c_tokenlist = c_token (lws ',' lws c_token)?; +connection = "connection"i def %beg_connection c_tokenlist eoh %set_connection; + +################# content-encoding ################ +action beg_content_encoding { I = HTTP_COMPRESSION_ERROR; } +action set_content_encoding { base_hd->compression_method = + ((base_hd->compression_method == HTTP_COMPRESSION_UNSET || + base_hd->compression_method == I) ? + I : (int)HTTP_COMPRESSION_ERROR); } + +ce_tokenlist = "identity"i %{c(HTTP_COMPRESSION_IDENTITY)} + | "gzip"i %{c(HTTP_COMPRESSION_GZIP)} + | "x-gzip"i %{c(HTTP_COMPRESSION_GZIP)} + | "deflate"i %{c(HTTP_COMPRESSION_DEFLATE)} + | "compress"i %{c(HTTP_COMPRESSION_COMPRESS)} + | "x-compress"i %{c(HTTP_COMPRESSION_COMPRESS)}; +content_encoding = "content-encoding"i def %beg_content_encoding ce_tokenlist eoh %set_content_encoding; + +################# transfer-encoding ############### +action beg_encoding { guard(base_hd->transfer_chunked); } +action set_encoding { setguarded(base_hd->transfer_chunked, I); } + +e_tokenlist = "identity"i %{c(0)} + | "chunked"i %{c(1)}; +transfer_encoding = "transfer-encoding"i def %beg_encoding e_tokenlist eoh %set_encoding; + +################# content-length ################## +action beg_content_length { guard(base_hd->content_length); } +action set_content_length { setguarded(base_hd->content_length, I); } + +content_length = "content-length"i def %beg_content_length int eoh %set_content_length; + +################# content-range ################### +action beg_content_range_start { guard(base_hd->content_range_start); I = -1; } +action set_content_range_start { setguarded(base_hd->content_range_start, I); } +action beg_content_range_end { guard(base_hd->content_range_end); I = -1; } +action set_content_range_end { setguarded(base_hd->content_range_end, I); } +action beg_content_range_el { guard(base_hd->content_range_entity_length); I = -1; } +action set_content_range_el { 
setguarded(base_hd->content_range_entity_length, I); } + +content_range = "content-range"i def "bytes"i sp %beg_content_range_start int '-' %set_content_range_start + %beg_content_range_end int '/' %set_content_range_end + %beg_content_range_el int eoh %set_content_range_el; + +################# accept-ranges ################### +action beg_accept_ranges { + if (hd) { + guard(hd->accept_ranges); + I = -1; + } +} +action set_accept_ranges { if (hd) setguarded(hd->accept_ranges, I); } + +ar_tokenlist = "bytes"i %{c(1)} + | "none"i %{c(0)}; +accept_ranges = "accept-ranges"i def %beg_accept_ranges ar_tokenlist eoh %set_accept_ranges; + +################# content-type #################### +action beg_mime { guard(base_hd->mime_type); } +action set_mime { setguarded(base_hd->mime_type, I); } +action set_charset { + if (buflen < FETCHER_URL_MAX) { + buf[buflen++] = 0; + base_hd->charset = EncodingHintByName((const char*)buf); + } +} + +mime_type = "text/plain"i %{c(MIME_TEXT)} + | "text/html"i %{c(MIME_HTML)} + | "application/pdf"i %{c(MIME_PDF)} + | "application/rtf"i %{c(MIME_RTF)} + | "text/rtf"i %{c(MIME_RTF)} + | "application/msword"i %{c(MIME_DOC)} + | "audio/mpeg"i %{c(MIME_MPEG)} + | "text/xml"i %{c(MIME_XML)} + | "application/xml"i %{c(MIME_XML)} + | "application/rss+xml"i %{c(MIME_RSS)} + | "application/rdf+xml"i %{c(MIME_RSS)} + | "application/atom+xml"i %{c(MIME_RSS)} + | "text/vnd.wap.wml"i %{c(MIME_WML)} + | "application/x-shockwave-flash"i %{c(MIME_SWF)} + | "application/vnd.ms-excel"i %{c(MIME_XLS)} + | "application/vnd.ms-powerpoint"i %{c(MIME_PPT)} + | "image/jpeg"i %{c(MIME_IMAGE_JPG)} + | "image/jpg"i %{c(MIME_IMAGE_JPG)} + | "image/pjpeg"i %{c(MIME_IMAGE_PJPG)} + | "image/png"i %{c(MIME_IMAGE_PNG)} + | "image/gif"i %{c(MIME_IMAGE_GIF)} + | "application/xhtml+xml"i %{c(MIME_XHTMLXML)} + | "application/vnd.openxmlformats-officedocument.wordprocessingml.document"i %{c(MIME_DOCX)} + | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"i 
%{c(MIME_XLSX)} + | "application/vnd.openxmlformats-officedocument.presentationml.presentation"i %{c(MIME_PPTX)} + | "application/vnd.oasis.opendocument.text"i %{c(MIME_ODT)} + | "application/vnd.oasis.opendocument.presentation"i %{c(MIME_ODP)} + | "application/vnd.oasis.opendocument.spreadsheet"i %{c(MIME_ODS)} + | "application/vnd.oasis.opendocument.graphics"i %{c(MIME_ODG)} + | "image/x-ms-bmp"i %{c(MIME_IMAGE_BMP)} + | "image/bmp"i %{c(MIME_IMAGE_BMP)} + | "audio/x-wav"i %{c(MIME_WAV)} + | ( "application/x-tar"i | "application/x-ustar"i | "application/x-gtar"i | "application/zip"i | "application/x-archive"i + | "application/x-bzip2"i | "application/x-rar"i ) %{c(MIME_ARCHIVE)} + | "application/x-dosexec"i %{c(MIME_EXE)} + | "application/x-gzip"i %{c(MIME_GZIP)} + | "application/json"i %{c(MIME_JSON)} + | ("application/javascript"i | "text/javascript"i) %{c(MIME_JAVASCRIPT)} + | "application/vnd.android.package-archive"i %{c(MIME_APK)} + | ("image/x-icon"i | "image/vnd.microsoft.icon"i) %{c(MIME_IMAGE_ICON)} + ; + + +charset_name = token_char+ >clear_buf $update_buf; +mime_param = "charset"i ws* '=' ws* '"'? charset_name '"'? %set_charset @2 + | token ws* '=' ws* '"'? token '"'? 
@1 + | text $0; +mime_parms = (lws ';' lws mime_param)*; +content_type = "content-type"i def %beg_mime mime_type mime_parms eoh %set_mime; + +################# last modified ################### +action beg_modtime { guard(base_hd->http_time); } +action set_modtime { + setguarded(base_hd->http_time, DateTimeFields.ToTimeT(-1)); +} + +last_modified = "last-modified"i def %beg_modtime http_date eoh %set_modtime; + +################# location ######################## +action set_location { + while (buflen > 0 && (buf[buflen - 1] == ' ' || buf[buflen - 1] == '\t')) { + buflen --; + } + if (hd && buflen < FETCHER_URL_MAX) { + hd->location = TStringBuf(buf, buflen); + } +} + +action set_status_303{ if (hd) hd->http_status = 303; } + +url = url_char+ >clear_buf $update_buf; +loc_url = any_text_char+ >clear_buf $update_buf; +location = "location"i def loc_url eoh %set_location; +refresh = "refresh"i def int ';' lws "url="i loc_url eoh %set_location; + +################# x-robots-tag ################ +action set_x_robots { + if (hd && AcceptingXRobots) { + if (I > 0) + hd->x_robots_tag |= I; + + int pos = (I > 0 ? I : -I); + for (size_t i = 0; i < 5; ++i) + if (abs(pos) & (1 << i)) // permissive flags take priority + hd->x_robots_state[i] = (I < 0) ? '1' : (hd->x_robots_state[i] != '1') ? 
'0' : '1'; + } +} + +action accept_x_robots { + AcceptingXRobots = (bool)I; +} + +x_robots_directive = "none"i %{c(3)} | "all"i %{c(-3)} + | "noindex"i %{c(1)} | "index"i %{c(-1)} + | "nofollow"i %{c(2)} | "follow"i %{c(-2)} + | "noarchive"i %{c(4)} | "archive"i %{c(-4)} + | "noyaca"i %{c(16)} + | "noodp"i %{c(8)}; + +any_value = (any_text_char - [, \t])+ (lws (any_text_char - [, \t])+)*; +any_key = (any_text_char - [:, \t])+ (lws (any_text_char - [:, \t])+)*; + +unavailable_after_directive = "unavailable_after"i def any_value; + +yandex_robot = "yandex"i | "yandexbot"i; +other_robot = any_key - "unavailable_after"i - yandex_robot; +robot_specifier = yandex_robot %{c(1)} | other_robot %{c(0)}; + +x_robots_value = (robot_specifier def %accept_x_robots)? (unavailable_after_directive | (x_robots_directive %set_x_robots) | any_value? ); + +x_robots_tag = "x-robots-tag"i def >{ AcceptingXRobots = true; } x_robots_value (lws ',' lws x_robots_value)* eoh; + +################# rel_canonical ############### +action set_canonical { + if (hd && buflen < FETCHER_URL_MAX) { + hd->rel_canonical = TStringBuf(buf, buflen); + } +} + +rel_canonical = "link"i def '<' url ">;"i lws "rel"i lws '=' lws "\"canonical\"" eoh %set_canonical; +################# hreflang ############### +action set_hreflang { + bool first = (hreflangpos == hd->hreflangs); + size_t len2 = (first ? 
0 : 1) + langlen + 1 + buflen; + if (langlen && len2 < hreflangspace) { + if (!first) { + *(hreflangpos++) = '\t'; + } + memcpy(hreflangpos, langstart, langlen); + hreflangpos += langlen; + *(hreflangpos++) = ' '; + memcpy(hreflangpos, buf, buflen); + hreflangpos += buflen; + *(hreflangpos) = 0; + hreflangspace -= len2; + } +} + +action start_lang { + langstart = fpc; + langlen = 0; +} +action end_lang { + langlen = fpc - langstart; +} +hreflang_token = (token_char - ['])+; +quote = ['"]?; #" +lang = hreflang_token >start_lang %end_lang; + +hreflang = "link"i def '<' url '>' lws ";" lws + ( ( "rel"i lws '=' lws quote "alternate" quote lws ';' lws "hreflang"i lws '=' lws quote lang quote ) + | ( "hreflang"i lws '=' lws quote lang quote lws ';' lws "rel"i lws '=' lws quote "alternate" quote ) ) + eoh %set_hreflang; +################# squid_error ################# +action set_squid_error { + hd->squid_error = 1; +} + +squid_error = "X-Yandex-Squid-Error"i def any_text eoh %set_squid_error; + +################# auth ######################## +action init_auth { + if (auth_hd) + auth_hd->use_auth=true; +} + +action update_auth_buf + { if (auth_hd && buflen < sizeof(buf)) buf[buflen++] = *fpc; } + +quoted_str = /"/ (text_char - /"/)* /"/ >2; +auth_quoted_str = ( /"/ ( ( text_char - /"/ )* >clear_buf $update_auth_buf ) /"/ ) > 2; + +# do not support auth-int, too heavy procedure + +qop_auth_option = "auth"i @1 %{if(auth_hd) auth_hd->qop_auth = true; }; + +qop_option = ( qop_auth_option @1 ) | (( token-"auth"i) $0 ); + +auth_good_param = ( "nonce"i /=/ auth_quoted_str ) + %{if (auth_hd && buflen < FETCHER_URL_MAX-1) { + buf[buflen++] = 0; + auth_hd->nonce = strdup((const char*)buf); + }} + | ( "realm"i /=/ auth_quoted_str ) + %{if (auth_hd && buflen < FETCHER_URL_MAX-1) { + buf[buflen++] = 0; + auth_hd->realm = strdup((const char*)buf); + }} + | ( "opaque"i /=/ auth_quoted_str ) + %{if (auth_hd && buflen < FETCHER_URL_MAX-1) { + buf[buflen++] = 0; + auth_hd->opaque = 
strdup((const char*)buf); + }} + | "stale"i /=/ "true"i + %{if (auth_hd) auth_hd->stale = true; } + | "algorithm"i /=/ "md5"i /-/ "sess"i + %{if (auth_hd) auth_hd->algorithm = 1; } + | ( "qop"i /="/ qop_option (ws* "," ws* qop_option)* /"/); + +auth_param = auth_good_param @1 | + ( (token - ( "nonce"i | "opaque"i | "realm"i | "qop"i ) ) + /=/ (token | quoted_str ) ) $0; + +auth_params = auth_param ( ws* /,/ ws* auth_param )*; + +digest_challenge = ("digest"i %init_auth ws+ auth_params) | + ((token-"digest"i) text); + +auth = "www-authenticate"i def digest_challenge eoh; + +###################### host ####################### +action set_host { + if (request_hd && buflen < HOST_MAX) { + buf[buflen++] = 0; + if (request_hd->host[0] != 0) { + return -2; + } + memcpy(request_hd->host, buf, buflen); + } +} + +host = (url_char | [:])* >clear_buf $update_buf; +host_header = "host"i def host eoh %set_host; + +###################### from ####################### +action set_from { + if (request_hd && buflen < MAXWORD_LEN) { + buf[buflen++] = 0; + if (request_hd->from[0] != 0) { + return -2; + } + memcpy(request_hd->from, buf, buflen); + } +} + +mailbox = (token "@" token) >clear_buf $update_buf; +from_header = "from"i def mailbox eoh %set_from; + +################### user-agent #################### +action set_user_agent { + if (request_hd && buflen < MAXWORD_LEN) { + buf[buflen++] = 0; + if (request_hd->user_agent[0] != 0) { + return -2; + } + memcpy(request_hd->user_agent, buf, buflen); + } +} + +user_agent = any_text_char* >clear_buf $update_buf; +user_agent_header = "user-agent"i def user_agent eoh %set_user_agent; + +############### x-yandex-langregion ################ +action set_langregion { + if (request_hd && buflen < MAX_LANGREGION_LEN) { + buf[buflen++] = 0; + if (request_hd->x_yandex_langregion[0] != 0) { + return -2; + } + memcpy(request_hd->x_yandex_langregion, buf, buflen); + } +} + +langregion = any_text_char* >clear_buf $update_buf; +langregion_header = 
"x-yandex-langregion"i def langregion eoh %set_langregion; + +############### x-yandex-sourcename ################ +action set_sourcename { + if (request_hd && buflen < MAXWORD_LEN) { + buf[buflen++] = 0; + if (request_hd->x_yandex_sourcename[0] != 0) { + return -2; + } + memcpy(request_hd->x_yandex_sourcename, buf, buflen); + } +} + +sourcename = any_text_char* >clear_buf $update_buf; +sourcename_header = "x-yandex-sourcename"i def sourcename eoh %set_sourcename; + +############### x-yandex-requesttype ############### +action set_requesttype { + if (request_hd && buflen < MAXWORD_LEN) { + buf[buflen++] = 0; + if (request_hd->x_yandex_requesttype[0] != 0) { + return -2; + } + memcpy(request_hd->x_yandex_requesttype, buf, buflen); + } +} + +requesttype = any_text_char* >clear_buf $update_buf; +requesttype_header = "x-yandex-requesttype"i def requesttype eoh %set_requesttype; + +################ x-yandex-fetchoptions ############### +action set_fetchoptions { + if (request_hd && buflen < MAXWORD_LEN) { + buf[buflen++] = 0; + if (request_hd->x_yandex_fetchoptions[0] != 0) { + return -2; + } + memcpy(request_hd->x_yandex_fetchoptions, buf, buflen); + } +} + +fetchoptions = any_text_char* >clear_buf $update_buf; +fetchoptions_header = "x-yandex-fetchoptions"i def fetchoptions eoh %set_fetchoptions; + +################ if-modified-since ################ +action set_if_modified_since { + if (request_hd) { + request_hd->if_modified_since = DateTimeFields.ToTimeT(-1); + } +} + +if_modified_since = "if-modified-since"i def http_date eoh + %set_if_modified_since; + +################ retry-after ################ +action set_retry_after_withdate { + if (hd) { + hd->retry_after = DateTimeFields.ToTimeT(-1); + } +} + +action set_retry_after_withdelta { + if (hd) { + hd->retry_after = TInstant::Now().Seconds() + I; + } +} + +retry_after_withdate = "retry-after"i def http_date eoh + %set_retry_after_withdate; +retry_after_withdelta = "retry-after"i def int eoh + 
%set_retry_after_withdelta; + +############## request-cache-control ############## +action SETMAXAGE { if (request_hd) request_hd->max_age = I; } + +delta_seconds = int; +cache_extension = token ("=" (token | quoted_str))?; +request_cache_directive = "no-cache"i + | "no-store"i + | ("max-age"i "=" delta_seconds %SETMAXAGE) + | ("max-stale"i ("=" delta_seconds)?) + | ("min-fresh"i "=" delta_seconds) + | "non-transform"i + | "only-if-cached"i + | cache_extension; +request_cache_control = "cache-control"i def request_cache_directive eoh; + +############ x-yandex-response-timeout ############# + +action set_response_timeout { + if (request_hd) { + request_hd->x_yandex_response_timeout = I; + } +} + +response_timeout = "x-yandex-response-timeout"i def int eoh + %set_response_timeout; + +############ x-yandex-request-priority ############# + +action set_request_priority { + if (request_hd) { + request_hd->x_yandex_request_priority = I; + } +} + +request_priority = "x-yandex-request-priority"i def int eoh + %set_request_priority; + +################# message header ################## +other_header = ( ex_token - "www-authenticate"i ) def any_text eoh; +message_header = other_header $0 + | connection @1 + | content_encoding @1 + | transfer_encoding @1 + | content_length @1 + | content_type @1 + | last_modified @1 + | refresh @1 + | content_range @1; +response_header = message_header $0 + | auth @1 + | accept_ranges @1 + | location @1 + | x_robots_tag @1 + | rel_canonical @1 + | hreflang @1 + | squid_error @1 + | retry_after_withdate @1 + | retry_after_withdelta @1; +request_header = message_header $0 + | from_header @1 + | host_header @1 + | user_agent_header @1 + | sourcename_header @1 + | requesttype_header @1 + | langregion_header @1 + | fetchoptions_header @1 + | if_modified_since @1 + | request_cache_control @1 + | response_timeout @1 + | request_priority @1; + +################# main ############################ +action accepted { lastchar = (char*)fpc; return 2; } + 
// Runs the generated header state machine over [inBuf, inBuf + len).
// Returns -1 if the machine is in its error state, 0 if it stopped exactly in
// the first final state, and 1 otherwise. Grammar actions may return other
// values directly from inside the generated code.
int THttpHeaderParser::execute(unsigned char *inBuf, int len) {
    // `p` and `pe` are the cursor names used by the code Ragel emits below.
    const unsigned char *p = inBuf;
    const unsigned char *pe = inBuf + len;
    %% write exec;
    if (cs == http_header_parser_error) {
        return -1;
    }
    return (cs == http_header_parser_first_final) ? 0 : 1;
}
// Runs the generated chunk state machine over [inBuf, inBuf + len).
// Returns -1 if the machine is in its error state, 0 if it stopped exactly in
// the first final state, and 1 otherwise. Grammar actions may return other
// values directly from inside the generated code.
int THttpChunkParser::execute(unsigned char *inBuf, int len) {
    // `p` and `pe` are the cursor names used by the code Ragel emits below.
    const unsigned char *p = inBuf;
    const unsigned char *pe = inBuf + len;
    %% write exec;
    if (cs == http_chunk_parser_error) {
        return -1;
    }
    return (cs == http_chunk_parser_first_final) ? 0 : 1;
}
+public: + void TestRequestHeader(); + void TestSplitRequestHeader(); + void TestTrailingData(); + void TestProxyRequestHeader(); + void TestIncorrectRequestHeader(); + void TestLastModified(); + void TestLastModifiedCorrupted(); + void TestResponseHeaderOnRequest(); + void TestRequestHeaderOnResponse(); + void TestXRobotsTagUnknownTags(); + void TestXRobotsTagMyBot(); + void TestXRobotsTagOtherBot(); + void TestXRobotsTagUnavailableAfterAware(); + void TestXRobotsTagUnavailableAfterWorks(); + void TestXRobotsTagOverridePriority(); + void TestXRobotsTagDoesNotBreakCharset(); + void TestXRobotsTagAllowsMultiline(); + void TestRelCanonical(); + void TestHreflang(); + void TestHreflangOnLongInput(); + void TestMimeType(); + void TestRepeatedContentEncoding(); +}; + +void THttpHeaderParserTestSuite::TestStart() { + httpHeaderParser.Reset(new THttpHeaderParser()); +} + +void THttpHeaderParserTestSuite::TestFinish() { + httpHeaderParser.Reset(); +} + +void THttpHeaderParserTestSuite::TestRequestHeader() { + TestStart(); + THttpRequestHeader httpRequestHeader; + httpHeaderParser->Init(&httpRequestHeader); + const char* request = "GET /search?q=hi HTTP/1.1\r\n" + "Host: www.google.ru:8080\r\n\r\n"; + i32 result = httpHeaderParser->Execute(request, strlen(request)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpRequestHeader.http_method, HTTP_METHOD_GET); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.host, "www.google.ru:8080"), 0); + UNIT_ASSERT_EQUAL(httpRequestHeader.request_uri, "/search?q=hi"); + UNIT_ASSERT_EQUAL(httpRequestHeader.GetUrl(), "http://www.google.ru:8080/search?q=hi"); + UNIT_ASSERT_EQUAL(httpHeaderParser->lastchar - request + 1, + (i32)strlen(request)); + UNIT_ASSERT_EQUAL(httpRequestHeader.x_yandex_response_timeout, + DEFAULT_RESPONSE_TIMEOUT); + UNIT_ASSERT_EQUAL(httpRequestHeader.x_yandex_request_priority, + DEFAULT_REQUEST_PRIORITY); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_sourcename, ""), 0); + 
UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_requesttype, ""), 0); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_fetchoptions, ""), 0); + TestFinish(); + UNIT_ASSERT_EQUAL(httpRequestHeader.max_age, DEFAULT_MAX_AGE); +} + +void THttpHeaderParserTestSuite::TestSplitRequestHeader() { + TestStart(); + const char* request = + "GET /search?q=hi HTTP/1.1\r\n" + "Host: www.google.ru:8080 \r\n" + "\r\n"; + const size_t rlen = strlen(request); + + for (size_t n1 = 0; n1 < rlen; n1++) { + for (size_t n2 = n1; n2 < rlen; n2++) { + TString s1{request, 0, n1}; + TString s2{request, n1, n2 - n1}; + TString s3{request, n2, rlen - n2}; + UNIT_ASSERT_EQUAL(s1 + s2 + s3, request); + + THttpRequestHeader httpRequestHeader; + UNIT_ASSERT(0 == httpHeaderParser->Init(&httpRequestHeader)); + i32 result = httpHeaderParser->Execute(s1); + UNIT_ASSERT_EQUAL(result, 1); + result = httpHeaderParser->Execute(s2); + UNIT_ASSERT_EQUAL(result, 1); + result = httpHeaderParser->Execute(s3); + UNIT_ASSERT_EQUAL(result, 2); + + UNIT_ASSERT_EQUAL(httpRequestHeader.http_method, HTTP_METHOD_GET); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.host, "www.google.ru:8080"), 0); + UNIT_ASSERT_EQUAL(httpRequestHeader.request_uri, "/search?q=hi"); + } + } + + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestTrailingData() { + TestStart(); + THttpRequestHeader httpRequestHeader; + UNIT_ASSERT(0 == httpHeaderParser->Init(&httpRequestHeader)); + const char* request = + "GET /search?q=hi HTTP/1.1\r\n" + "Host: www.google.ru:8080\r\n" + "\r\n" + "high.ru"; + i32 result = httpHeaderParser->Execute(request, strlen(request)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpRequestHeader.http_method, HTTP_METHOD_GET); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.host, "www.google.ru:8080"), 0); + UNIT_ASSERT_EQUAL(httpRequestHeader.request_uri, "/search?q=hi"); + UNIT_ASSERT_EQUAL(TString(httpHeaderParser->lastchar + 1), "high.ru"); + UNIT_ASSERT_EQUAL(httpRequestHeader.http_minor, 1); + 
UNIT_ASSERT_EQUAL(httpRequestHeader.transfer_chunked, -1); + UNIT_ASSERT_EQUAL(httpRequestHeader.content_length, -1); + UNIT_ASSERT_EQUAL(httpRequestHeader.connection_closed, -1); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestProxyRequestHeader() { + TestStart(); + THttpRequestHeader httpRequestHeader; + httpHeaderParser->Init(&httpRequestHeader); + const char* request = + "GET http://www.google.ru:8080/search?q=hi HTTP/1.1\r\n" + "X-Yandex-Response-Timeout: 1000\r\n" + "X-Yandex-Request-Priority: 2\r\n" + "X-Yandex-Sourcename: orange\r\n" + "X-Yandex-Requesttype: userproxy\r\n" + "X-Yandex-FetchOptions: d;c\r\n" + "Cache-control: max-age=100\r\n" + "If-Modified-Since: Sat, 29 Oct 1994 19:43:31 GMT\r\n" + "User-Agent: Yandex/1.01.001 (compatible; Win16; I)\r\n" + "From: webadmin@yandex.ru\r\n\r\n"; + i32 result = httpHeaderParser->Execute(request, strlen(request)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpRequestHeader.http_method, HTTP_METHOD_GET); + UNIT_ASSERT_EQUAL(httpRequestHeader.x_yandex_response_timeout, 1000); + UNIT_ASSERT_EQUAL(httpRequestHeader.x_yandex_request_priority, 2); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_sourcename, "orange"), 0); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_requesttype, "userproxy"), 0); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.x_yandex_fetchoptions, "d;c"), 0); + UNIT_ASSERT_EQUAL(httpRequestHeader.max_age, 100); + UNIT_ASSERT_VALUES_EQUAL(httpRequestHeader.if_modified_since, + TInstant::ParseIso8601Deprecated("1994-10-29 19:43:31Z").TimeT()); + UNIT_ASSERT_EQUAL(httpRequestHeader.request_uri, + "http://www.google.ru:8080/search?q=hi"); + UNIT_ASSERT(httpRequestHeader.GetUrl() == + "http://www.google.ru:8080/search?q=hi"); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.host, ""), 0); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.from, "webadmin@yandex.ru"), 0); + UNIT_ASSERT_EQUAL(strcmp(httpRequestHeader.user_agent, + "Yandex/1.01.001 (compatible; Win16; I)"), + 0); + 
UNIT_ASSERT_EQUAL(httpHeaderParser->lastchar - request + 1, + (i32)strlen(request)); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestIncorrectRequestHeader() { + TestStart(); + THttpRequestHeader httpRequestHeader; + httpHeaderParser->Init(&httpRequestHeader); + const char* request = "GET /search?q=hi HTP/1.1\r\n" + "Host: www.google.ru:8080\r\n\r\n"; + i32 result = httpHeaderParser->Execute(request, strlen(request)); + UNIT_ASSERT(result != 2); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestLastModified() { + TestStart(); + THttpHeader h; + UNIT_ASSERT(0 == httpHeaderParser->Init(&h)); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "Last-Modified: Thu, 13 Aug 2009 14:27:08 GMT\r\n\r\n"; + UNIT_ASSERT(2 == httpHeaderParser->Execute(headers, strlen(headers))); + UNIT_ASSERT_VALUES_EQUAL( + TInstant::ParseIso8601Deprecated("2009-08-13 14:27:08Z").TimeT(), + h.http_time); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestLastModifiedCorrupted() { + TestStart(); + THttpHeader h; + UNIT_ASSERT(0 == httpHeaderParser->Init(&h)); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "Last-Modified: Thu, 13 Aug 2009 14:\r\n\r\n"; + UNIT_ASSERT(2 == httpHeaderParser->Execute(headers, strlen(headers))); + UNIT_ASSERT(h.http_time < 0); // XXX: don't understand what is the proper value + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestXRobotsTagUnknownTags() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "x-robots-tag: asdfasdf asdf asdf,,, , noindex,noodpXXX , NOFOLLOW ,noodpnofollow\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 3); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "00xxx"); + TestFinish(); +} + +void 
THttpHeaderParserTestSuite::TestXRobotsTagMyBot() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "x-robots-tag: yandex: noindex, nofollow\r\n" + "x-robots-tag: yandexbot: noarchive, noodp\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 15); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "0000x"); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestXRobotsTagOtherBot() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "x-robots-tag: google: noindex, nofollow\r\n" + "x-robots-tag: googlebot: noarchive, noodp\r\n" + "x-robots-tag: !still(-other) bot_: foo, noyaca\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 0); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "xxxxx"); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestXRobotsTagUnavailableAfterAware() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + // проверяем только что unavailable_after ничего не ломает + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "x-robots-tag: unavailable_after: 01 Jan 2999 00:00 UTC, noindex, nofollow\r\n" + "x-robots-tag: yandex: unavailable_after: 01 Jan 2999 00:00 UTC, noarchive, noodp\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 15); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "0000x"); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestXRobotsTagUnavailableAfterWorks() { + TestStart(); + THttpHeader httpHeader; + 
httpHeaderParser->Init(&httpHeader); + // пока не поддерживается + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "x-robots-tag: unavailable_after: 01 Jan 2000 00:00 UTC\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + //UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 1); + //UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "0xxxx"); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestXRobotsTagOverridePriority() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "x-robots-tag: all, none\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "11xxx"); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_tag, 3); // NOTE legacy behavior, should be 0 as `all` overrides + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestXRobotsTagDoesNotBreakCharset() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "X-Robots-Tag: noarchive\r\n" + "Content-Type: application/json; charset=utf-8\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.mime_type, static_cast<ui8>(MIME_JSON)); + UNIT_ASSERT_EQUAL(httpHeader.charset, static_cast<ui8>(CODES_UTF8)); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestXRobotsTagAllowsMultiline() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "X-Robots-Tag\r\n" + " :\r\n" + " unavailable_since\r\n" + " :\r\n" + " ,\r\n" + " unavailable_since\r\n" + " :\r\n" + " 01 Jan 2000\r\n" + " 00:00 UTC\r\n" + " ,\r\n" + " yandexbot\r\n" + " :\r\n" + " noindex\r\n" + " ,\r\n" + " garbage\r\n" + " ,\r\n" 
+ " nofollow\r\n" + " ,\r\n" + " other\r\n" + " bot\r\n" + " :\r\n" + " noarchive\r\n" + " ,\r\n" + "Content-Type: application/json; charset=utf-8\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.x_robots_state, "00xxx"); + UNIT_ASSERT_EQUAL(httpHeader.mime_type, static_cast<ui8>(MIME_JSON)); + UNIT_ASSERT_EQUAL(httpHeader.charset, static_cast<ui8>(CODES_UTF8)); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestHreflang() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "link: <http://www.high.ru/>; rel='alternate'; hreflang='x-default'\r\n" + "link: <http://www.high.ru/en.html> ;rel = 'alternate' ;hreflang = en_GB \r\n" + "link: <http://www.high.ru/ru.html>;hreflang = ru_RU.KOI8-r ;rel = 'alternate' \r\n" + "\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_VALUES_EQUAL(result, 2); + // UNIT_ASSERT_VALUES_EQUAL(strcmp(httpHeader.hreflangs, "x-default http://www.high.ru/;"), 0); + UNIT_ASSERT_VALUES_EQUAL(httpHeader.hreflangs, "x-default http://www.high.ru/\ten_GB http://www.high.ru/en.html\tru_RU.KOI8-r http://www.high.ru/ru.html"); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestHreflangOnLongInput() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + TStringBuf testInput(hreflang_ut_in); + TStringBuf testOut(hreflang_ut_out); + i32 result = httpHeaderParser->Execute(testInput.data(), testInput.size()); + UNIT_ASSERT_VALUES_EQUAL(result, 2); + UNIT_ASSERT_VALUES_EQUAL(httpHeader.hreflangs, testOut); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestRelCanonical() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "Link: <http://yandex.ru>; rel = 
\"canonical\"\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.rel_canonical, "http://yandex.ru"); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestResponseHeaderOnRequest() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* request = "GET /search?q=hi HTP/1.1\r\n" + "Host: www.google.ru:8080\r\n\r\n"; + i32 result = httpHeaderParser->Execute(request, strlen(request)); + UNIT_ASSERT_EQUAL(result, -3); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestRequestHeaderOnResponse() { + TestStart(); + THttpRequestHeader httpRequestHeader; + httpHeaderParser->Init(&httpRequestHeader); + const char* response = "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "Last-Modified: Thu, 13 Aug 2009 14:\r\n\r\n"; + i32 result = httpHeaderParser->Execute(response, strlen(response)); + UNIT_ASSERT_EQUAL(result, -3); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestMimeType() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char* headers = + "HTTP/1.1 200 OK\r\n" + "Content-Type: application/json; charset=utf-8\r\n\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.mime_type, static_cast<ui8>(MIME_JSON)); + UNIT_ASSERT_EQUAL(httpHeader.charset, static_cast<ui8>(CODES_UTF8)); + TestFinish(); +} + +void THttpHeaderParserTestSuite::TestRepeatedContentEncoding() { + TestStart(); + THttpHeader httpHeader; + httpHeaderParser->Init(&httpHeader); + const char *headers = + "HTTP/1.1 200 OK\r\n" + "Server: nginx\r\n" + "Date: Mon, 15 Oct 2018 10:40:44 GMT\r\n" + "Content-Type: text/plain\r\n" + "Transfer-Encoding: chunked\r\n" + "Connection: keep-alive\r\n" + "Last-Modified: Mon, 15 Oct 2018 03:48:54 GMT\r\n" + "ETag: W/\"5bc40e26-a956d\"\r\n" + "X-Autoru-LB: lb-03-sas.prod.vertis.yandex.net\r\n" + 
"Content-Encoding: gzip\r\n" + "Content-Encoding: gzip\r\n" + "X-UA-Bot: 1\r\n" + "\r\n"; + i32 result = httpHeaderParser->Execute(headers, strlen(headers)); + UNIT_ASSERT_EQUAL(result, 2); + UNIT_ASSERT_EQUAL(httpHeader.error, 0); + UNIT_ASSERT_EQUAL(httpHeader.compression_method, 3); + TestFinish(); +} + +UNIT_TEST_SUITE_REGISTRATION(THttpHeaderParserTestSuite); + +Y_UNIT_TEST_SUITE(TestHttpChunkParser) { + static THttpChunkParser initParser() { + THttpChunkParser parser; + parser.Init(); + return parser; + } + + static THttpChunkParser parseByteByByte(const TStringBuf& blob, const TVector<int>& states) { + UNIT_ASSERT(states.size() <= blob.size()); + THttpChunkParser parser{initParser()}; + for (size_t n = 0; n < states.size(); n++) { + const TStringBuf d{blob, n, 1}; + int code = parser.Execute(d.data(), d.size()); + Cout << TString(d).Quote() << " " << code << Endl; + UNIT_ASSERT_EQUAL(code, states[n]); + } + return parser; + } + + static THttpChunkParser parseBytesWithLastState(const TStringBuf& blob, const int last_state) { + TVector<int> states(blob.size() - 1, 1); + states.push_back(last_state); + return parseByteByByte(blob, states); + } + + Y_UNIT_TEST(TestWithoutEolHead) { + const TStringBuf blob{ + "4\r\n" + "____\r\n"}; + TVector<int> states{ + -1, /* 1, -1, + 1, -1, 1, -1, 1, -1 */}; + // as soon as error happens parser state should be considered + // undefined, state is meaningless after the very first `-1` + // moreover, testenv produces `states[1] == -1` for this input and + // my local build produces `states[1] == 1`. 
+ parseByteByByte(blob, states); + } + + Y_UNIT_TEST(TestTrivialChunk) { + const TStringBuf blob{ + "\r\n" + "4\r\n"}; + THttpChunkParser parser(parseBytesWithLastState(blob, 2)); + UNIT_ASSERT_EQUAL(parser.chunk_length, 4); + UNIT_ASSERT_EQUAL(parser.cnt64, 4); + } + + Y_UNIT_TEST(TestNegative) { + const TStringBuf blob{ + "\r\n" + "-1"}; + TVector<int> states{ + 1, 1, + -1, + /* 1 */}; + parseByteByByte(blob, states); + } + + Y_UNIT_TEST(TestLeadingZero) { + const TStringBuf blob{ + "\r\n" + "042\r\n"}; + THttpChunkParser parser(parseBytesWithLastState(blob, 2)); + UNIT_ASSERT_EQUAL(parser.chunk_length, 0x42); + } + + Y_UNIT_TEST(TestIntOverflow) { + const TStringBuf blob{ + "\r\n" + "deadbeef"}; + THttpChunkParser parser(parseBytesWithLastState(blob, -2)); + UNIT_ASSERT_EQUAL(parser.chunk_length, 0); + UNIT_ASSERT_EQUAL(parser.cnt64, 0xdeadbeef); + } + + Y_UNIT_TEST(TestTrivialChunkWithTail) { + const TStringBuf blob{ + "\r\n" + "4\r\n" + "_" // first byte of the chunk + }; + TVector<int> states{ + 1, 1, + 1, 1, 2, + -1}; + parseByteByByte(blob, states); + } + + Y_UNIT_TEST(TestLastChunk) { + // NB: current parser does not permit whitespace before `foo`, + // but I've never seen the feature in real-life traffic + const TStringBuf blob{ + "\r\n" + "000 ;foo = bar \r\n" + "Trailer: bar\r\n" + "\r\n"}; + THttpChunkParser parser(parseBytesWithLastState(blob, 2)); + UNIT_ASSERT_EQUAL(parser.chunk_length, 0); + } +} diff --git a/library/cpp/http/fetch/httpheader.cpp b/library/cpp/http/fetch/httpheader.cpp new file mode 100644 index 0000000000..7d2225b8b7 --- /dev/null +++ b/library/cpp/http/fetch/httpheader.cpp @@ -0,0 +1,7 @@ +#include "httpheader.h" + +const i64 DEFAULT_RETRY_AFTER = -1; +const i64 DEFAULT_IF_MODIFIED_SINCE = -1; +const i32 DEFAULT_MAX_AGE = -1; +const i8 DEFAULT_REQUEST_PRIORITY = -1; +const i32 DEFAULT_RESPONSE_TIMEOUT = -1; diff --git a/library/cpp/http/fetch/httpheader.h b/library/cpp/http/fetch/httpheader.h new file mode 100644 index 
0000000000..b2810bbd41 --- /dev/null +++ b/library/cpp/http/fetch/httpheader.h @@ -0,0 +1,287 @@
#pragma once

#include "exthttpcodes.h"

#include <library/cpp/mime/types/mime.h>

#include <util/system/defaults.h>
#include <util/system/compat.h>
#include <util/generic/string.h>
#include <util/generic/ylimits.h>
#include <util/system/maxlen.h>

#include <ctime>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <algorithm>

// This is an ugly solution, but doing it the right way is a lot of work.
// HREFLANG_MAX below is derived from this value.
#define FETCHER_URL_MAX 8192

// "Unset" defaults for header fields; the values are defined in httpheader.cpp.
extern const i64 DEFAULT_RETRY_AFTER;       /// == -1
extern const i64 DEFAULT_IF_MODIFIED_SINCE; /// == -1
extern const i32 DEFAULT_MAX_AGE;           /// == -1
extern const i8 DEFAULT_REQUEST_PRIORITY;   /// == -1
extern const i32 DEFAULT_RESPONSE_TIMEOUT;  /// == -1

#define HTTP_PREFIX "http://"  // scheme prefix prepended by THttpRequestHeader::GetUrl()
#define MAX_LANGREGION_LEN 4   // size of THttpRequestHeader::x_yandex_langregion
#define MAXWORD_LEN 55         // size of the short char[] fields of THttpRequestHeader

/// Values of THttpBaseHeader::compression_method.
enum HTTP_COMPRESSION {
    HTTP_COMPRESSION_UNSET = 0, // value assigned by THttpBaseHeader::Init()
    HTTP_COMPRESSION_ERROR = 1,
    HTTP_COMPRESSION_IDENTITY = 2,
    HTTP_COMPRESSION_GZIP = 3,
    HTTP_COMPRESSION_DEFLATE = 4,
    HTTP_COMPRESSION_COMPRESS = 5,
    HTTP_COMPRESSION_MAX = 6
};

/// Values of THttpRequestHeader::http_method.
enum HTTP_METHOD {
    HTTP_METHOD_UNDEFINED = -1, // value assigned by THttpRequestHeader::Init()
    HTTP_METHOD_OPTIONS,
    HTTP_METHOD_GET,
    HTTP_METHOD_HEAD,
    HTTP_METHOD_POST,
    HTTP_METHOD_PUT,
    HTTP_METHOD_DELETE,
    HTTP_METHOD_TRACE,
    HTTP_METHOD_CONNECT,
    HTTP_METHOD_EXTENSION
};

/// Values of THttpBaseHeader::connection_closed.
enum HTTP_CONNECTION {
    HTTP_CONNECTION_UNDEFINED = -1, // value assigned by THttpBaseHeader::Init()
    HTTP_CONNECTION_KEEP_ALIVE = 0,
    HTTP_CONNECTION_CLOSE = 1
};

/// Class represents general http header fields.
+struct THttpBaseHeader { +public: + i16 error; + i32 header_size; + i32 entity_size; + i64 content_length; + i64 http_time; // seconds since epoch + i64 content_range_start; // Content-Range: first-byte-pos + i64 content_range_end; // Content-Range: last-byte-pos + i64 content_range_entity_length; // Content-Range: entity-length + i8 http_minor; + i8 mime_type; + i8 charset; + i8 compression_method; + i8 transfer_chunked; + i8 connection_closed; + TString base; + +public: + void Init() { + error = 0; + header_size = 0; + entity_size = 0; + content_length = -1; + http_time = -1; + http_minor = -1; + mime_type = -1; + charset = -1; + compression_method = HTTP_COMPRESSION_UNSET; + transfer_chunked = -1; + connection_closed = HTTP_CONNECTION_UNDEFINED; + content_range_start = -1; + content_range_end = -1; + content_range_entity_length = -1; + base.clear(); + } + + void Print() const { + printf("content_length: %" PRIi64 "\n", content_length); + printf("http_time: %" PRIi64 "\n", http_time); + printf("http_minor: %" PRIi8 "\n", http_minor); + printf("mime_type: %" PRIi8 "\n", mime_type); + printf("charset: %" PRIi8 "\n", charset); + printf("compression_method: %" PRIi8 "\n", compression_method); + printf("transfer_chunked: %" PRIi8 "\n", transfer_chunked); + printf("connection_closed: %" PRIi8 "\n", connection_closed); + printf("content_range_start: %" PRIi64 "\n", content_range_start); + printf("content_range_end: %" PRIi64 "\n", content_range_end); + printf("content_range_entity_length: %" PRIi64 "\n", content_range_entity_length); + printf("base: \"%s\"\n", base.c_str()); + printf("error: %" PRIi16 "\n", error); + } + + int SetBase(const char* path, + const char* hostNamePtr = nullptr, + int hostNameLength = 0) { + if (*path == '/') { + base = "http://"; + base += TStringBuf(hostNamePtr, hostNameLength); + base += path; + } else { + base = path; + } + return error; + } +}; + +enum { HREFLANG_MAX = FETCHER_URL_MAX * 2 }; +/// Class represents Http Response Header. 
+struct THttpHeader: public THttpBaseHeader { +public: + i8 accept_ranges; + i8 squid_error; + i8 x_robots_tag; // deprecated, use x_robots_state instead + i16 http_status; + TString location; + TString rel_canonical; + char hreflangs[HREFLANG_MAX]; + i64 retry_after; + TString x_robots_state; // 'xxxxx' format, see `library/html/zoneconf/parsefunc.cpp` + +public: + void Init() { + THttpBaseHeader::Init(); + accept_ranges = -1; + squid_error = 0; + x_robots_tag = 0; + rel_canonical.clear(); + http_status = -1; + location.clear(); + hreflangs[0] = 0; + retry_after = DEFAULT_RETRY_AFTER; + x_robots_state = "xxxxx"; + } + + void Print() const { + THttpBaseHeader::Print(); + printf("http_status: %" PRIi16 "\n", http_status); + printf("squid_error: %" PRIi8 "\n", squid_error); + printf("accept_ranges: %" PRIi8 "\n", accept_ranges); + printf("location: \"%s\"\n", location.c_str()); + printf("retry_after: %" PRIi64 "\n", retry_after); + } +}; + +struct THttpRequestHeader: public THttpBaseHeader { +public: + TString request_uri; + char host[HOST_MAX]; + char from[MAXWORD_LEN]; + char user_agent[MAXWORD_LEN]; + char x_yandex_langregion[MAX_LANGREGION_LEN]; + char x_yandex_sourcename[MAXWORD_LEN]; + char x_yandex_requesttype[MAXWORD_LEN]; + char x_yandex_fetchoptions[MAXWORD_LEN]; + i8 http_method; + i8 x_yandex_request_priority; + i32 x_yandex_response_timeout; + i32 max_age; + i64 if_modified_since; + +public: + THttpRequestHeader() { + Init(); + } + + void Init() { + request_uri.clear(); + host[0] = 0; + from[0] = 0; + user_agent[0] = 0; + x_yandex_langregion[0] = 0; + x_yandex_sourcename[0] = 0; + x_yandex_requesttype[0] = 0; + x_yandex_fetchoptions[0] = 0; + http_method = HTTP_METHOD_UNDEFINED; + x_yandex_request_priority = DEFAULT_REQUEST_PRIORITY; + x_yandex_response_timeout = DEFAULT_RESPONSE_TIMEOUT; + max_age = DEFAULT_MAX_AGE; + if_modified_since = DEFAULT_IF_MODIFIED_SINCE; + THttpBaseHeader::Init(); + } + + void Print() const { + THttpBaseHeader::Print(); + 
printf("request_uri: \"%s\"\n", request_uri.c_str()); + printf("host: \"%s\"\n", host); + printf("from: \"%s\"\n", from); + printf("user_agent: \"%s\"\n", user_agent); + printf("http_method: %" PRIi8 "\n", http_method); + printf("response_timeout: %" PRIi32 "\n", x_yandex_response_timeout); + printf("max_age: %" PRIi32 "\n", max_age); + printf("if_modified_since: %" PRIi64 "\n", if_modified_since); + } + + /// It doesn't care about errors in request or headers, where + /// request_uri equals to '*'. + /// This returns copy of the string, which you have to delete. + TString GetUrl() { + TString url; + if (host[0] == 0 || !strcmp(host, "")) { + url = request_uri; + } else { + url = HTTP_PREFIX; + url += host; + url += request_uri; + } + return url; + } + + char* GetUrl(char* buffer, size_t size) { + if (host[0] == 0 || !strcmp(host, "")) { + strlcpy(buffer, request_uri.c_str(), size); + } else { + snprintf(buffer, size, "http://%s%s", host, request_uri.c_str()); + } + return buffer; + } +}; + +class THttpAuthHeader: public THttpHeader { +public: + char* realm; + char* nonce; + char* opaque; + bool stale; + int algorithm; + bool qop_auth; + bool use_auth; + + //we do not provide auth-int variant as too heavy + //bool qop_auth_int; + + THttpAuthHeader() + : realm(nullptr) + , nonce(nullptr) + , opaque(nullptr) + , stale(false) + , algorithm(0) + , qop_auth(false) + , use_auth(true) + { + THttpHeader::Init(); + } + + ~THttpAuthHeader() { + free(realm); + free(nonce); + free(opaque); + } + + void Print() { + THttpHeader::Print(); + if (use_auth) { + if (realm) + printf("realm: \"%s\"\n", realm); + if (nonce) + printf("nonce: \"%s\"\n", nonce); + if (opaque) + printf("opaque: \"%s\"\n", opaque); + printf("stale: %d\n", stale); + printf("algorithm: %d\n", algorithm); + printf("qop_auth: %d\n", qop_auth); + } + } +}; diff --git a/library/cpp/http/fetch/httpload.cpp b/library/cpp/http/fetch/httpload.cpp new file mode 100644 index 0000000000..82ea8900b5 --- /dev/null +++ 
b/library/cpp/http/fetch/httpload.cpp @@ -0,0 +1,373 @@ +#include "httpload.h" + +/************************************************************/ +/************************************************************/ +httpAgentReader::httpAgentReader(httpSpecialAgent& agent, + const char* baseUrl, + bool assumeConnectionClosed, + bool use_auth, + int bufSize) + : Header_() + , Agent_(agent) + , Buffer_(new char[bufSize]) + , BufPtr_(Buffer_) + , BufSize_(bufSize) + , BufRest_(0) +{ + HeadRequest = false; + Header = &Header_; + if (use_auth) + HeaderParser.Init(&Header_); + else + HeaderParser.Init(Header); + setAssumeConnectionClosed(assumeConnectionClosed ? 1 : 0); + Header_.SetBase(baseUrl); + + if (Header_.error) + State = hp_error; + else + State = hp_in_header; +} + +/************************************************************/ +httpAgentReader::~httpAgentReader() { + delete[] Buffer_; +} + +/************************************************************/ +void httpAgentReader::readBuf() { + assert(BufRest_ == 0); + if (!BufPtr_) { + BufRest_ = -1; + return; + } + + BufRest_ = Agent_.read(Buffer_, BufSize_); + if (BufRest_ <= 0) { + BufRest_ = -1; + BufPtr_ = nullptr; + } else { + BufPtr_ = Buffer_; + + //cout << "BUF: " << mBuffer << endl << endl; + } +} + +/************************************************************/ +const THttpHeader* httpAgentReader::readHeader() { + while (State == hp_in_header) { + if (!step()) { + Header_.error = HTTP_CONNECTION_LOST; + return nullptr; + } + ParseGeneric(BufPtr_, BufRest_); + } + if (State == hp_eof || State == hp_error) { + BufPtr_ = nullptr; + BufRest_ = -1; + } + if (State == hp_error || Header_.error) + return nullptr; + return &Header_; +} + +/************************************************************/ +long httpAgentReader::readPortion(void*& buf) { + assert(State != hp_in_header); + + long Chunk = 0; + do { + if (BufSize_ == 0 && !BufPtr_) + return 0; + + if (!step()) + return 0; + + Chunk = ParseGeneric(BufPtr_, 
BufRest_); + buf = BufPtr_; + + if (State == hp_error && Header_.entity_size > Header_.content_length) { + Chunk -= (Header_.entity_size - Header_.content_length); + BufPtr_ = (char*)BufPtr_ + Chunk; + BufRest_ = 0; + State = hp_eof; + Header_.error = 0; + break; + } + + BufPtr_ = (char*)BufPtr_ + Chunk; + BufRest_ -= Chunk; + + if (State == hp_eof || State == hp_error) { + BufRest_ = -1; + BufPtr_ = nullptr; + } + } while (!Chunk); + return Chunk; +} + +/************************************************************/ +bool httpAgentReader::skipTheRest() { + void* b; + while (!eof()) + readPortion(b); + return (State == hp_eof); +} + +/************************************************************/ +/************************************************************/ +httpLoadAgent::httpLoadAgent(bool handleAuthorization, + socketHandlerFactory& factory) + : Factory_(factory) + , HandleAuthorization_(handleAuthorization) + , URL_() + , PersistentConn_(false) + , Reader_(nullptr) + , Headers_() + , ErrCode_(0) + , RealHost_(nullptr) +{ +} + +/************************************************************/ +httpLoadAgent::~httpLoadAgent() { + delete Reader_; + free(RealHost_); +} + +/************************************************************/ +void httpLoadAgent::clearReader() { + if (Reader_) { + bool opened = false; + if (PersistentConn_) { + const THttpHeader* H = Reader_->readHeader(); + if (H && !H->connection_closed) { + Reader_->skipTheRest(); + opened = true; + } + } + if (!opened) + Disconnect(); + delete Reader_; + Reader_ = nullptr; + } + ErrCode_ = 0; +} +/************************************************************/ +void httpLoadAgent::setRealHost(const char* hostname) { + free(RealHost_); + if (hostname) + RealHost_ = strdup(hostname); + else + RealHost_ = nullptr; + ErrCode_ = 0; +} + +/************************************************************/ +void httpLoadAgent::setIMS(const char* ifModifiedSince) { + char ims_buf[100]; + snprintf(ims_buf, 100, 
"If-Modified-Since: %s\r\n", + ifModifiedSince); + Headers_.push_back(ims_buf); +} + +/************************************************************/ +void httpLoadAgent::addHeaderInstruction(const char* instr) { + Headers_.push_back(instr); +} + +/************************************************************/ +void httpLoadAgent::dropHeaderInstructions() { + Headers_.clear(); +} + +/************************************************************/ +bool httpLoadAgent::startRequest(const THttpURL& url, + bool persistent, + const TAddrList& addrs) + +{ + clearReader(); + ErrCode_ = 0; + + URL_.Clear(); + URL_ = url; + PersistentConn_ = persistent; + if (!URL_.IsValidAbs()) + return false; + if (!HandleAuthorization_ && !URL_.IsNull(THttpURL::FlagAuth)) + return false; + + return doSetHost(addrs) && doStartRequest(); +} + +/************************************************************/ +bool httpLoadAgent::startRequest(const char* url, + const char* url_to_merge, + bool persistent, + const TAddrList& addrs) { + clearReader(); + + URL_.Clear(); + PersistentConn_ = persistent; + + long flags = THttpURL::FeatureSchemeKnown | THttpURL::FeaturesNormalizeSet; + if (HandleAuthorization_) + flags |= THttpURL::FeatureAuthSupported; + + if (URL_.Parse(url, flags, url_to_merge) || !URL_.IsValidGlobal()) + return false; + + return doSetHost(addrs) && doStartRequest(); +} + +/************************************************************/ +bool httpLoadAgent::startRequest(const char* url, + const char* url_to_merge, + bool persistent, + ui32 ip) { + clearReader(); + + URL_.Clear(); + PersistentConn_ = persistent; + + long flags = THttpURL::FeatureSchemeKnown | THttpURL::FeaturesNormalizeSet; + if (HandleAuthorization_) + flags |= THttpURL::FeatureAuthSupported; + + if (URL_.Parse(url, flags, url_to_merge) || !URL_.IsValidGlobal()) + return false; + + return doSetHost(TAddrList::MakeV4Addr(ip, URL_.GetPort())) && doStartRequest(); +} + 
+/************************************************************/ +bool httpLoadAgent::doSetHost(const TAddrList& addrs) { + socketAbstractHandler* h = Factory_.chooseHandler(URL_); + if (!h) + return false; + Socket.setHandler(h); + + if (addrs.size()) { + ErrCode_ = SetHost(URL_.Get(THttpURL::FieldHost), + URL_.GetPort(), addrs); + } else { + ErrCode_ = SetHost(URL_.Get(THttpURL::FieldHost), + URL_.GetPort()); + } + if (ErrCode_) + return false; + + if (RealHost_) { + free(Hostheader); + Hostheader = (char*)malloc(strlen(RealHost_) + 20); + sprintf(Hostheader, "Host: %s\r\n", RealHost_); + } + + if (!URL_.IsNull(THttpURL::FlagAuth)) { + if (!HandleAuthorization_) { + ErrCode_ = HTTP_UNAUTHORIZED; + return false; + } + + Digest_.setAuthorization(URL_.Get(THttpURL::FieldUsername), + URL_.Get(THttpURL::FieldPassword)); + } + + return true; +} + +/************************************************************/ +bool httpLoadAgent::setHost(const char* host_url, + const TAddrList& addrs) { + clearReader(); + + URL_.Clear(); + PersistentConn_ = true; + + long flags = THttpURL::FeatureSchemeKnown | THttpURL::FeaturesNormalizeSet; + if (HandleAuthorization_) + flags |= THttpURL::FeatureAuthSupported; + + if (URL_.Parse(host_url, flags) || !URL_.IsValidGlobal()) + return false; + + return doSetHost(addrs); +} + +/************************************************************/ +bool httpLoadAgent::startOneRequest(const char* local_url) { + clearReader(); + + THttpURL lURL; + if (lURL.Parse(local_url, THttpURL::FeaturesNormalizeSet) || lURL.IsValidGlobal()) + return false; + + URL_.SetInMemory(THttpURL::FieldPath, lURL.Get(THttpURL::FieldPath)); + URL_.SetInMemory(THttpURL::FieldQuery, lURL.Get(THttpURL::FieldQuery)); + URL_.Rewrite(); + + return doStartRequest(); +} + +/************************************************************/ +bool httpLoadAgent::doStartRequest() { + TString urlStr = URL_.PrintS(THttpURL::FlagPath | THttpURL::FlagQuery); + if (!urlStr) + urlStr = "/"; + + 
for (int step = 0; step < 10; step++) { + const char* digestHeader = Digest_.getHeaderInstruction(); + + unsigned i = (digestHeader) ? 2 : 1; + const char** headers = + (const char**)(alloca((i + Headers_.size()) * sizeof(char*))); + + for (i = 0; i < Headers_.size(); i++) + headers[i] = Headers_[i].c_str(); + if (digestHeader) + headers[i++] = digestHeader; + headers[i] = nullptr; + + ErrCode_ = RequestGet(urlStr.c_str(), headers, PersistentConn_); + + if (ErrCode_) { + Disconnect(); + return false; + } + + TString urlBaseStr = URL_.PrintS(THttpURL::FlagNoFrag); + + clearReader(); + Reader_ = new httpAgentReader(*this, urlBaseStr.c_str(), + !PersistentConn_, !Digest_.empty()); + + if (Reader_->readHeader()) { + //mReader->getHeader()->Print(); + if (getHeader()->http_status == HTTP_UNAUTHORIZED && + step < 1 && + Digest_.processHeader(getAuthHeader(), + urlStr.c_str(), + "GET")) { + //mReader->skipTheRest(); + delete Reader_; + Reader_ = nullptr; + ErrCode_ = 0; + Disconnect(); + continue; + } + + return true; + } + Disconnect(); + clearReader(); + + return false; + } + + ErrCode_ = HTTP_UNAUTHORIZED; + return false; +} + +/************************************************************/ +/************************************************************/ diff --git a/library/cpp/http/fetch/httpload.h b/library/cpp/http/fetch/httpload.h new file mode 100644 index 0000000000..e22e4b809e --- /dev/null +++ b/library/cpp/http/fetch/httpload.h @@ -0,0 +1,307 @@ +#pragma once + +#include "httpagent.h" +#include "httpparser.h" +#include "http_digest.h" + +#include <util/system/compat.h> +#include <util/string/vector.h> +#include <util/network/ip.h> +#include <library/cpp/uri/http_url.h> +#include <library/cpp/http/misc/httpcodes.h> + +/********************************************************/ +// Section 1: socket handlers +/********************************************************/ +// The following classes allows to adopt template scheme +// THttpAgent for work with socket by 
flexible +// object-style scheme. + +/********************************************************/ +// This class is used as a base one for flexible +// socket handling +class socketAbstractHandler { +public: + virtual bool Good() = 0; + + virtual int Connect(const TAddrList& addrs, TDuration Timeout) = 0; + + virtual void Disconnect() = 0; + + virtual void shutdown() = 0; + + virtual bool send(const char* message, ssize_t messlen) = 0; + + virtual bool peek() = 0; + + virtual ssize_t read(void* buffer, ssize_t buflen) = 0; + + virtual ~socketAbstractHandler() { + } + +protected: + socketAbstractHandler() { + } +}; + +/********************************************************/ +// This class is used as a proxy between THttpAgent and +// socketAbstractHandler +// (it is used by template scheme, +// so it does not have virtual methods) +class TSocketHandlerPtr { +protected: + socketAbstractHandler* Handler_; + +public: + TSocketHandlerPtr() + : Handler_(nullptr) + { + } + + virtual ~TSocketHandlerPtr() { + delete Handler_; + } + + int Good() { + return (Handler_ && Handler_->Good()); + } + + int Connect(const TAddrList& addrs, TDuration Timeout) { + return (Handler_) ? Handler_->Connect(addrs, Timeout) : 1; + } + + void Disconnect() { + if (Handler_) + Handler_->Disconnect(); + } + + void shutdown() { + if (Handler_) + Handler_->shutdown(); + } + + bool send(const char* message, ssize_t messlen) { + return (Handler_) ? Handler_->send(message, messlen) : false; + } + + virtual bool peek() { + return (Handler_) ? Handler_->peek() : false; + } + + virtual ssize_t read(void* buffer, ssize_t buflen) { + return (Handler_) ? 
Handler_->read(buffer, buflen) : 0; + } + + void setHandler(socketAbstractHandler* handler) { + if (Handler_) + delete Handler_; + Handler_ = handler; + } +}; + +/********************************************************/ +// Here is httpAgent that uses socketAbstractHandler class +// ant its derivatives +using httpSpecialAgent = THttpAgent<TSocketHandlerPtr>; + +/********************************************************/ +// Regular handler is used as implementation of +// socketAbstractHandler for work through HTTP protocol +class socketRegularHandler: public socketAbstractHandler { +protected: + TSimpleSocketHandler Socket_; + +public: + socketRegularHandler() + : Socket_() + { + } + + bool Good() override { + return Socket_.Good(); + } + + int Connect(const TAddrList& addrs, TDuration Timeout) override { + return Socket_.Connect(addrs, Timeout); + } + + void Disconnect() override { + Socket_.Disconnect(); + } + + void shutdown() override { + //Do not block writing to socket + //There are servers that works in a bad way with this + //mSocket.shutdown(); + } + + bool send(const char* message, ssize_t messlen) override { + return Socket_.send(message, messlen); + } + + bool peek() override { + return Socket_.peek(); + } + + ssize_t read(void* buffer, ssize_t buflen) override { + return Socket_.read(buffer, buflen); + } +}; + +/********************************************************/ +// The base factory that allows to choose an appropriate +// socketAbstractHandler implementation by url schema + +class socketHandlerFactory { +public: + virtual ~socketHandlerFactory() { + } + + //returns mHandler_HTTP for correct HTTP-based url + virtual socketAbstractHandler* chooseHandler(const THttpURL& url); + + static socketHandlerFactory sInstance; +}; + +/********************************************************/ +// Section 2: the configurates tool to parse an HTTP-response +/********************************************************/ + +class httpAgentReader: public 
THttpParserGeneric<1> { +protected: + THttpAuthHeader Header_; + httpSpecialAgent& Agent_; + + char* Buffer_; + void* BufPtr_; + int BufSize_; + long BufRest_; + + void readBuf(); + + bool step() { + if (BufRest_ == 0) + readBuf(); + if (eof()) + return false; + return true; + } + +public: + httpAgentReader(httpSpecialAgent& agent, + const char* baseUrl, + bool assumeConnectionClosed, + bool use_auth = false, + int bufSize = 0x1000); + + ~httpAgentReader(); + + bool eof() { + return BufRest_ < 0; + } + + int error() { + return Header_.error; + } + + void setError(int errCode) { + Header_.error = errCode; + } + + const THttpAuthHeader* getAuthHeader() { + return &Header_; + } + + const THttpHeader* readHeader(); + long readPortion(void*& buf); + bool skipTheRest(); +}; + +/********************************************************/ +// Section 3: the main class +/********************************************************/ +class httpLoadAgent: public httpSpecialAgent { +protected: + socketHandlerFactory& Factory_; + bool HandleAuthorization_; + THttpURL URL_; + bool PersistentConn_; + httpAgentReader* Reader_; + TVector<TString> Headers_; + int ErrCode_; + char* RealHost_; + httpDigestHandler Digest_; + + void clearReader(); + bool doSetHost(const TAddrList& addrs); + bool doStartRequest(); + +public: + httpLoadAgent(bool handleAuthorization = false, + socketHandlerFactory& factory = socketHandlerFactory::sInstance); + ~httpLoadAgent(); + + void setRealHost(const char* host); + void setIMS(const char* ifModifiedSince); + void addHeaderInstruction(const char* instr); + void dropHeaderInstructions(); + + bool startRequest(const char* url, + const char* url_to_merge = nullptr, + bool persistent = false, + const TAddrList& addrs = TAddrList()); + + // deprecated v4-only + bool startRequest(const char* url, + const char* url_to_merge, + bool persistent, + ui32 ip); + + bool startRequest(const THttpURL& url, + bool persistent = false, + const TAddrList& addrs = TAddrList()); 
+ + bool setHost(const char* host_url, + const TAddrList& addrs = TAddrList()); + + bool startOneRequest(const char* local_url); + + const THttpAuthHeader* getAuthHeader() { + if (Reader_ && Reader_->getAuthHeader()->use_auth) + return Reader_->getAuthHeader(); + return nullptr; + } + + const THttpHeader* getHeader() { + if (Reader_) + return Reader_->getAuthHeader(); + return nullptr; + } + + const THttpURL& getURL() { + return URL_; + } + + bool eof() { + if (Reader_) + return Reader_->eof(); + return true; + } + + int error() { + if (ErrCode_) + return ErrCode_; + if (Reader_) + return Reader_->error(); + return HTTP_BAD_URL; + } + + long readPortion(void*& buf) { + if (Reader_) + return Reader_->readPortion(buf); + return -1; + } +}; + +/********************************************************/ diff --git a/library/cpp/http/fetch/httpparser.h b/library/cpp/http/fetch/httpparser.h new file mode 100644 index 0000000000..769828e4ae --- /dev/null +++ b/library/cpp/http/fetch/httpparser.h @@ -0,0 +1,372 @@ +#pragma once + +#include "httpfsm.h" +#include "httpheader.h" + +#include <library/cpp/mime/types/mime.h> +#include <util/system/yassert.h> +#include <library/cpp/http/misc/httpcodes.h> + +template <size_t headermax = 100 << 10, size_t bodymax = 1 << 20> +struct TFakeCheck { + bool Check(THttpHeader* /*header*/) { + return false; + } + void CheckDocPart(void* /*buf*/, size_t /*len*/, THttpHeader* /*header*/) { + } //for every part of DocumentBody will be called + void CheckEndDoc(THttpHeader* /*header*/) { + } + size_t GetMaxHeaderSize() { + return headermax; + } + size_t GetMaxBodySize(THttpHeader*) { + return bodymax; + } +}; + +class THttpParserBase { +public: + enum States { + hp_error, + hp_eof, + hp_in_header, + hp_read_alive, + hp_read_closed, + hp_begin_chunk_header, + hp_chunk_header, + hp_read_chunk + }; + + States GetState() { + return State; + } + + void setAssumeConnectionClosed(int value) { + AssumeConnectionClosed = value; + } + + THttpHeader* 
GetHttpHeader() const { + return Header; + } + +protected: + int CheckHeaders() { + if (Header->http_status < HTTP_OK || Header->http_status == HTTP_NO_CONTENT || Header->http_status == HTTP_NOT_MODIFIED) { + Header->content_length = 0; + Header->transfer_chunked = 0; + } + if (Header->transfer_chunked < -1) { + Header->error = HTTP_BAD_ENCODING; + return 1; + } else if (Header->transfer_chunked == -1) { + Header->transfer_chunked = 0; + } + if (!Header->transfer_chunked && Header->content_length < -1) { + Header->error = HTTP_BAD_CONTENT_LENGTH; + return 1; + } + if (Header->http_status == HTTP_OK) { + if (Header->compression_method != HTTP_COMPRESSION_UNSET && + Header->compression_method != HTTP_COMPRESSION_IDENTITY && + Header->compression_method != HTTP_COMPRESSION_GZIP && + Header->compression_method != HTTP_COMPRESSION_DEFLATE) + { + Header->error = HTTP_BAD_CONTENT_ENCODING; + return 1; + } + } + if (Header->connection_closed == -1) + Header->connection_closed = (Header->http_minor == 0 || + AssumeConnectionClosed); + if (!Header->transfer_chunked && !Header->connection_closed && Header->content_length < 0 && !HeadRequest) { + Header->error = HTTP_LENGTH_UNKNOWN; + return 1; + } + if (Header->http_time < 0) + Header->http_time = 0; + if (Header->mime_type < 0) + Header->mime_type = MIME_UNKNOWN; + return 0; + } + + THttpHeaderParser HeaderParser; + THttpChunkParser ChunkParser; + States State; + long ChunkSize; + THttpHeader* Header; + int AssumeConnectionClosed; + bool HeadRequest; +}; + +template <int isReader, typename TCheck = TFakeCheck<>> +class THttpParserGeneric: public THttpParserBase, public TCheck { +protected: + long ParseGeneric(void*& buf, long& size) { + if (!size) { + switch (State) { + case hp_error: + case hp_eof: + break; + case hp_read_closed: + State = hp_eof; + break; + case hp_in_header: + Header->error = HTTP_HEADER_EOF; + State = hp_error; + break; + case hp_read_alive: + case hp_read_chunk: + if (HeadRequest) + State = hp_eof; + 
else { + Header->error = HTTP_MESSAGE_EOF; + State = hp_error; + } + break; + case hp_begin_chunk_header: + case hp_chunk_header: + if (HeadRequest) + State = hp_eof; + else { + Header->error = HTTP_CHUNK_EOF; + State = hp_error; + } + break; + } + return 0; + } + while (size) { + int ret; + + switch (State) { + case hp_error: + return 0; + + case hp_eof: + return 0; + + case hp_in_header: + if ((ret = HeaderParser.Execute(buf, size)) < 0) { + Header->error = HTTP_BAD_HEADER_STRING; + State = hp_error; + return 0; + } else if (ret == 2) { + Header->header_size += i32(HeaderParser.lastchar - (char*)buf + 1); + size -= long(HeaderParser.lastchar - (char*)buf + 1); + buf = HeaderParser.lastchar + 1; + State = CheckHeaders() ? hp_error + : Header->transfer_chunked ? hp_begin_chunk_header + : Header->content_length == 0 ? hp_eof + : Header->content_length > 0 ? hp_read_alive + : hp_read_closed; + if (State == hp_begin_chunk_header) { + // unget \n for chunk reader + buf = (char*)buf - 1; + size++; + } + if (isReader) + return size; + } else { + Header->header_size += size; + size = 0; + } + break; + + case hp_read_alive: + Header->entity_size += size; + if (Header->entity_size >= Header->content_length) { + State = hp_eof; + } + + TCheck::CheckDocPart(buf, size, Header); + if (isReader) + return size; + size = 0; + break; + + case hp_read_closed: + Header->entity_size += size; + TCheck::CheckDocPart(buf, size, Header); + if (isReader) + return size; + size = 0; + break; + + case hp_begin_chunk_header: + ChunkParser.Init(); + State = hp_chunk_header; + [[fallthrough]]; + + case hp_chunk_header: + if ((ret = ChunkParser.Execute(buf, size)) < 0) { + Header->error = i16(ret == -2 ? 
HTTP_CHUNK_TOO_LARGE : HTTP_BAD_CHUNK); + State = hp_error; + return 0; + } else if (ret == 2) { + Header->entity_size += i32(ChunkParser.lastchar - (char*)buf + 1); + size -= long(ChunkParser.lastchar - (char*)buf + 1); + buf = ChunkParser.lastchar + 1; + ChunkSize = ChunkParser.chunk_length; + Y_ASSERT(ChunkSize >= 0); + State = ChunkSize ? hp_read_chunk : hp_eof; + } else { + Header->entity_size += size; + size = 0; + } + break; + + case hp_read_chunk: + if (size >= ChunkSize) { + Header->entity_size += ChunkSize; + State = hp_begin_chunk_header; + TCheck::CheckDocPart(buf, ChunkSize, Header); + if (isReader) + return ChunkSize; + size -= ChunkSize; + buf = (char*)buf + ChunkSize; + } else { + Header->entity_size += size; + ChunkSize -= size; + TCheck::CheckDocPart(buf, size, Header); + if (isReader) + return size; + size = 0; + } + break; + } + } + return size; + } +}; + +template <class TCheck = TFakeCheck<>> +class THttpParser: public THttpParserGeneric<0, TCheck> { + typedef THttpParserGeneric<0, TCheck> TBaseT; //sorry avoiding gcc 3.4.6 BUG! 
+public: + void Init(THttpHeader* H, bool head_request = false) { + TBaseT::Header = H; + TBaseT::HeaderParser.Init(TBaseT::Header); + TBaseT::State = TBaseT::hp_in_header; + TBaseT::AssumeConnectionClosed = 0; + TBaseT::HeadRequest = head_request; + } + + void Parse(void* buf, long size) { + TBaseT::ParseGeneric(buf, size); + } +}; + +class TMemoReader { +public: + int Init(void* buf, long bufsize) { + Buf = buf; + Bufsize = bufsize; + return 0; + } + long Read(void*& buf) { + Y_ASSERT(Bufsize >= 0); + if (!Bufsize) { + Bufsize = -1; + return 0; + } + buf = Buf; + long ret = Bufsize; + Bufsize = 0; + return ret; + } + +protected: + long Bufsize; + void* Buf; +}; + +template <class Reader> +class THttpReader: public THttpParserGeneric<1>, public Reader { + typedef THttpParserGeneric<1> TBaseT; + +public: + using TBaseT::AssumeConnectionClosed; + using TBaseT::Header; + using TBaseT::ParseGeneric; + using TBaseT::State; + + int Init(THttpHeader* H, int parsHeader, int assumeConnectionClosed = 0, bool headRequest = false) { + Header = H; + Eoferr = 1; + Size = 0; + AssumeConnectionClosed = assumeConnectionClosed; + HeadRequest = headRequest; + return parsHeader ? 
ParseHeader() : SkipHeader(); + } + + long Read(void*& buf) { + long Chunk; + do { + if (!Size) { + if (Eoferr != 1) + return Eoferr; + else if ((Size = (long)Reader::Read(Ptr)) < 0) { + Header->error = HTTP_CONNECTION_LOST; + return Eoferr = -1; + } + } + Chunk = ParseGeneric(Ptr, Size); + buf = Ptr; + Ptr = (char*)Ptr + Chunk; + Size -= Chunk; + if (State == hp_eof) { + Size = 0; + Eoferr = 0; + } else if (State == hp_error) + return Eoferr = -1; + } while (!Chunk); + return Chunk; + } + +protected: + int ParseHeader() { + HeaderParser.Init(Header); + State = hp_in_header; + while (State == hp_in_header) { + if ((Size = (long)Reader::Read(Ptr)) < 0) + return Eoferr = -1; + ParseGeneric(Ptr, Size); + } + if (State == hp_error) + return Eoferr = -1; + if (State == hp_eof) + Eoferr = 0; + return 0; + } + + int SkipHeader() { + long hdrsize = Header->header_size; + while (hdrsize) { + if ((Size = (long)Reader::Read(Ptr)) <= 0) + return Eoferr = -1; + if (Size >= hdrsize) { + Size -= hdrsize; + Ptr = (char*)Ptr + hdrsize; + break; + } + hdrsize -= Size; + } + State = Header->transfer_chunked ? hp_begin_chunk_header + : Header->content_length == 0 ? hp_eof + : Header->content_length > 0 ? 
hp_read_alive + : hp_read_closed; + Header->entity_size = 0; + if (State == hp_eof) + Eoferr = 0; + else if (State == hp_begin_chunk_header) { + // unget \n for chunk reader + Ptr = (char*)Ptr - 1; + ++Size; + } + return 0; + } + + void* Ptr; + long Size; + int Eoferr; +}; diff --git a/library/cpp/http/fetch/httpparser_ut.cpp b/library/cpp/http/fetch/httpparser_ut.cpp new file mode 100644 index 0000000000..3b3b938e7a --- /dev/null +++ b/library/cpp/http/fetch/httpparser_ut.cpp @@ -0,0 +1,231 @@ +#include "httpparser.h" + +#include <library/cpp/testing/unittest/registar.h> + +#define ENUM_OUT(arg) \ + case type ::arg: { \ + out << #arg; \ + return; \ + } + +template <> +void Out<THttpParserBase::States>(IOutputStream& out, THttpParserBase::States st) { + using type = THttpParserBase::States; + switch (st) { + ENUM_OUT(hp_error) + ENUM_OUT(hp_eof) + ENUM_OUT(hp_in_header) + ENUM_OUT(hp_read_alive) + ENUM_OUT(hp_read_closed) + ENUM_OUT(hp_begin_chunk_header) + ENUM_OUT(hp_chunk_header) + ENUM_OUT(hp_read_chunk) + } +} + +namespace { + class TSomethingLikeFakeCheck; + + using TTestHttpParser = THttpParser<TSomethingLikeFakeCheck>; + + class TSomethingLikeFakeCheck { + TString Body_; + + public: + const TString& Body() const { + return Body_; + } + + // other functions are not really called by THttpParser + void CheckDocPart(const void* buf, size_t len, THttpHeader* /* header */) { + TString s(static_cast<const char*>(buf), len); + Cout << "State = " << static_cast<TTestHttpParser*>(this)->GetState() << ", CheckDocPart(" << s.Quote() << ")\n"; + Body_ += s; + } + }; + +} + +Y_UNIT_TEST_SUITE(TestHttpParser) { + Y_UNIT_TEST(TestTrivialRequest) { + const TString blob{ + "GET /search?q=hi HTTP/1.1\r\n" + "Host: www.google.ru:8080 \r\n" + "\r\n"}; + THttpHeader hdr; + THttpParser<> parser; + parser.Init(&hdr); + parser.Parse((void*)blob.data(), blob.size()); + UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_error); // can't parse request as response + } + + // XXX: 
`entity_size` is i32 and `content_length` is i64! + Y_UNIT_TEST(TestTrivialResponse) { + const TString blob{ + "HTTP/1.1 200 Ok\r\n" + "Content-Length: 2\r\n" + "\r\n" + "OK"}; + THttpHeader hdr; + TTestHttpParser parser; + parser.Init(&hdr); + parser.Parse((void*)blob.data(), blob.size()); + UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof); + UNIT_ASSERT_EQUAL(parser.Body(), "OK"); + UNIT_ASSERT_EQUAL(hdr.header_size, strlen( + "HTTP/1.1 200 Ok\r\n" + "Content-Length: 2\r\n" + "\r\n")); + UNIT_ASSERT_EQUAL(hdr.entity_size, strlen("OK")); + } + + // XXX: `entity_size` is off by one in TE:chunked case. + Y_UNIT_TEST(TestChunkedResponse) { + const TString blob{ + "HTTP/1.1 200 OK\r\n" + "Transfer-Encoding: chunked\r\n" + "\r\n" + "2\r\n" + "Ok\r\n" + "8\r\n" + "AllRight\r\n" + "0\r\n" + "\r\n"}; + THttpHeader hdr; + TTestHttpParser parser; + parser.Init(&hdr); + parser.Parse((void*)blob.data(), blob.size()); + UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof); + UNIT_ASSERT_EQUAL(parser.Body(), "OkAllRight"); + UNIT_ASSERT_EQUAL(hdr.header_size, strlen( + "HTTP/1.1 200 OK\r\n" + "Transfer-Encoding: chunked\r\n" + "\r\n")); + const int off_by_one_err = -1; // XXX: it really looks so + UNIT_ASSERT_EQUAL(hdr.entity_size + off_by_one_err, strlen( + "2\r\n" + "Ok\r\n" + "8\r\n" + "AllRight\r\n" + "0\r\n" + "\r\n")); + } + + static const TString PipelineClenBlob_{ + "HTTP/1.1 200 Ok\r\n" + "Content-Length: 4\r\n" + "\r\n" + "OK\r\n" + "HTTP/1.1 200 Zz\r\n" + "Content-Length: 4\r\n" + "\r\n" + "ZZ\r\n"}; + + void AssertPipelineClen(TTestHttpParser & parser, const THttpHeader& hdr) { + UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof); + UNIT_ASSERT_EQUAL(4, hdr.content_length); + UNIT_ASSERT_EQUAL(hdr.header_size, strlen( + "HTTP/1.1 200 Ok\r\n" + "Content-Length: 4\r\n" + "\r\n")); + } + + Y_UNIT_TEST(TestPipelineClenByteByByte) { + const TString& blob = PipelineClenBlob_; + THttpHeader hdr; + TTestHttpParser parser; + parser.Init(&hdr); + for (size_t i = 0; i < 
blob.size(); ++i) { + const TStringBuf d{blob, i, 1}; + parser.Parse((void*)d.data(), d.size()); + Cout << TString(d).Quote() << " -> " << parser.GetState() << Endl; + } + AssertPipelineClen(parser, hdr); + UNIT_ASSERT_EQUAL(parser.Body(), "OK\r\n"); + UNIT_ASSERT_EQUAL(hdr.entity_size, hdr.content_length); + } + + // XXX: Content-Length is ignored, Body() looks unexpected! + Y_UNIT_TEST(TestPipelineClenOneChunk) { + const TString& blob = PipelineClenBlob_; + THttpHeader hdr; + TTestHttpParser parser; + parser.Init(&hdr); + parser.Parse((void*)blob.data(), blob.size()); + AssertPipelineClen(parser, hdr); + UNIT_ASSERT_EQUAL(parser.Body(), + "OK\r\n" + "HTTP/1.1 200 Zz\r\n" + "Content-Length: 4\r\n" + "\r\n" + "ZZ\r\n"); + UNIT_ASSERT_EQUAL(hdr.entity_size, strlen( + "OK\r\n" + "HTTP/1.1 200 Zz\r\n" + "Content-Length: 4\r\n" + "\r\n" + "ZZ\r\n")); + } + + static const TString PipelineChunkedBlob_{ + "HTTP/1.1 200 OK\r\n" + "Transfer-Encoding: chunked\r\n" + "\r\n" + "2\r\n" + "Ok\r\n" + "8\r\n" + "AllRight\r\n" + "0\r\n" + "\r\n" + "HTTP/1.1 200 OK\r\n" + "Transfer-Encoding: chunked\r\n" + "\r\n" + "2\r\n" + "Yo\r\n" + "8\r\n" + "uWin!Iam\r\n" + "0\r\n" + "\r\n"}; + + void AssertPipelineChunked(TTestHttpParser & parser, const THttpHeader& hdr) { + UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof); + UNIT_ASSERT_EQUAL(parser.Body(), "OkAllRight"); + UNIT_ASSERT_EQUAL(-1, hdr.content_length); + UNIT_ASSERT_EQUAL(hdr.header_size, strlen( + "HTTP/1.1 200 OK\r\n" + "Transfer-Encoding: chunked\r\n" + "\r\n")); + const int off_by_one_err = -1; + UNIT_ASSERT_EQUAL(hdr.entity_size + off_by_one_err, strlen( + "2\r\n" + "Ok\r\n" + "8\r\n" + "AllRight\r\n" + "0\r\n" + "\r\n")); + } + + Y_UNIT_TEST(TestPipelineChunkedByteByByte) { + const TString& blob = PipelineChunkedBlob_; + THttpHeader hdr; + TTestHttpParser parser; + parser.Init(&hdr); + for (size_t i = 0; i < blob.size(); ++i) { + const TStringBuf d{blob, i, 1}; + parser.Parse((void*)d.data(), d.size()); + Cout << 
TString(d).Quote() << " -> " << parser.GetState() << Endl; + if (blob.size() / 2 - 1 <= i) // last \n sets EOF + UNIT_ASSERT_EQUAL(parser.GetState(), parser.hp_eof); + } + AssertPipelineChunked(parser, hdr); + } + + Y_UNIT_TEST(TestPipelineChunkedOneChunk) { + const TString& blob = PipelineChunkedBlob_; + THttpHeader hdr; + TTestHttpParser parser; + parser.Init(&hdr); + parser.Parse((void*)blob.data(), blob.size()); + AssertPipelineChunked(parser, hdr); + } +} diff --git a/library/cpp/http/fetch/httpzreader.h b/library/cpp/http/fetch/httpzreader.h new file mode 100644 index 0000000000..68eb00853d --- /dev/null +++ b/library/cpp/http/fetch/httpzreader.h @@ -0,0 +1,295 @@ +#pragma once + +#include "httpheader.h" +#include "httpparser.h" +#include "exthttpcodes.h" + +#include <util/system/defaults.h> +#include <util/generic/yexception.h> + +#include <contrib/libs/zlib/zlib.h> + +#include <errno.h> + +#ifndef ENOTSUP +#define ENOTSUP 45 +#endif + +template <class Reader> +class TCompressedHttpReader: public THttpReader<Reader> { + typedef THttpReader<Reader> TBase; + +public: + using TBase::AssumeConnectionClosed; + using TBase::Header; + using TBase::ParseGeneric; + using TBase::State; + + static constexpr size_t DefaultBufSize = 64 << 10; + static constexpr unsigned int DefaultWinSize = 15; + + TCompressedHttpReader() + : CompressedInput(false) + , BufSize(0) + , CurContSize(0) + , MaxContSize(0) + , Buf(nullptr) + , ZErr(0) + , ConnectionClosed(0) + , IgnoreTrailingGarbage(true) + { + memset(&Stream, 0, sizeof(Stream)); + } + + ~TCompressedHttpReader() { + ClearStream(); + + if (Buf) { + free(Buf); + Buf = nullptr; + } + } + + void SetConnectionClosed(int cc) { + ConnectionClosed = cc; + } + + void SetIgnoreTrailingGarbage(bool ignore) { + IgnoreTrailingGarbage = ignore; + } + + int Init( + THttpHeader* H, + int parsHeader, + const size_t maxContSize = Max<size_t>(), + const size_t bufSize = DefaultBufSize, + const unsigned int winSize = DefaultWinSize, + bool 
headRequest = false) + { + ZErr = 0; + CurContSize = 0; + MaxContSize = maxContSize; + + int ret = TBase::Init(H, parsHeader, ConnectionClosed, headRequest); + if (ret) + return ret; + + ret = SetCompression(H->compression_method, bufSize, winSize); + return ret; + } + + long Read(void*& buf) { + if (!CompressedInput) { + long res = TBase::Read(buf); + if (res > 0) { + CurContSize += (size_t)res; + if (CurContSize > MaxContSize) { + ZErr = E2BIG; + return -1; + } + } + return res; + } + + while (true) { + if (Stream.avail_in == 0) { + void* tmpin = Stream.next_in; + long res = TBase::Read(tmpin); + Stream.next_in = (Bytef*)tmpin; + if (res <= 0) + return res; + Stream.avail_in = (uInt)res; + } + + Stream.next_out = Buf; + Stream.avail_out = (uInt)BufSize; + buf = Buf; + + int err = inflate(&Stream, Z_SYNC_FLUSH); + + //Y_ASSERT(Stream.avail_in == 0); + + switch (err) { + case Z_OK: + // there is no data in next_out yet + if (BufSize == Stream.avail_out) + continue; + [[fallthrough]]; // don't break or return; continue with Z_STREAM_END case + + case Z_STREAM_END: + if (Stream.total_out > MaxContSize) { + ZErr = E2BIG; + return -1; + } + if (!IgnoreTrailingGarbage && BufSize == Stream.avail_out && Stream.avail_in > 0) { + Header->error = EXT_HTTP_GZIPERROR; + ZErr = EFAULT; + Stream.msg = (char*)"trailing garbage"; + return -1; + } + return long(BufSize - Stream.avail_out); + + case Z_NEED_DICT: + case Z_DATA_ERROR: + Header->error = EXT_HTTP_GZIPERROR; + ZErr = EFAULT; + return -1; + + case Z_MEM_ERROR: + ZErr = ENOMEM; + return -1; + + default: + ZErr = EINVAL; + return -1; + } + } + + return -1; + } + + const char* ZMsg() const { + return Stream.msg; + } + + int ZError() const { + return ZErr; + } + + size_t GetCurContSize() const { + return CompressedInput ? 
Stream.total_out : CurContSize; + } + +protected: + int SetCompression(const int compression, const size_t bufSize, + const unsigned int winSize) { + ClearStream(); + + int winsize = winSize; + switch ((enum HTTP_COMPRESSION)compression) { + case HTTP_COMPRESSION_UNSET: + case HTTP_COMPRESSION_IDENTITY: + CompressedInput = false; + return 0; + case HTTP_COMPRESSION_GZIP: + CompressedInput = true; + winsize += 16; // 16 indicates gzip, see zlib.h + break; + case HTTP_COMPRESSION_DEFLATE: + CompressedInput = true; + winsize = -winsize; // negative indicates raw deflate stream, see zlib.h + break; + case HTTP_COMPRESSION_COMPRESS: + case HTTP_COMPRESSION_ERROR: + default: + CompressedInput = false; + ZErr = ENOTSUP; + return -1; + } + + if (bufSize != BufSize) { + if (Buf) + free(Buf); + Buf = (ui8*)malloc(bufSize); + if (!Buf) { + ZErr = ENOMEM; + return -1; + } + BufSize = bufSize; + } + + int err = inflateInit2(&Stream, winsize); + switch (err) { + case Z_OK: + Stream.total_in = 0; + Stream.total_out = 0; + Stream.avail_in = 0; + return 0; + + case Z_DATA_ERROR: // never happens, see zlib.h + CompressedInput = false; + ZErr = EFAULT; + return -1; + + case Z_MEM_ERROR: + CompressedInput = false; + ZErr = ENOMEM; + return -1; + + default: + CompressedInput = false; + ZErr = EINVAL; + return -1; + } + } + + void ClearStream() { + if (CompressedInput) { + inflateEnd(&Stream); + CompressedInput = false; + } + } + + z_stream Stream; + bool CompressedInput; + size_t BufSize; + size_t CurContSize, MaxContSize; + ui8* Buf; + int ZErr; + int ConnectionClosed; + bool IgnoreTrailingGarbage; +}; + +class zlib_exception: public yexception { +}; + +template <class Reader> +class SCompressedHttpReader: public TCompressedHttpReader<Reader> { + typedef TCompressedHttpReader<Reader> TBase; + +public: + using TBase::ZError; + using TBase::ZMsg; + + SCompressedHttpReader() + : TBase() + { + } + + int Init( + THttpHeader* H, + int parsHeader, + const size_t maxContSize = Max<size_t>(), 
+ const size_t bufSize = TBase::DefaultBufSize, + const unsigned int winSize = TBase::DefaultWinSize, + bool headRequest = false) + { + int ret = TBase::Init(H, parsHeader, maxContSize, bufSize, winSize, headRequest); + return (int)HandleRetValue((long)ret); + } + + long Read(void*& buf) { + long ret = TBase::Read(buf); + return HandleRetValue(ret); + } + +protected: + long HandleRetValue(long ret) { + switch (ZError()) { + case 0: + return ret; + case ENOMEM: + ythrow yexception() << "SCompressedHttpReader: not enough memory"; + case EINVAL: + ythrow yexception() << "SCompressedHttpReader: zlib error: " << ZMsg(); + case ENOTSUP: + ythrow yexception() << "SCompressedHttpReader: unsupported compression method"; + case EFAULT: + ythrow zlib_exception() << "SCompressedHttpReader: " << ZMsg(); + case E2BIG: + ythrow zlib_exception() << "SCompressedHttpReader: Content exceeds maximum length"; + default: + ythrow yexception() << "SCompressedHttpReader: unknown error"; + } + } +}; diff --git a/library/cpp/http/fetch/library-htfetch_ut_hreflang_in.h b/library/cpp/http/fetch/library-htfetch_ut_hreflang_in.h new file mode 100644 index 0000000000..0df89bdc79 --- /dev/null +++ b/library/cpp/http/fetch/library-htfetch_ut_hreflang_in.h @@ -0,0 +1,155 @@ +#pragma once + +char hreflang_ut_in[] = "HTTP/1.1 200 OK\n" + "Date: Thu, 15 Nov 2012 22:38:28 GMT\n" + "Server: Apache/2\n" + "X-Powered-By: PHP/5.2.17\n" + "Set-Cookie: PHPSESSID=6d69474d1cc019d7d82714c9472bc6d6; path=/\n" + "Expires: Thu, 19 Nov 1981 08:52:00 GMT\n" + "Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0\n" + "Pragma: no-cache\n" + "Link: <http://www.forexticket.cn.com/zh/currency/converter-EEK-XAG>; rel='alternate'; hreflang='zh-CN'\n" + "Link: <http://www.forexticket.tw/zh/currency/converter-EEK-XAG>; rel='alternate'; hreflang='zh-TW'\n" + "Link: <http://www.forexticket.hk/zh/currency/converter-EEK-XAG>; rel='alternate'; hreflang='zh-HK'\n" + "Link: 
<http://www.forexticket.sg/zh/currency/converter-EEK-XAG>; rel='alternate'; hreflang='zh-SG'\n" + "Link: <http://www.forexticket.in/hi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='hi-IN'\n" + "Link: <http://www.forexticket.com.fj/hi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='hi-FJ'\n" + "Link: <http://www.forexticket.in/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-IN'\n" + "Link: <http://www.forexticket.us/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-US'\n" + "Link: <http://www.forexticket.com.pk/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-PK'\n" + "Link: <http://www.forexticket-bd.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-BD'\n" + "Link: <http://www.forexticket-ng.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-NG'\n" + "Link: <http://www.forexticket.co.uk/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-GB'\n" + "Link: <http://www.forexticket.co.za/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-ZA'\n" + "Link: <http://www.forexticket.co.ke/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-KE'\n" + "Link: <http://www.forexticket.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-CA'\n" + "Link: <http://www.forexticket-gh.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-GH'\n" + "Link: <http://www.forexticket.biz/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-AU'\n" + "Link: <http://www.forexticket.cm/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-CM'\n" + "Link: <http://www.forexticket-kh.com/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-KH'\n" + "Link: <http://www.forexticket.hk/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-HK'\n" + "Link: <http://www.forexticket.la/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-LA'\n" + "Link: <http://www.forexticket.sg/en/currency/converter-EEK-XAG>; 
rel='alternate'; hreflang='en-SG'\n" + "Link: <http://www.forexticket.co.nz/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-NZ'\n" + "Link: <http://www.forexticket.com.pr/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-PR'\n" + "Link: <http://www.forexticket.com.fj/en/currency/converter-EEK-XAG>; rel='alternate'; hreflang='en-FJ'\n" + "Link: <http://www.forexticket.us/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-US'\n" + "Link: <http://www.forexticket.mx/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-MX'\n" + "Link: <http://www.forexticket.co/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-CO'\n" + "Link: <http://www.forexticket.com.ar/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-AR'\n" + "Link: <http://www.forexticket-pe.com/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-PE'\n" + "Link: <http://www.forexticket.co.ve/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-VE'\n" + "Link: <http://www.forexticket.cl/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-CL'\n" + "Link: <http://www.forexticket.ec/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-EC'\n" + "Link: <http://www.forexticket.com.gt/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-GT'\n" + "Link: <http://www.forexticket.bo/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-BO'\n" + "Link: <http://www.forexticket.hn/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-HN'\n" + "Link: <http://www.forexticket.com.py/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-PY'\n" + "Link: <http://www.forexticket.es/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-ES'\n" + "Link: <http://www.forexticket.com.sv/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-SV'\n" + "Link: <http://www.forexticket.com.ni/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-NI'\n" + "Link: <http://www.forexticket.co.cr/es/cambio/divisas-EEK-XAG>; rel='alternate'; 
hreflang='es-CR'\n" + "Link: <http://www.forexticket.com.pr/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-PR'\n" + "Link: <http://www.forexticket.com.uy/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-UY'\n" + "Link: <http://www.forexticket.com.pa/es/cambio/divisas-EEK-XAG>; rel='alternate'; hreflang='es-PA'\n" + "Link: <http://www.forexticket.asia.com/id/currency/converter-EEK-XAG>; rel='alternate'; hreflang='id-ID'\n" + "Link: <http://www.forexticket.com.br/pt/moeda/conversor-EEK-XAG>; rel='alternate'; hreflang='pt-BR'\n" + "Link: <http://www.forexticket-mz.com/pt/moeda/conversor-EEK-XAG>; rel='alternate'; hreflang='pt-MZ'\n" + "Link: <http://www.forexticket.com.pt/pt/moeda/conversor-EEK-XAG>; rel='alternate'; hreflang='pt-PT'\n" + "Link: <http://www.forexticket.tl/pt/moeda/conversor-EEK-XAG>; rel='alternate'; hreflang='pt-TL'\n" + "Link: <http://www.forexticket.ru/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-RU'\n" + "Link: <http://www.forexticket-kz.com/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-KZ'\n" + "Link: <http://www.forexticket-tj.com/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-TJ'\n" + "Link: <http://www.forexticket-kg.com/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-KG'\n" + "Link: <http://www.forexticket-ge.com/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-GE'\n" + "Link: <http://www.forexticket.mn/ru/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ru-MN'\n" + "Link: <http://www.forexticket.jp/ja/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ja-JP'\n" + "Link: <http://www.forexticket-ph.com/tl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='tl-PH'\n" + "Link: <http://www.forexticket.vn/vi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='vi-VN'\n" + "Link: <http://www.forexticket.de/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-DE'\n" + "Link: 
<http://www.forexticket.be/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-BE'\n" + "Link: <http://www.forexticket.at/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-AT'\n" + "Link: <http://www.forexticket.ch/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-CH'\n" + "Link: <http://www.forexticket.lu/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-LU'\n" + "Link: <http://www.forexticket.li/de/waehrungsumrechner/devisen-EEK-XAG>; rel='alternate'; hreflang='de-LI'\n" + "Link: <http://www.forexticket.de/de/waehrungsumrechner/devisen-EEK-XAG>; rel='canonical'\n" + "Link: <http://www.forexticket-eg.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-EG'\n" + "Link: <http://www.forexticket-dz.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-DZ'\n" + "Link: <http://www.forexticket-ma.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-MA'\n" + "Link: <http://www.forexticket-iq.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-IQ'\n" + "Link: <http://www.forexticket-sa.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-SA'\n" + "Link: <http://www.forexticket-sy.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-SY'\n" + "Link: <http://www.forexticket-tn.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-TN'\n" + "Link: <http://www.forexticket-td.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-TD'\n" + "Link: <http://www.forexticket-so.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-SO'\n" + "Link: <http://www.forexticket.co.il/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-IL'\n" + "Link: <http://www.forexticket-jo.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-JO'\n" + "Link: <http://www.forexticket.ae/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-AE'\n" + "Link: 
<http://www.forexticket-lb.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-LB'\n" + "Link: <http://www.forexticket-om.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-OM'\n" + "Link: <http://www.forexticket-kw.com/ar/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ar-KW'\n" + "Link: <http://www.forexticket-tr.com/tr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='tr-TR'\n" + "Link: <http://www.forexticket-bg.com/tr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='tr-BG'\n" + "Link: <http://www.forexticket-cy.com/tr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='tr-CY'\n" + "Link: <http://www.forexticket.ir/fa/currency/converter-EEK-XAG>; rel='alternate'; hreflang='fa-IR'\n" + "Link: <http://www.forexticket.af/fa/currency/converter-EEK-XAG>; rel='alternate'; hreflang='fa-AF'\n" + "Link: <http://www.forexticket.cd/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-CD'\n" + "Link: <http://www.forexticket.fr/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-FR'\n" + "Link: <http://www.forexticket.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-CA'\n" + "Link: <http://www.forexticket.mg/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-MG'\n" + "Link: <http://www.forexticket.cm/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-CM'\n" + "Link: <http://www.forexticket-kh.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-KH'\n" + "Link: <http://www.forexticket-ml.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-ML'\n" + "Link: <http://www.forexticket-sn.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-SN'\n" + "Link: <http://www.forexticket-tn.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-TN'\n" + "Link: <http://www.forexticket-td.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-TD'\n" + "Link: <http://www.forexticket.be/fr/conversion/monnaie-EEK-XAG>; 
rel='alternate'; hreflang='fr-BE'\n" + "Link: <http://www.forexticket-gn.com/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-GN'\n" + "Link: <http://www.forexticket.ht/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-HT'\n" + "Link: <http://www.forexticket.ch/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-CH'\n" + "Link: <http://www.forexticket.la/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-LA'\n" + "Link: <http://www.forexticket.lu/fr/conversion/monnaie-EEK-XAG>; rel='alternate'; hreflang='fr-LU'\n" + "Link: <http://www.forexticket-th.com/th/currency/converter-EEK-XAG>; rel='alternate'; hreflang='th-TH'\n" + "Link: <http://www.forexticket.co.uk/cy/currency/converter-EEK-XAG>; rel='alternate'; hreflang='cy-GB'\n" + "Link: <http://www.forexticket.co.uk/ga/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ga-GB'\n" + "Link: <http://www.forexticket.it/it/convertitore/valuta-EEK-XAG>; rel='alternate'; hreflang='it-IT'\n" + "Link: <http://www.forexticket.ch/it/convertitore/valuta-EEK-XAG>; rel='alternate'; hreflang='it-CH'\n" + "Link: <http://www.forexticket.co.za/af/currency/converter-EEK-XAG>; rel='alternate'; hreflang='af-ZA'\n" + "Link: <http://www.forexticket.kr/ko/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ko-KR'\n" + "Link: <http://www.forexticket-ua.com/uk/currency/converter-EEK-XAG>; rel='alternate'; hreflang='uk-UA'\n" + "Link: <http://www.forexticket-tz.com/sw/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sw-TZ'\n" + "Link: <http://www.forexticket.co.ke/sw/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sw-KE'\n" + "Link: <http://www.forexticket.pl/pl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='pl-PL'\n" + "Link: <http://www.forexticket.com.my/ms/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ms-MY'\n" + "Link: <http://www.forexticket.sg/ms/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ms-SG'\n" + "Link: 
<http://www.forexticket.ro/ro/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ro-RO'\n" + "Link: <http://www.forexticket.nl/nl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='nl-NL'\n" + "Link: <http://www.forexticket.be/nl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='nl-BE'\n" + "Link: <http://www.forexticket.gr/el/currency/converter-EEK-XAG>; rel='alternate'; hreflang='el-GR'\n" + "Link: <http://www.forexticket-al.com/el/currency/converter-EEK-XAG>; rel='alternate'; hreflang='el-AL'\n" + "Link: <http://www.forexticket-cy.com/el/currency/converter-EEK-XAG>; rel='alternate'; hreflang='el-CY'\n" + "Link: <http://www.forexticket.cz/cs/currency/converter-EEK-XAG>; rel='alternate'; hreflang='cs-CZ'\n" + "Link: <http://www.forexticket.hu/hu/currency/converter-EEK-XAG>; rel='alternate'; hreflang='hu-HU'\n" + "Link: <http://www.forexticket.se/sv/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sv-SE'\n" + "Link: <http://www.forexticket.eu/sv/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sv-FI'\n" + "Link: <http://www.forexticket.co.il/iw/currency/converter-EEK-XAG>; rel='alternate'; hreflang='iw-IL'\n" + "Link: <http://www.forexticket.co.il/yi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='yi-IL'\n" + "Link: <http://www.forexticket-bg.com/bg/currency/converter-EEK-XAG>; rel='alternate'; hreflang='bg-BG'\n" + "Link: <http://www.forexticket.es/ca/currency/converter-EEK-XAG>; rel='alternate'; hreflang='ca-ES'\n" + "Link: <http://www.forexticket.es/gl/currency/converter-EEK-XAG>; rel='alternate'; hreflang='gl-ES'\n" + "Link: <http://www.forexticket.dk/da/currency/converter-EEK-XAG>; rel='alternate'; hreflang='da-DK'\n" + "Link: <http://www.forexticket.eu/fi/currency/converter-EEK-XAG>; rel='alternate'; hreflang='fi-FI'\n" + "Link: <http://www.forexticket-hr.com/hr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='hr-HR'\n" + "Link: <http://www.forexticket-hr.com/sr/currency/converter-EEK-XAG>; rel='alternate'; 
hreflang='sr-HR'\n" + "Link: <http://www.forexticket.me/sr/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sr-ME'\n" + "Link: <http://www.forexticket.lt/lt/currency/converter-EEK-XAG>; rel='alternate'; hreflang='lt-LT'\n" + "Link: <http://www.forexticket-al.com/sq/currency/converter-EEK-XAG>; rel='alternate'; hreflang='sq-AL'\n" + "Link: <http://www.forexticket.lv/lv/currency/converter-EEK-XAG>; rel='alternate'; hreflang='lv-LV'\n" + "Link: <http://www.forexticket.co.ee/et/currency/converter-EEK-XAG>; rel='alternate'; hreflang='et-EE'\n" + "Vary: Accept-Encoding,User-Agent\n" + "Content-Encoding: gzip\n" + "Keep-Alive: timeout=1, max=100\n" + "Connection: Keep-Alive\n" + "Transfer-Encoding: chunked\n" + "Content-Type: text/html\n" + "\n"; diff --git a/library/cpp/http/fetch/library-htfetch_ut_hreflang_out.h b/library/cpp/http/fetch/library-htfetch_ut_hreflang_out.h new file mode 100644 index 0000000000..bef8bacff5 --- /dev/null +++ b/library/cpp/http/fetch/library-htfetch_ut_hreflang_out.h @@ -0,0 +1,3 @@ +#pragma once + +char hreflang_ut_out[] = "zh-CN http://www.forexticket.cn.com/zh/currency/converter-EEK-XAG\tzh-TW http://www.forexticket.tw/zh/currency/converter-EEK-XAG\tzh-HK http://www.forexticket.hk/zh/currency/converter-EEK-XAG\tzh-SG http://www.forexticket.sg/zh/currency/converter-EEK-XAG\thi-IN http://www.forexticket.in/hi/currency/converter-EEK-XAG\thi-FJ http://www.forexticket.com.fj/hi/currency/converter-EEK-XAG\ten-IN http://www.forexticket.in/en/currency/converter-EEK-XAG\ten-US http://www.forexticket.us/en/currency/converter-EEK-XAG\ten-PK http://www.forexticket.com.pk/en/currency/converter-EEK-XAG\ten-BD http://www.forexticket-bd.com/en/currency/converter-EEK-XAG\ten-NG http://www.forexticket-ng.com/en/currency/converter-EEK-XAG\ten-GB http://www.forexticket.co.uk/en/currency/converter-EEK-XAG\ten-ZA http://www.forexticket.co.za/en/currency/converter-EEK-XAG\ten-KE http://www.forexticket.co.ke/en/currency/converter-EEK-XAG\ten-CA 
http://www.forexticket.com/en/currency/converter-EEK-XAG\ten-GH http://www.forexticket-gh.com/en/currency/converter-EEK-XAG\ten-AU http://www.forexticket.biz/en/currency/converter-EEK-XAG\ten-CM http://www.forexticket.cm/en/currency/converter-EEK-XAG\ten-KH http://www.forexticket-kh.com/en/currency/converter-EEK-XAG\ten-HK http://www.forexticket.hk/en/currency/converter-EEK-XAG\ten-LA http://www.forexticket.la/en/currency/converter-EEK-XAG\ten-SG http://www.forexticket.sg/en/currency/converter-EEK-XAG\ten-NZ http://www.forexticket.co.nz/en/currency/converter-EEK-XAG\ten-PR http://www.forexticket.com.pr/en/currency/converter-EEK-XAG\ten-FJ http://www.forexticket.com.fj/en/currency/converter-EEK-XAG\tes-US http://www.forexticket.us/es/cambio/divisas-EEK-XAG\tes-MX http://www.forexticket.mx/es/cambio/divisas-EEK-XAG\tes-CO http://www.forexticket.co/es/cambio/divisas-EEK-XAG\tes-AR http://www.forexticket.com.ar/es/cambio/divisas-EEK-XAG\tes-PE http://www.forexticket-pe.com/es/cambio/divisas-EEK-XAG\tes-VE http://www.forexticket.co.ve/es/cambio/divisas-EEK-XAG\tes-CL http://www.forexticket.cl/es/cambio/divisas-EEK-XAG\tes-EC http://www.forexticket.ec/es/cambio/divisas-EEK-XAG\tes-GT http://www.forexticket.com.gt/es/cambio/divisas-EEK-XAG\tes-BO http://www.forexticket.bo/es/cambio/divisas-EEK-XAG\tes-HN http://www.forexticket.hn/es/cambio/divisas-EEK-XAG\tes-PY http://www.forexticket.com.py/es/cambio/divisas-EEK-XAG\tes-ES http://www.forexticket.es/es/cambio/divisas-EEK-XAG\tes-SV http://www.forexticket.com.sv/es/cambio/divisas-EEK-XAG\tes-NI http://www.forexticket.com.ni/es/cambio/divisas-EEK-XAG\tes-CR http://www.forexticket.co.cr/es/cambio/divisas-EEK-XAG\tes-PR http://www.forexticket.com.pr/es/cambio/divisas-EEK-XAG\tes-UY http://www.forexticket.com.uy/es/cambio/divisas-EEK-XAG\tes-PA http://www.forexticket.com.pa/es/cambio/divisas-EEK-XAG\tid-ID http://www.forexticket.asia.com/id/currency/converter-EEK-XAG\tpt-BR 
http://www.forexticket.com.br/pt/moeda/conversor-EEK-XAG\tpt-MZ http://www.forexticket-mz.com/pt/moeda/conversor-EEK-XAG\tpt-PT http://www.forexticket.com.pt/pt/moeda/conversor-EEK-XAG\tpt-TL http://www.forexticket.tl/pt/moeda/conversor-EEK-XAG\tru-RU http://www.forexticket.ru/ru/currency/converter-EEK-XAG\tru-KZ http://www.forexticket-kz.com/ru/currency/converter-EEK-XAG\tru-TJ http://www.forexticket-tj.com/ru/currency/converter-EEK-XAG\tru-KG http://www.forexticket-kg.com/ru/currency/converter-EEK-XAG\tru-GE http://www.forexticket-ge.com/ru/currency/converter-EEK-XAG\tru-MN http://www.forexticket.mn/ru/currency/converter-EEK-XAG\tja-JP http://www.forexticket.jp/ja/currency/converter-EEK-XAG\ttl-PH http://www.forexticket-ph.com/tl/currency/converter-EEK-XAG\tvi-VN http://www.forexticket.vn/vi/currency/converter-EEK-XAG\tde-DE http://www.forexticket.de/de/waehrungsumrechner/devisen-EEK-XAG\tde-BE http://www.forexticket.be/de/waehrungsumrechner/devisen-EEK-XAG\tde-AT http://www.forexticket.at/de/waehrungsumrechner/devisen-EEK-XAG\tde-CH http://www.forexticket.ch/de/waehrungsumrechner/devisen-EEK-XAG\tde-LU http://www.forexticket.lu/de/waehrungsumrechner/devisen-EEK-XAG\tde-LI http://www.forexticket.li/de/waehrungsumrechner/devisen-EEK-XAG\tar-EG http://www.forexticket-eg.com/ar/currency/converter-EEK-XAG\tar-DZ http://www.forexticket-dz.com/ar/currency/converter-EEK-XAG\tar-MA http://www.forexticket-ma.com/ar/currency/converter-EEK-XAG\tar-IQ http://www.forexticket-iq.com/ar/currency/converter-EEK-XAG\tar-SA http://www.forexticket-sa.com/ar/currency/converter-EEK-XAG\tar-SY http://www.forexticket-sy.com/ar/currency/converter-EEK-XAG\tar-TN http://www.forexticket-tn.com/ar/currency/converter-EEK-XAG\tar-TD http://www.forexticket-td.com/ar/currency/converter-EEK-XAG\tar-SO http://www.forexticket-so.com/ar/currency/converter-EEK-XAG\tar-IL http://www.forexticket.co.il/ar/currency/converter-EEK-XAG\tar-JO 
http://www.forexticket-jo.com/ar/currency/converter-EEK-XAG\tar-AE http://www.forexticket.ae/ar/currency/converter-EEK-XAG\tar-LB http://www.forexticket-lb.com/ar/currency/converter-EEK-XAG\tar-OM http://www.forexticket-om.com/ar/currency/converter-EEK-XAG\tar-KW http://www.forexticket-kw.com/ar/currency/converter-EEK-XAG\ttr-TR http://www.forexticket-tr.com/tr/currency/converter-EEK-XAG\ttr-BG http://www.forexticket-bg.com/tr/currency/converter-EEK-XAG\ttr-CY http://www.forexticket-cy.com/tr/currency/converter-EEK-XAG\tfa-IR http://www.forexticket.ir/fa/currency/converter-EEK-XAG\tfa-AF http://www.forexticket.af/fa/currency/converter-EEK-XAG\tfr-CD http://www.forexticket.cd/fr/conversion/monnaie-EEK-XAG\tfr-FR http://www.forexticket.fr/fr/conversion/monnaie-EEK-XAG\tfr-CA http://www.forexticket.com/fr/conversion/monnaie-EEK-XAG\tfr-MG http://www.forexticket.mg/fr/conversion/monnaie-EEK-XAG\tfr-CM http://www.forexticket.cm/fr/conversion/monnaie-EEK-XAG\tfr-KH http://www.forexticket-kh.com/fr/conversion/monnaie-EEK-XAG\tfr-ML http://www.forexticket-ml.com/fr/conversion/monnaie-EEK-XAG\tfr-SN http://www.forexticket-sn.com/fr/conversion/monnaie-EEK-XAG\tfr-TN http://www.forexticket-tn.com/fr/conversion/monnaie-EEK-XAG\tfr-TD http://www.forexticket-td.com/fr/conversion/monnaie-EEK-XAG\tfr-BE http://www.forexticket.be/fr/conversion/monnaie-EEK-XAG\tfr-GN http://www.forexticket-gn.com/fr/conversion/monnaie-EEK-XAG\tfr-HT http://www.forexticket.ht/fr/conversion/monnaie-EEK-XAG\tfr-CH http://www.forexticket.ch/fr/conversion/monnaie-EEK-XAG\tfr-LA http://www.forexticket.la/fr/conversion/monnaie-EEK-XAG\tfr-LU http://www.forexticket.lu/fr/conversion/monnaie-EEK-XAG\tth-TH http://www.forexticket-th.com/th/currency/converter-EEK-XAG\tcy-GB http://www.forexticket.co.uk/cy/currency/converter-EEK-XAG\tga-GB http://www.forexticket.co.uk/ga/currency/converter-EEK-XAG\tit-IT http://www.forexticket.it/it/convertitore/valuta-EEK-XAG\tit-CH 
http://www.forexticket.ch/it/convertitore/valuta-EEK-XAG\taf-ZA http://www.forexticket.co.za/af/currency/converter-EEK-XAG\tko-KR http://www.forexticket.kr/ko/currency/converter-EEK-XAG\tuk-UA http://www.forexticket-ua.com/uk/currency/converter-EEK-XAG\tsw-TZ http://www.forexticket-tz.com/sw/currency/converter-EEK-XAG\tsw-KE http://www.forexticket.co.ke/sw/currency/converter-EEK-XAG\tpl-PL http://www.forexticket.pl/pl/currency/converter-EEK-XAG\tms-MY http://www.forexticket.com.my/ms/currency/converter-EEK-XAG\tms-SG http://www.forexticket.sg/ms/currency/converter-EEK-XAG\tro-RO http://www.forexticket.ro/ro/currency/converter-EEK-XAG\tnl-NL http://www.forexticket.nl/nl/currency/converter-EEK-XAG\tnl-BE http://www.forexticket.be/nl/currency/converter-EEK-XAG\tel-GR http://www.forexticket.gr/el/currency/converter-EEK-XAG\tel-AL http://www.forexticket-al.com/el/currency/converter-EEK-XAG\tel-CY http://www.forexticket-cy.com/el/currency/converter-EEK-XAG\tcs-CZ http://www.forexticket.cz/cs/currency/converter-EEK-XAG\thu-HU http://www.forexticket.hu/hu/currency/converter-EEK-XAG\tsv-SE http://www.forexticket.se/sv/currency/converter-EEK-XAG\tsv-FI http://www.forexticket.eu/sv/currency/converter-EEK-XAG\tiw-IL http://www.forexticket.co.il/iw/currency/converter-EEK-XAG\tyi-IL http://www.forexticket.co.il/yi/currency/converter-EEK-XAG\tbg-BG http://www.forexticket-bg.com/bg/currency/converter-EEK-XAG\tca-ES http://www.forexticket.es/ca/currency/converter-EEK-XAG\tgl-ES http://www.forexticket.es/gl/currency/converter-EEK-XAG\tda-DK http://www.forexticket.dk/da/currency/converter-EEK-XAG\tfi-FI http://www.forexticket.eu/fi/currency/converter-EEK-XAG\thr-HR http://www.forexticket-hr.com/hr/currency/converter-EEK-XAG\tsr-HR http://www.forexticket-hr.com/sr/currency/converter-EEK-XAG\tsr-ME http://www.forexticket.me/sr/currency/converter-EEK-XAG\tlt-LT http://www.forexticket.lt/lt/currency/converter-EEK-XAG\tsq-AL 
http://www.forexticket-al.com/sq/currency/converter-EEK-XAG\tlv-LV http://www.forexticket.lv/lv/currency/converter-EEK-XAG\tet-EE http://www.forexticket.co.ee/et/currency/converter-EEK-XAG"; diff --git a/library/cpp/http/fetch/sockhandler.h b/library/cpp/http/fetch/sockhandler.h new file mode 100644 index 0000000000..e18149f657 --- /dev/null +++ b/library/cpp/http/fetch/sockhandler.h @@ -0,0 +1,130 @@ +#pragma once + +#include <library/cpp/logger/all.h> + +#include <util/generic/buffer.h> +#include <util/generic/map.h> +#include <util/generic/vector.h> +#include <util/network/address.h> +#include <util/network/ip.h> +#include <util/network/socket.h> +#include <util/system/mutex.h> +#include <util/system/yassert.h> + +#include <cerrno> +#include <util/generic/noncopyable.h> + +class TAddrList: public TVector<NAddr::IRemoteAddrRef> { +private: + using TBase = TVector<NAddr::IRemoteAddrRef>; + +public: + //msvc doesn't support base class constructor inheritance + TAddrList() = default; + + template <typename T> + TAddrList(T&& arg) + : TBase(std::forward<T>(arg)) + { + } + + template <typename T1, typename T2> + TAddrList(T1&& arg1, T2&& arg2) + : TBase(std::forward<T1>(arg1), std::forward<T2>(arg2)) + { + } + + TAddrList(std::initializer_list<NAddr::IRemoteAddrRef> list) + : TBase(list) + { + } + + static TAddrList MakeV4Addr(ui32 ip, TIpPort port) { + return TAddrList({new NAddr::TIPv4Addr(TIpAddress(htonl(ip), htons(port)))}); + } + + std::pair<ui32, TIpPort> GetV4Addr() const { + for (const auto& addrRef : *this) { + const sockaddr* sa = addrRef->Addr(); + if (sa->sa_family == AF_INET) { + const sockaddr_in* sin = reinterpret_cast<const sockaddr_in*>(sa); + return std::make_pair(ntohl(sin->sin_addr.s_addr), ntohs(sin->sin_port)); + } + } + return std::make_pair(0, 0); + } +}; + +class TSimpleSocketHandler { +public: + TSimpleSocketHandler() = default; + + int Good() const { + return static_cast<bool>(Socket); + } + + int Connect(const TAddrList& addrs, TDuration 
timeout) { + try { + for (const auto& item : addrs) { + const sockaddr* sa = item->Addr(); + TSocketHolder s(socket(sa->sa_family, SOCK_STREAM, 0)); + if (s.Closed()) { + continue; + } + +#ifndef WIN32 + if (fcntl(s, F_SETFD, FD_CLOEXEC)) // no inherit on fork()/exec() + return errno ? errno : EBADF; +#endif + if (connect(s, sa, item->Len())) { + s.Close(); + continue; + } + + Socket.Reset(new TSocket(s.Release())); + Socket->SetSocketTimeout(timeout.Seconds(), timeout.MilliSecondsOfSecond()); + Socket->SetZeroLinger(); + Socket->SetKeepAlive(true); + return 0; + } + } catch (...) { + return EBADF; + } + return errno ? errno : EBADF; + } + + void Disconnect() { + if (!Socket) + return; + Socket->ShutDown(SHUT_RDWR); + Socket.Destroy(); + } + + void SetSocket(SOCKET fd) { + Socket.Reset(new TSocket(fd)); + } + + void shutdown() { + Socket->ShutDown(SHUT_WR); + } + + int send(const void* message, size_t messlen) { + return ((ssize_t)messlen == Socket->Send(message, messlen)); + } + + int peek() { + char buf[1]; + return (1 == recv(*Socket, buf, 1, MSG_PEEK)); + } + + ssize_t read(void* buffer, size_t buflen) { + return Socket->Recv(buffer, buflen); + } + + THolder<TSocket> PickOutSocket() { + return std::move(Socket); + } + +protected: + THolder<TSocket> Socket; +}; diff --git a/library/cpp/http/fetch/ut/ya.make b/library/cpp/http/fetch/ut/ya.make new file mode 100644 index 0000000000..7486986b36 --- /dev/null +++ b/library/cpp/http/fetch/ut/ya.make @@ -0,0 +1,12 @@ +UNITTEST_FOR(library/cpp/http/fetch) + +OWNER( + g:zora +) + +SRCS( + httpfsm_ut.cpp + httpparser_ut.cpp +) + +END() diff --git a/library/cpp/http/fetch/ya.make b/library/cpp/http/fetch/ya.make new file mode 100644 index 0000000000..7737127463 --- /dev/null +++ b/library/cpp/http/fetch/ya.make @@ -0,0 +1,38 @@ +LIBRARY() + +OWNER( + g:zora +) + +PEERDIR( + contrib/libs/zlib + library/cpp/charset + library/cpp/digest/md5 + library/cpp/http/misc + library/cpp/logger + library/cpp/mime/types + 
library/cpp/uri +) + +SRCS( + http_digest.cpp + http_socket.cpp + httpheader.cpp + httpload.cpp + exthttpcodes.cpp + httpfsm.rl6 + httpagent.h + httpfetcher.h + httpheader.h + httpparser.h + httpzreader.h + sockhandler.h +) + +GENERATE_ENUM_SERIALIZATION(httpheader.h) + +SET(RAGEL6_FLAGS -CF1) + +END() + +RECURSE_FOR_TESTS(ut) |