diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/uri/common.h | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/uri/common.h')
-rw-r--r-- | library/cpp/uri/common.h | 511 |
1 files changed, 511 insertions, 0 deletions
diff --git a/library/cpp/uri/common.h b/library/cpp/uri/common.h new file mode 100644 index 0000000000..8025357763 --- /dev/null +++ b/library/cpp/uri/common.h @@ -0,0 +1,511 @@ +#pragma once + +#include <util/stream/output.h> +#include <util/system/compat.h> +#include <util/generic/strbuf.h> + +namespace NUri { + namespace NEncode { + class TEncoder; + class TEncodeMapperBase; + struct TCharFlags; + } + + namespace NParse { + class TRange; + } + + class TParser; + + struct TField { +#define FIELD_NAME(f) Field##f +#define FIELD_FLAG(f) Flag##f = 1U << FIELD_NAME(f) + + enum EField { + FIELD_NAME(Scheme), + FIELD_NAME(User), + FIELD_NAME(Pass), + FIELD_NAME(Host), + FIELD_NAME(Port), + FIELD_NAME(Path), + FIELD_NAME(Query), + FIELD_NAME(Frag), + + // add fields above + FieldUrlMAX, + // reset count so actual field offsets are not interrupted + FieldUrlLast = FieldUrlMAX - 1, + // add extra fields below + + FIELD_NAME(HostAscii), + + // add extra fields above + FieldAllMAX, + // add aliases below + + FieldUsername = FieldUser, + FieldPassword = FieldPass, + FieldFragment = FieldFrag, + }; + + enum EFlags { + FIELD_FLAG(Scheme), + FIELD_FLAG(User), + FIELD_FLAG(Pass), + FIELD_FLAG(Host), + FIELD_FLAG(Port), + FIELD_FLAG(Path), + FIELD_FLAG(Query), + FIELD_FLAG(Frag), + FIELD_FLAG(UrlMAX), + FIELD_FLAG(HostAscii), + FIELD_FLAG(AllMAX), + + FlagHostPort = FlagHost | FlagPort, + FlagAuth = FlagUser | FlagPass, + FlagFragment = FlagFrag, + FlagAction = FlagScheme | FlagHostPort | FlagPath, + FlagNoFrag = FlagAction | FlagQuery, + FlagUrlFields = FlagUrlMAX - 1, + FlagAll = FlagUrlFields, // obsolete, for backwards compatibility + FlagAllFields = FlagAllMAX - 1 + }; + +#undef FIELD_NAME +#undef FIELD_FLAG + }; + + struct TState { + enum EParsed { + ParsedOK = 0, + ParsedEmpty = 1, + ParsedOpaque = 2, + ParsedRootless = ParsedOpaque, + ParsedBadFormat, // must follow all non-error states immediately + ParsedBadPath, + ParsedTooLong, + ParsedBadPort, + ParsedBadAuth, + ParsedBadScheme, + ParsedBadHost, + + // add before this line + ParsedMAX + }; + }; + + struct TScheme { + // don't forget to define a SchemeRegistry entry + enum EKind { + SchemeEmpty + // add schemes below this line + , + SchemeHTTP, + SchemeHTTPS, + SchemeFTP, + SchemeFILE, + SchemeWS, + SchemeWSS + // add schemes above this line + , + SchemeUnknown + }; + }; + + class TFeature { + friend class NEncode::TEncoder; + friend class NEncode::TEncodeMapperBase; + friend struct NEncode::TCharFlags; + friend class TParser; + friend class NParse::TRange; + +#define FEATURE_NAME(f) _BitFeature##f +#define FEATURE_FLAG_NAME(f) Feature##f +#define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1UL << FEATURE_NAME(f) + + protected: + enum EBit { + //============================== + // Cases interpreted as errors: + //============================== + + // allows authorization user/password in URL + FEATURE_NAME(AuthSupported), + + // allows all known schemes in URL + FEATURE_NAME(SchemeKnown), + + // allows all schemes, not only known + FEATURE_NAME(SchemeFlexible), + + // allow opaque (RFC 2396) or rootless (RFC 3986) urls + FEATURE_NAME(AllowRootless), + + //============================== + // Cases interpreted for processing (if required): + // (effects on result of Parse method) + //============================== + + // path needs normalization + // (simplification of directory tree: /../, /./, etc. + FEATURE_NAME(PathOperation), + + // don't force empty path to "/" + FEATURE_NAME(AllowEmptyPath), + + // in scheme and host segments: + // change upper case letters onto lower case ones + FEATURE_NAME(ToLower), + + // decode unreserved symbols + FEATURE_NAME(DecodeUnreserved), + + // legacy: decode standard symbols which may be safe for some fields + FEATURE_NAME(DecodeStandardExtra), + + // decode symbols allowed (not necessarily safe to decode) only for a given field + // (do not use directly, instead use FeatureDecodeSafe mask below) + FEATURE_NAME(DecodeFieldAllowed), + + // handling of spaces + FEATURE_NAME(EncodeSpace), + + // in query segment: change escaped space to '+' + FEATURE_NAME(EncodeSpaceAsPlus), + + // escape all string 'markup' symbols + FEATURE_NAME(EncodeForSQL), + + // encoding of extended ascii symbols (8-bit) + FEATURE_NAME(EncodeExtendedASCII), + + // decoding of extended ascii symbols (8-bit) + FEATURE_NAME(DecodeExtendedASCII), + + // encoding of extended delimiter set + FEATURE_NAME(EncodeExtendedDelim), + + // decoding of extended delimiter set + FEATURE_NAME(DecodeExtendedDelim), + + // control characters [0x00 .. 0x20) + FEATURE_NAME(EncodeCntrl), + + // raw percent character + FEATURE_NAME(EncodePercent), + + // hash fragments + // https://developers.google.com/webmasters/ajax-crawling/docs/specification + // move and encode #! fragments to the query + FEATURE_NAME(HashBangToEscapedFragment), + // move and decode _escaped_fragment_ to the fragment + FEATURE_NAME(EscapedToHashBangFragment), + + // reject absolute paths started by "/../" + FEATURE_NAME(PathDenyRootParent), + + // paths started by "/../" - ignore head + FEATURE_NAME(PathStripRootParent), + + // tries to fix errors (in particular, in fragment) + FEATURE_NAME(TryToFix), + + // check host for DNS compliance + FEATURE_NAME(CheckHost), + + // allow IDN hosts + // host is converted to punycode and stored in FieldHostAscii + // @note host contains characters in the charset of the document + // and percent-encoded characters in UTF-8 (RFC 3986, 3.2.2) + // @note if host contains no extended-ASCII characters and after + // percent-decoding cannot be converted from UTF-8 to UCS-4, + // try to recode from the document charset (if not UTF-8) + FEATURE_NAME(AllowHostIDN), + + // forces AllowHostIDN, but host is replaced with punycode + // forces CheckHost since this replacement is irreversible + FEATURE_NAME(ConvertHostIDN), + + // robot interpreted network paths as BadFormat urls + FEATURE_NAME(DenyNetworkPath), + + // robot interprets URLs without a host as BadFormat + FEATURE_NAME(RemoteOnly), + + /* non-RFC use case: + * 1. do not allow relative-path-only URIs when they can conflict with + * "host/path" (that is, only "./path" or "../path" are allowed); + * 2. if neither scheme nor userinfo are present but port is, it must + * be non-empty, to avoid conflict with "scheme:/..."; + * 3. if AllowRootless is not specified, rootless (or opaque) URIs are + * not recognized; + * 4. if AllowRootless is specified, disallow userinfo, preferring + * "scheme:pa@th" over "user:pass@host", and even "host:port" when + * host contains only scheme-legal characters. + */ + FEATURE_NAME(NoRelPath), + + // standard prefers that all hex escapes were using uppercase A-F + FEATURE_NAME(UpperEncoded), + + // internal usage: decode all encoded symbols + FEATURE_NAME(DecodeANY), + + // add before this line + _FeatureMAX + }; + + protected: + enum EPrivate : ui32 { + FEATURE_FLAG(DecodeANY), + FEATURE_FLAG(DecodeFieldAllowed), + FEATURE_FLAG(DecodeStandardExtra), + }; + + public: + enum EPublic : ui32 { + FeatureMAX = _FeatureMAX, + FEATURE_FLAG(AuthSupported), + FEATURE_FLAG(SchemeKnown), + FEATURE_FLAG(SchemeFlexible), + FEATURE_FLAG(AllowRootless), + FEATURE_FLAG_NAME(AllowOpaque) = FEATURE_FLAG_NAME(AllowRootless), + FEATURE_FLAG(PathOperation), + FEATURE_FLAG(AllowEmptyPath), + FEATURE_FLAG(ToLower), + FEATURE_FLAG(DecodeUnreserved), + FEATURE_FLAG(EncodeSpace), + FEATURE_FLAG(EncodeSpaceAsPlus), + FEATURE_FLAG(EncodeForSQL), + FEATURE_FLAG(EncodeExtendedASCII), + FEATURE_FLAG(DecodeExtendedASCII), + FEATURE_FLAG(EncodeExtendedDelim), + FEATURE_FLAG(DecodeExtendedDelim), + FEATURE_FLAG(EncodeCntrl), + FEATURE_FLAG(EncodePercent), + FEATURE_FLAG(HashBangToEscapedFragment), + FEATURE_FLAG(EscapedToHashBangFragment), + FEATURE_FLAG(PathDenyRootParent), + FEATURE_FLAG(PathStripRootParent), + FEATURE_FLAG(TryToFix), + FEATURE_FLAG(CheckHost), + FEATURE_FLAG(AllowHostIDN), + FEATURE_FLAG(ConvertHostIDN), + FEATURE_FLAG(DenyNetworkPath), + FEATURE_FLAG(RemoteOnly), + FEATURE_FLAG(NoRelPath), + FEATURE_FLAG_NAME(HierURI) = FEATURE_FLAG_NAME(NoRelPath), + FEATURE_FLAG(UpperEncoded), + }; + +#undef FEATURE_NAME +#undef FEATURE_FLAG + + public: + //============================== + enum ESets { + // these are guaranteed and will change buffer size + + FeatureDecodeStandard = 0 | FeatureDecodeUnreserved | FeatureDecodeStandardExtra, + + FeaturesDecodeExtended = 0 | FeatureDecodeExtendedASCII | FeatureDecodeExtendedDelim, + + FeaturesDecode = 0 | FeatureDecodeUnreserved | FeatureDecodeStandard | FeaturesDecodeExtended, + + FeaturesEncodeExtended = 0 | FeatureEncodeExtendedASCII | FeatureEncodeExtendedDelim, + + FeaturesEncode = 0 | FeatureEncodeForSQL | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent | FeaturesEncodeExtended, + + // these are not guaranteed to apply to a given field + + FeatureDecodeAllowed = 0 | FeatureDecodeUnreserved | FeatureDecodeFieldAllowed, + + FeaturesMaybeDecode = 0 | FeaturesDecode | FeatureDecodeAllowed, + + FeaturesMaybeEncode = 0 | FeaturesEncode, + + FeaturesEncodeDecode = 0 | FeaturesMaybeEncode | FeaturesMaybeDecode, + + FeaturesAllEncoder = 0 | FeaturesEncodeDecode | FeatureDecodeANY | FeatureToLower | FeatureUpperEncoded | FeatureEncodeSpaceAsPlus, + + //============================== + FeaturesNormalizeSet = 0 | FeaturePathOperation | FeatureToLower | FeatureDecodeAllowed | FeatureEncodeSpaceAsPlus | FeatureEncodeForSQL | FeaturePathStripRootParent | FeatureTryToFix | FeatureUpperEncoded, + + FeaturesDefault = 0 // it reproduces old parsedURL + | FeaturePathOperation | FeaturePathDenyRootParent | FeatureCheckHost, + + // essentially allows all valid RFC urls and keeps them as-is + FeaturesBare = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureAllowEmptyPath, + + FeaturesAll = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureCheckHost | FeaturesNormalizeSet, + + // Deprecated, use FeaturesRecommended + FeaturesRobotOld = 0 + // http://tools.ietf.org/html/rfc3986#section-6.2.2 + | FeatureToLower // 6.2.2.1 + | FeatureUpperEncoded // 6.2.2.1 + | FeatureDecodeUnreserved // 6.2.2.2 + | FeaturePathOperation // 6.2.2.3 + | FeaturePathDenyRootParent | FeatureSchemeKnown | FeatureConvertHostIDN | FeatureRemoteOnly | FeatureHashBangToEscapedFragment | FeatureCheckHost, + + // these are mutually exclusive + FeaturesPath = 0 | FeaturePathDenyRootParent | FeaturePathStripRootParent, + + FeaturesEscapedFragment = 0 | FeatureEscapedToHashBangFragment | FeatureHashBangToEscapedFragment, + + FeaturesCheckSpecialChar = 0 | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent, + + FeaturesEncodePChar = 0 | FeatureUpperEncoded | FeaturesEncodeDecode | FeaturesCheckSpecialChar, + + // http://wiki.yandex-team.ru/robot/newDesign/dups/normolization + FeaturesRecommended = 0 | FeatureSchemeKnown | FeatureRemoteOnly | FeatureToLower | FeatureCheckHost | FeatureConvertHostIDN | FeatureHashBangToEscapedFragment | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodeExtendedASCII | FeatureUpperEncoded | FeatureDecodeUnreserved | FeaturePathOperation | FeaturePathStripRootParent, + + FeaturesRobot = FeaturesRecommended + }; + }; + + static inline int strnicmp(const char* lt, const char* rt, size_t len) { + return lt == rt ? 0 : ::strnicmp(lt, rt, len); + } + + static inline int CompareNoCasePrefix(const TStringBuf& lt, const TStringBuf& rt) { + return strnicmp(lt.data(), rt.data(), rt.length()); + } + + static inline bool EqualNoCase(const TStringBuf& lt, const TStringBuf& rt) { + return lt.length() == rt.length() && 0 == CompareNoCasePrefix(lt, rt); + } + + static inline int CompareNoCase(const TStringBuf& lt, const TStringBuf& rt) { + if (lt.length() == rt.length()) + return CompareNoCasePrefix(lt, rt); + return lt.length() < rt.length() ? -1 : 1; + } + + class TSchemeInfo { + public: + const TScheme::EKind Kind; + const ui16 Port; + const TStringBuf Str; + const ui32 FldReq; + TSchemeInfo(TScheme::EKind kind, TStringBuf str, ui32 fldReq = 0, ui16 port = 0) + : Kind(kind) + , Port(port) + , Str(str) + , FldReq(fldReq) + { + } + bool Matches(const TStringBuf& scheme) const { + return EqualNoCase(scheme, Str); + } + + public: + static const TSchemeInfo& Get(const TStringBuf& scheme); + static const TSchemeInfo& Get(TScheme::EKind scheme) { + return Registry[scheme]; + } + static TScheme::EKind GetKind(const TStringBuf& scheme) { + return Get(scheme).Kind; + } + static TStringBuf GetCanon(TScheme::EKind scheme) { + return Get(scheme).Str; + } + static ui16 GetDefaultPort(TScheme::EKind scheme) { + return Get(scheme).Port; + } + + private: + static const TSchemeInfo Registry[]; + }; + + struct TParseFlags { + const ui64 Allow; + const ui64 Extra; + TParseFlags(ui64 allow = 0, ui64 extra = 0) + : Allow(allow) + , Extra(extra) + { + } + ui64 operator&(const TParseFlags& flags) const { + return (Allow & flags.Allow) | (Extra & flags.Extra); + } + ui64 operator&(ui64 flags) const { + return (Allow & flags); + } + TParseFlags operator|(const TParseFlags& flags) const { + return TParseFlags(Allow | flags.Allow, Extra | flags.Extra); + } + TParseFlags Exclude(ui64 flags) const { + return TParseFlags(Allow & ~flags, Extra & ~flags); + } + }; + +#define FEATURE_NAME(f) _BitFeature##f +#define FEATURE_FLAG_NAME(f) Feature##f +#define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1UL << FEATURE_NAME(f) + + struct TQueryArg { + TStringBuf Name; + TStringBuf Value; + + private: + enum EBit { + FEATURE_NAME(Filter), + FEATURE_NAME(SortByName), + FEATURE_NAME(RemoveEmptyQuery), + FEATURE_NAME(RewriteDirty), + _FeatureMAX + }; + + public: + enum EPublic : ui32 { + FeatureMAX = _FeatureMAX, + FEATURE_FLAG(Filter), + FEATURE_FLAG(SortByName), + FEATURE_FLAG(RemoveEmptyQuery), + FEATURE_FLAG(RewriteDirty), + }; + + enum EProcessed { + // OK and clean. + ProcessedOK = 0, + + // OK, but query stored in internal buffer and TUri::Rewrite() is required. + ProcessedDirty = 1, + + ProcessedMalformed = 2, + ProcessedTooMany = 3, + }; + }; + + typedef bool (*TQueryArgFilter)(const TQueryArg& arg, void* filterData); + +#undef FEATURE_NAME +#undef FEATURE_FLAG_NAME +#undef FEATURE_FLAG + + const char* FieldToString(const TField::EField& t); + const char* ParsedStateToString(const TState::EParsed& t); + const char* SchemeKindToString(const TScheme::EKind& t); + +} + +Y_DECLARE_OUT_SPEC(inline, NUri::TField::EField, out, t) { + out << NUri::FieldToString(t); +} + +Y_DECLARE_OUT_SPEC(inline, NUri::TScheme::EKind, out, t) { + out << NUri::SchemeKindToString(t); +} + +Y_DECLARE_OUT_SPEC(inline, NUri::TState::EParsed, out, t) { + out << NUri::ParsedStateToString(t); +} + +static inline ui16 DefaultPort(NUri::TScheme::EKind scheme) { + return NUri::TSchemeInfo::GetDefaultPort(scheme); +} + +static inline NUri::TScheme::EKind SchemeKind(const TStringBuf& scheme) { + return NUri::TSchemeInfo::GetKind(scheme); +} |