diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/uri/parsefsm.rl6 | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/uri/parsefsm.rl6')
-rw-r--r-- | library/cpp/uri/parsefsm.rl6 | 501 |
1 files changed, 501 insertions, 0 deletions
diff --git a/library/cpp/uri/parsefsm.rl6 b/library/cpp/uri/parsefsm.rl6
new file mode 100644
index 00000000000..70977236503
--- /dev/null
+++ b/library/cpp/uri/parsefsm.rl6
@@ -0,0 +1,501 @@
+#include <library/cpp/uri/parse.h>
+
+#ifdef __clang__
+    #pragma clang diagnostic ignored "-Wunused-variable"
+#endif
+
+%%{
+    machine TParser;
+
+    #================================================
+    # RFC 3986 http://tools.ietf.org/html/rfc3986
+    # with some modifications
+    #================================================
+    # The RegEx
+    #
+    # http://www.ics.uci.edu/pub/ietf/uri/#Related
+    # ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+    #  12            3  4          5       6  7        8 9
+    #results in the following subexpression matches:
+    # $1 = http:
+    # $2 = http
+    # $3 = //www.ics.uci.edu
+    # $4 = www.ics.uci.edu
+    # $5 = /pub/ietf/uri/
+    # $6 = <undefined>
+    # $7 = <undefined>
+    # $8 = #Related
+    # $9 = Related
+    #
+    # So $2:scheme $4:authority $5:path $7:query $9:fragment
+    #================================================
+
+
+    #================================================
+    # List of all ASCII characters and where they can be used
+    #================================================
+
+    # 0-31    x00-1F  cntrl  ext_cntrl
+    # 32      x20     space  ext_space
+    # 33      x21     !      sub_delims
+    # 34      x22     "      ext_delims
+    # 35      x23     #      gen_delims / f=frag
+    # 36      x24     $      sub_delims
+    # 37      x25     %      PCT
+    # 38      x26     &      sub_delims
+    # 39      x27     '      sub_delims
+    # 40      x28     (      sub_delims
+    # 41      x29     )      sub_delims
+    # 42      x2A     *      sub_delims
+    # 43      x2B     +      sub_delims
+    # 44      x2C     ,      sub_delims
+    # 45      x2D     -      unreserved
+    # 46      x2E     .      unreserved
+    # 47      x2F     /      gen_delims / f=path,qry,frag
+    # 48-57   x30-39  0-9    unreserved
+    # 58      x3A     :      gen_delims / f=pass,path,qry,frag
+    # 59      x3B     ;      sub_delims
+    # 60      x3C     <      ext_delims
+    # 61      x3D     =      sub_delims
+    # 62      x3E     >      ext_delims
+    # 63      x3F     ?      gen_delims / f=qry,frag
+    # 64      x40     @      gen_delims / f=path,qry,frag
+    # 65-90   x41-5A  A-Z    unreserved
+    # 91      x5B     [      gen_delims / ext_delims
+    # 92      x5C     \      ext_delims
+    # 93      x5D     ]      gen_delims / ext_delims
+    # 94      x5E     ^      ext_delims
+    # 95      x5F     _      unreserved
+    # 96      x60     `      ext_delims
+    # 97-122  x61-7A  a-z    unreserved
+    # 123     x7B     {      ext_delims
+    # 124     x7C     |      ext_delims
+    # 125     x7D     }      ext_delims
+    # 126     x7E     ~      unreserved
+    # 127     x7F     DEL    ext_cntrl
+    # 128-255 x80-FF         ext_ascii
+
+
+    #================================================
+    # Actions used in multiple definitions
+    #================================================
+
+    action act_req_enc_sql { REQ(fpc, FeatureEncodeForSQL) }
+
+    # REQ must apply to a char in range but not after the range has been reset
+    action act_req_pathop { REQ(fpc - 1, FeaturePathOperation) }
+
+    action act_clr_scheme { CLR(fpc, Scheme) }
+    action act_clr_user { CLR(fpc, User) }
+    action act_clr_host { CLR(fpc, Host) }
+    action act_beg_host { BEG(fpc, Host) }
+    action act_end_host { END(fpc, Host) }
+    action act_beg_path { BEG(fpc, Path) }
+    action act_end_path { END(fpc, Path) }
+
+
+    #================================================
+    # RFC 3986 ABNFs
+    #================================================
+
+    DIGIT = digit;
+
+    ALPHA = ( upper >{ REQ(fpc, FeatureToLower) } ) |
+            lower;
+
+    ALNUM = ALPHA | DIGIT;
+
+    PCT = "%" >{ PctBeg(fpc); } ;
+
+    HEXDIG = (
+          DIGIT >{ HexDigit(fpc, fc); }
+        | [A-F] >{ HexUpper(fpc, fc); }
+        | [a-f] >{ HexLower(fpc, fc); }
+    );
+
+    # HexSet sets REQ so must apply in range
+    HEXNUM = ( HEXDIG HEXDIG ) %{ HexSet(fpc - 1); };
+
+    pct_encoded = PCT HEXNUM;
+
+    unreserved = ALNUM | "-" | "." | "_" | "~";
+
+    gen_delims = ":" | "/" | "?" | "#" | "[" | "]" | "@";
+
+    sub_delims = "!" | "$" | "&" | "(" | ")"
+               | "*" | "+" | "," | ";" | "="
+               | ( ['] >act_req_enc_sql );
+
+
+    #================================================
+    # Local ABNFs
+    #================================================
+
+    VALID = ^(cntrl | space) | " ";
+
+    # safe character sequences
+    safe = unreserved | pct_encoded | sub_delims;
+
+    # MOD: Yandex extensions
+
+    ext_ascii = (VALID - ascii) >{ REQ(fpc, FeatureEncodeExtendedASCII) };
+    ext_delims = ( "[" | "]" | "|" | "{" | "}" | "`" | "^" | "<" | ">"
+                 | ( ["\\] >act_req_enc_sql )
+                 ) >{ REQ(fpc, FeatureEncodeExtendedDelim) }; # " fix hilite
+    ext_space = " " >{ REQ(fpc, FeatureEncodeSpace) };
+    ext_cntrl = cntrl >{ REQ(fpc, FeatureEncodeCntrl) };
+
+    pct_maybe_encoded = PCT (HEXDIG | HEXNUM)? ;
+    ext_safe = unreserved
+             | pct_maybe_encoded
+             | sub_delims
+             | ext_delims
+             | ext_space
+             | ext_cntrl
+             | ext_ascii;
+
+    # pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
+    # uric (RFC 2396)
+    # MOD: extension to format, add extended delimiters and 8-bit ascii
+
+    pchar_nc = ext_safe | "@";
+    pchar = pchar_nc | ":";
+    path_sep = "/";
+    uric = pchar | path_sep | "?";
+
+
+    #================================================
+    # Fields
+    #================================================
+    # Single fields use fXXX as machine definitions
+
+
+    #================================================
+    # Scheme
+    # scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+    #================================================
+
+    scheme = ( ALPHA ( ALPHA | DIGIT | "+" | "-" | "." )** );
+    fscheme = scheme >{ BEG(fpc, Scheme) } %{ END(fpc, Scheme) };
+
+
+    #================================================
+    # UserInfo
+    # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+    #================================================
+
+    # MOD: split into a pair of sections: username and password
+
+    fuser = ( ext_safe )** >{ BEG(fpc, User) } %{ END(fpc, User) };
+    fpass = ( ext_safe | ":" )** >{ BEG(fpc, Pass) } %{ END(fpc, Pass) };
+    userinfo = ( fuser ( ":" fpass )? ) ( "@" %act_clr_host @^act_clr_user );
+
+
+    #================================================
+    # Hostname
+    # host = IP-literal / IPv4address / reg-name
+    #================================================
+
+    # MOD: simplify IP-literal for now
+    IPv6address = (HEXDIG | ":" | ".")+;
+    IP_literal = "[" IPv6address "]";
+
+    # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
+    # MOD: simplify dec-octet which originally matches only 0-255
+
+    dec_octet = DIGIT+;
+    IPv4address = dec_octet "." dec_octet "." dec_octet "." dec_octet;
+
+    # MOD: non-empty; will use host?
+    # reg-name = *( unreserved / pct-encoded / sub-delims )
+    ### todo: allow ':' (need to fix grammar to disambiguate port)
+    achar = any - (0x00 .. 0x20) - '/' - '#' - '?' - ':' - '%';
+    upperhalf = any - (0x00 .. 0x7F);
+    hostname = (((achar | pct_encoded)+) & (any* (alnum | upperhalf) any*));
+    reg_name = hostname - IPv4address - IP_literal;
+
+    # uses first-match-wins approach
+    host = IP_literal | IPv4address | (reg_name - IPv4address);
+    fhost = host? >act_beg_host %act_end_host;
+    fhost_nempty = host >act_beg_host %act_end_host;
+
+
+    #================================================
+    # Port
+    # port = *DIGIT
+    #================================================
+
+    # MOD: use fport? for empty
+    fport = DIGIT+ >{ BEG(fpc, Port) } %{ END(fpc, Port) };
+
+
+    #================================================
+    # Authority
+    # authority = [ userinfo "@" ] host [ ":" port ]
+    #================================================
+
+    authority = userinfo? fhost ( ":" fport? )? ;
+
+
+    #================================================
+    # Path
+    #================================================
+    # path          = path-abempty    ; begins with "/" or is empty
+    #               / path-absolute   ; begins with "/" but not "//"
+    #               / path-noscheme   ; begins with a non-colon segment
+    #               / path-rootless   ; begins with a segment
+    #               / path-empty      ; zero characters
+    #================================================
+
+    # checkPath rules
+
+    checkPathHead =
+        "." ( "."? path_sep VALID* )? %act_req_pathop ;
+
+    checkPathTail =
+        VALID*
+        ( path_sep "."{1,2} ) %act_req_pathop ;
+
+    checkPathMid = VALID*
+        ( path_sep "."{,2} path_sep ) %act_req_pathop
+        VALID*;
+
+    checkAbsPath = checkPathMid | checkPathTail | VALID*;
+    checkRelPath = checkPathHead | checkAbsPath;
+
+    # segment = *pchar
+    segment = pchar**;
+
+    # segment-nz = 1*pchar
+    segment_nz = pchar+;
+
+    # segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
+    segment_nz_nc = pchar_nc+;
+
+    sep_segment = path_sep segment;
+
+    # non-standard definitions
+
+    fpath_abnempty =
+        (
+            ( sep_segment+ )
+            & checkAbsPath
+        )
+        >act_beg_path %act_end_path
+    ;
+
+    fpath_relative =
+        (
+            "."
+            ( "."? sep_segment+ )?
+        )
+        >act_beg_path %act_req_pathop %act_end_path
+    ;
+
+    # standard definitions
+
+    # do not save empty paths, they behave differently in relative resolutions
+    fpath_empty = zlen;
+
+    fpath_abempty = fpath_abnempty?;
+
+    fpath_absolute =
+        (
+            ( path_sep ( segment_nz sep_segment* )? )
+            & checkAbsPath
+        )
+        >act_beg_path %act_end_path
+    ;
+
+    fpath_noscheme =
+        (
+            ( segment_nz_nc sep_segment* )
+            & checkRelPath
+        )
+        >act_beg_path %act_end_path
+    ;
+
+    fpath_rootless =
+        (
+            ( segment_nz sep_segment* )
+        )
+        >act_beg_path %act_end_path
+    ;
+
+    #================================================
+    # Query and fragment
+    # query = *( pchar / "/" / "?" )
+    # fragment = *( pchar / "/" / "?" )
+    #================================================
+
+    # MOD: fragment allows '#' characters
+
+    fquery = (uric )** >{ BEG(fpc, Query) } %{ END(fpc, Query) };
+    ffrag = (uric | "#")** >{ BEG(fpc, Frag) } %{ END(fpc, Frag) };
+    query_frag = ("?" fquery)? ("#" ffrag)? ;
+
+
+    #================================================
+    # final ABNFs
+    # URI-reference = URI / relative-ref
+    #================================================
+    # URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+    # hier-part     = "//" authority path-abempty
+    #               / path-absolute
+    #               / path-rootless
+    #               / path-empty
+    # relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
+    # relative-part = "//" authority path-abempty
+    #               / path-absolute
+    #               / path-noscheme
+    #               / path-empty
+
+    net_path = "//" authority fpath_abempty;
+
+    URI =
+        fscheme ":"
+        (
+            net_path
+            | fpath_absolute
+            | fpath_rootless
+            | fpath_empty
+        )
+        $^act_clr_scheme
+        query_frag
+    ;
+
+    relative_ref =
+        (
+            net_path
+            | fpath_absolute
+            | fpath_noscheme
+            | fpath_empty
+        )
+        %act_clr_scheme
+        query_frag
+    ;
+
+    # non-standard definitions
+
+    URI_no_rootless =
+        fscheme ":"
+        (
+            net_path
+            | fpath_absolute
+            | fpath_empty
+        )
+        $^act_clr_scheme
+        query_frag
+    ;
+
+    host_path =
+        (
+            fhost_nempty fpath_abempty
+            | (fhost_nempty - scheme) ":" fport fpath_abempty
+        )
+        @^act_clr_host
+    ;
+
+    # no userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
+    relative_ref_host_pabem =
+        (
+            net_path
+            | host_path
+            | fpath_absolute
+            | fpath_relative
+            | fpath_empty
+        )
+        %act_clr_scheme
+        query_frag
+    ;
+
+    # port must be non-empty, to avoid clash with "scheme:/..."
+    auth_path =
+        (
+            fhost_nempty ( ":" fport )? fpath_abempty
+            | userinfo fhost ( ":" fport? )? fpath_abempty
+        )
+        @^act_clr_host
+        @^act_clr_user
+    ;
+
+    # userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
+    relative_ref_auth_pabem =
+        (
+            net_path
+            | auth_path
+            | fpath_absolute
+            | fpath_relative
+            | fpath_empty
+        )
+        %act_clr_scheme
+        query_frag
+    ;
+
+
+    # machine instantiations
+
+    URI_ref_no_rootless :=
+        (
+            URI_no_rootless
+            # scheme://user@host preferred over user://pass@host/path
+            | relative_ref_auth_pabem
+        )
+    ;
+
+    URI_ref_no_relpath :=
+        (
+            relative_ref_host_pabem
+            # host:port/path preferred over scheme:path/rootless
+            | (URI - relative_ref_host_pabem)
+        )
+    ;
+
+    URI_ref :=
+        (
+            relative_ref
+            | URI
+        )
+    ;
+
+    write data;
+
+}%%
+
+namespace NUri {
+// Runs the TParser machine defined above over [str_beg, str_beg + length); returns true iff it ends in a final state.
+bool TParser::doParse(const char* str_beg, size_t length)
+{
+    const char* p = str_beg;           // Ragel data pointer: next character to consume
+    const char* pe = str_beg + length; // Ragel end-of-data pointer
+    const char* eof = pe;              // the whole input is in this one buffer, so EOF is its end
+    int cs;                            // current state; assigned below because "write init nocs" does not set it
+// Shorthands used inside the grammar actions; every expansion already ends with ';'.
+#define BEG(ptr, fld) startSection (ptr, TField::Field ## fld);
+#define END(ptr, fld) finishSection(ptr, TField::Field ## fld);
+#define SET(val, fld) storeSection(val, TField::Field ## fld); // not used by any action in this file
+#define CLR(ptr, fld) ResetSection (TField::Field ## fld, ptr);
+#define REQ(ptr, req) setRequirement(ptr, TFeature :: req);
+
+    %% write init nocs;
+    // Choose the entry point (one of the ":=" instantiations above) from Flags.
+    if (0 == (Flags & TFeature::FeatureNoRelPath)) {
+        cs = TParser_en_URI_ref;
+    } else if (0 == (Flags & TFeature::FeatureAllowRootless)) {
+        cs = TParser_en_URI_ref_no_rootless; // FeatureNoRelPath set, FeatureAllowRootless clear
+    } else {
+        cs = TParser_en_URI_ref_no_relpath; // both FeatureNoRelPath and FeatureAllowRootless set
+    }
+
+    %% write exec;
+    // The shorthands are only needed by the generated code above, so drop them again.
+#undef BEG
+#undef END
+#undef SET
+#undef CLR
+#undef REQ
+
+    return cs >= TParser_first_final; // Ragel numbers final states from TParser_first_final upward
+}
+
+} // namespace NUri