#include <library/cpp/uri/parse.h>
// Silence clang's -Wunused-variable for this file.
// NOTE(review): presumably needed for variables emitted by the Ragel-generated
// tables/code below -- confirm before removing.
#ifdef __clang__
    #pragma clang diagnostic ignored "-Wunused-variable"
#endif
%%{
    machine TParser;

    #================================================
    # RFC 3986 http://tools.ietf.org/html/rfc3986
    # with some modifications (marked "MOD:" below)
    #================================================

    # The RegEx from RFC 3986 (Appendix B), applied to the example
    #
    #   http://www.ics.uci.edu/pub/ietf/uri/#Related
    #
    #   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
    #    12            3  4          5       6  7        8 9
    #
    # results in the following subexpression matches:
    #
    #   $1 = http:
    #   $2 = http
    #   $3 = //www.ics.uci.edu
    #   $4 = www.ics.uci.edu
    #   $5 = /pub/ietf/uri/
    #   $6 = <undefined>
    #   $7 = <undefined>
    #   $8 = #Related
    #   $9 = Related
    #
    # So $2:scheme $4:authority $5:path $7:query $9:fragment

    #================================================
    # List of all ASCII characters and where they can be used
    #================================================
    #
    #   dec      hex      char    class
    #   0-31     x00-1F   cntrl   ext_cntrl
    #   32       x20      space   ext_space
    #   33       x21      !       sub_delims
    #   34       x22      "       ext_delims
    #   35       x23      #       gen_delims / f=frag
    #   36       x24      $       sub_delims
    #   37       x25      %       PCT
    #   38       x26      &       sub_delims
    #   39       x27      '       sub_delims
    #   40       x28      (       sub_delims
    #   41       x29      )       sub_delims
    #   42       x2A      *       sub_delims
    #   43       x2B      +       sub_delims
    #   44       x2C      ,       sub_delims
    #   45       x2D      -       unreserved
    #   46       x2E      .       unreserved
    #   47       x2F      /       gen_delims / f=path,qry,frag
    #   48-57    x30-39   0-9     unreserved
    #   58       x3A      :       gen_delims / f=pass,path,qry,frag
    #   59       x3B      ;       sub_delims
    #   60       x3C      <       ext_delims
    #   61       x3D      =       sub_delims
    #   62       x3E      >       ext_delims
    #   63       x3F      ?       gen_delims / f=qry,frag
    #   64       x40      @       gen_delims / f=path,qry,frag
    #   65-90    x41-5A   A-Z     unreserved
    #   91       x5B      [       gen_delims / ext_delims
    #   92       x5C      \       ext_delims
    #   93       x5D      ]       gen_delims / ext_delims
    #   94       x5E      ^       ext_delims
    #   95       x5F      _       unreserved
    #   96       x60      `       ext_delims
    #   97-122   x61-7A   a-z     unreserved
    #   123      x7B      {       ext_delims
    #   124      x7C      |       ext_delims
    #   125      x7D      }       ext_delims
    #   126      x7E      ~       unreserved
    #   127      x7F      DEL     ext_cntrl
    #   128-255  x80-FF           ext_ascii

    #================================================
    # Actions used in multiple definitions
    #================================================

    action act_req_enc_sql { REQ(fpc, FeatureEncodeForSQL) }

    # REQ must apply to a char in range but not after the range has been reset
    action act_req_pathop { REQ(fpc - 1, FeaturePathOperation) }

    # NOTE(review): no rule in this specification references act_clr_scheme;
    # confirm whether a Scheme reset is expected to fire somewhere.
    action act_clr_scheme { CLR(fpc, Scheme) }
    action act_clr_user { CLR(fpc, User) }
    action act_clr_host { CLR(fpc, Host) }
    action act_beg_host { BEG(fpc, Host) }
    action act_end_host { END(fpc, Host) }
    action act_beg_path { BEG(fpc, Path) }
    action act_end_path { END(fpc, Path) }

    #================================================
    # RFC 3986 ABNFs
    #================================================

    DIGIT = digit;

    # upper-case letters are accepted but raise the FeatureToLower requirement
    ALPHA = ( upper >{ REQ(fpc, FeatureToLower) } ) |
            lower;

    ALNUM = ALPHA | DIGIT;

    PCT = "%" >{ PctBeg(fpc); } ;

    # every hex digit is handed to HexDigit/HexUpper/HexLower
    # together with its position
    HEXDIG = (
          DIGIT >{ HexDigit(fpc, fc); }
        | [A-F] >{ HexUpper(fpc, fc); }
        | [a-f] >{ HexLower(fpc, fc); }
    );

    # HexSet sets REQ so must apply in range
    HEXNUM = ( HEXDIG HEXDIG ) %{ HexSet(fpc - 1); };

    pct_encoded = PCT HEXNUM;

    unreserved = ALNUM | "-" | "." | "_" | "~";

    gen_delims = ":" | "/" | "?" | "#" | "[" | "]" | "@";

    sub_delims = "!" | "$" | "&" | "(" | ")"
               | "*" | "+" | "," | ";" | "="
               | ( ['] >act_req_enc_sql );

    #================================================
    # Local ABNFs
    #================================================

    VALID = ^(cntrl | space) | " ";

    # safe character sequences
    safe = unreserved | pct_encoded | sub_delims;

    # MOD: Yandex extensions

    ext_ascii = (VALID - ascii) >{ REQ(fpc, FeatureEncodeExtendedASCII) };
    ext_delims = ( "[" | "]" | "|" | "{" | "}" | "`" | "^" | "<" | ">"
                 | ( ["\\] >act_req_enc_sql )
                 ) >{ REQ(fpc, FeatureEncodeExtendedDelim) }; # " fix hilite
    ext_space = " " >{ REQ(fpc, FeatureEncodeSpace) };
    ext_cntrl = cntrl >{ REQ(fpc, FeatureEncodeCntrl) };

    # a "%" followed by zero, one or two hex digits
    pct_maybe_encoded = PCT (HEXDIG | HEXNUM)? ;

    ext_safe = unreserved
             | pct_maybe_encoded
             | sub_delims
             | ext_delims
             | ext_space
             | ext_cntrl
             | ext_ascii;

    # pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
    # uric (RFC 2396)
    # MOD: extension to format, add extended delimiters and 8-bit ascii

    pchar_nc = ext_safe | "@";
    pchar = pchar_nc | ":";

    path_sep = "/";

    uric = pchar | path_sep | "?";

    #================================================
    # Fields
    #================================================
    # Single fields use fXXX as machine definitions

    #================================================
    # Scheme
    # scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    #================================================

    scheme = ( ALPHA ( ALPHA | DIGIT | "+" | "-" | "." )** );
    fscheme = scheme >{ BEG(fpc, Scheme) } %{ END(fpc, Scheme) };

    #================================================
    # UserInfo
    # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
    #================================================

    # MOD: split into a pair of sections: username and password

    fuser = ( ext_safe )** >{ BEG(fpc, User) } %{ END(fpc, User) };
    fpass = ( ext_safe | ":" )** >{ BEG(fpc, Pass) } %{ END(fpc, Pass) };
    userinfo = ( fuser ( ":" fpass )? ) ( "@" %act_clr_host @^act_clr_user );

    #================================================
    # Hostname
    # host = IP-literal / IPv4address / reg-name
    #================================================

    # MOD: simplify IP-literal for now
    IPv6address = (HEXDIG | ":" | ".")+;
    IP_literal = "[" IPv6address "]";

    # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
    # MOD: simplify dec-octet which originally matches only 0-255

    dec_octet = DIGIT+;
    IPv4address = dec_octet "." dec_octet "." dec_octet "." dec_octet;

    # MOD: non-empty; will use host?
    # reg-name = *( unreserved / pct-encoded / sub-delims )
    ### todo: allow ':' (need to fix grammar to disambiguate port)
    achar = any - (0x00 .. 0x20) - '/' - '#' - '?' - ':' - '%';
    upperhalf = any - (0x00 .. 0x7F);
    # a host name must contain at least one alphanumeric or 8-bit character
    hostname = (((achar | pct_encoded)+) & (any* (alnum | upperhalf) any*));
    reg_name = hostname - IPv4address - IP_literal;

    # uses first-match-wins approach
    host = IP_literal | IPv4address | (reg_name - IPv4address);
    fhost = host? >act_beg_host %act_end_host;
    fhost_nempty = host >act_beg_host %act_end_host;

    #================================================
    # Port
    # port = *DIGIT
    #================================================

    # MOD: use fport? for empty
    fport = DIGIT+ >{ BEG(fpc, Port) } %{ END(fpc, Port) };

    #================================================
    # Authority
    # authority = [ userinfo "@" ] host [ ":" port ]
    #================================================

    authority = userinfo? fhost ( ":" fport? )? ;

    #================================================
    # Path
    # path = path-abempty  ; begins with "/" or is empty
    #      / path-absolute ; begins with "/" but not "//"
    #      / path-noscheme ; begins with a non-colon segment
    #      / path-rootless ; begins with a segment
    #      / path-empty    ; zero characters
    #================================================

    # checkPath rules

    # path that is ".", or starts with "./" or "../"
    checkPathHead =
        "." ( "."? path_sep VALID* )? %act_req_pathop ;

    # path that ends in "/." or "/.."
    checkPathTail =
        VALID*
        ( path_sep "."{1,2} ) %act_req_pathop ;

    # path that contains "//", "/./" or "/../"
    checkPathMid = VALID*
        ( path_sep "."{,2} path_sep ) %act_req_pathop
        VALID*;

    checkAbsPath = checkPathMid | checkPathTail | VALID*;
    checkRelPath = checkPathHead | checkAbsPath;

    # segment = *pchar
    segment = pchar**;

    # segment-nz = 1*pchar
    segment_nz = pchar+;

    # segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
    segment_nz_nc = pchar_nc+;

    sep_segment = path_sep segment;

    # non-standard definitions

    fpath_abnempty =
        (
            ( sep_segment+ )
            & checkAbsPath
        )
        >act_beg_path %act_end_path
        ;

    # clearly relative path: ".", "./..." or "../..."
    fpath_relative =
        (
            "."
            ( "."? sep_segment+ )?
        )
        >act_beg_path %act_req_pathop %act_end_path
        ;

    # standard definitions

    # do not save empty paths, they behave differently in relative resolutions
    fpath_empty = zlen;

    fpath_abempty = fpath_abnempty?;

    fpath_absolute =
        (
            ( path_sep ( segment_nz sep_segment* )? )
            & checkAbsPath
        )
        >act_beg_path %act_end_path
        ;

    fpath_noscheme =
        (
            ( segment_nz_nc sep_segment* )
            & checkRelPath
        )
        >act_beg_path %act_end_path
        ;

    fpath_rootless =
        ( segment_nz sep_segment* )
        >act_beg_path %act_end_path
        ;

    #================================================
    # Query and fragment
    # query = *( pchar / "/" / "?" )
    # fragment = *( pchar / "/" / "?" )
    #================================================

    # MOD: fragment allows '#' characters

    fquery = (uric )** >{ BEG(fpc, Query) } %{ END(fpc, Query) };
    ffrag = (uric | "#")** >{ BEG(fpc, Frag) } %{ END(fpc, Frag) };
    query_frag = ("?" fquery)? ("#" ffrag)? ;

    #================================================
    # final ABNFs
    # URI-reference = URI / relative-ref
    #================================================
    # URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
    #
    # hier-part = "//" authority path-abempty
    #           / path-absolute
    #           / path-rootless
    #           / path-empty
    #
    # relative-ref = relative-part [ "?" query ] [ "#" fragment ]
    #
    # relative-part = "//" authority path-abempty
    #               / path-absolute
    #               / path-noscheme
    #               / path-empty

    net_path = "//" authority fpath_abempty;

    URI =
        fscheme ":"
        (
            net_path
          | fpath_absolute
          | fpath_rootless
          | fpath_empty
        )
        query_frag
        ;

    relative_ref =
        (
            net_path
          | fpath_absolute
          | fpath_noscheme
          | fpath_empty
        )
        query_frag
        ;

    # non-standard definitions

    URI_no_rootless =
        fscheme ":"
        (
            net_path
          | fpath_absolute
          | fpath_empty
        )
        query_frag
        ;

    host_path =
        (
            fhost_nempty fpath_abempty
          | (fhost_nempty - scheme) ":" fport fpath_abempty
        )
        ;

    # no userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
    relative_ref_host_pabem =
        (
            net_path
          | host_path
          | fpath_absolute
          | fpath_relative
          | fpath_empty
        )
        query_frag
        ;

    # port must be non-empty, to avoid clash with "scheme:/..."
    auth_path =
        (
            fhost_nempty ( ":" fport )? fpath_abempty
          | userinfo fhost ( ":" fport? )? fpath_abempty
        )
        ;

    # userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
    relative_ref_auth_pabem =
        (
            net_path
          | auth_path
          | fpath_absolute
          | fpath_relative
          | fpath_empty
        )
        query_frag
        ;

    #================================================
    # machine instantiations
    # (one entry point per parsing mode, selected by the host code)
    #================================================

    URI_ref_no_rootless :=
        (
            URI_no_rootless
            # scheme://user@host preferred over user://pass@host/path
          | relative_ref_auth_pabem
        )
        ;

    URI_ref_no_relpath :=
        (
            relative_ref_host_pabem
            # host:port/path preferred over scheme:path/rootless
          | (URI - relative_ref_host_pabem)
        )
        ;

    # URI-reference = URI / relative-ref
    URI_ref :=
        (
            URI
          | relative_ref
        )
        ;

    write data;

}%%
namespace NUri {
bool TParser::doParse(const char* str_beg, size_t length)
const char* p = str_beg;
const char* pe = str_beg + length;
const char* eof = pe;
int cs;
#define BEG(ptr, fld) startSection (ptr, TField::Field ## fld);
#define END(ptr, fld) finishSection(ptr, TField::Field ## fld);
#define SET(val, fld) storeSection(val, TField::Field ## fld);
#define CLR(ptr, fld) ResetSection (TField::Field ## fld, ptr);
#define REQ(ptr, req) setRequirement(ptr, TFeature :: req);
%% write init nocs;
if (0 == (Flags & TFeature::FeatureNoRelPath)) {
cs = TParser_en_URI_ref;
} else if (0 == (Flags & TFeature::FeatureAllowRootless)) {
cs = TParser_en_URI_ref_no_rootless;
} else {
cs = TParser_en_URI_ref_no_relpath;
%% write exec;
#undef BEG
#undef END
#undef SET
#undef CLR
#undef REQ
return cs >= TParser_first_final;