aboutsummaryrefslogtreecommitdiffstats
path: root/yql/essentials/minikql/jsonpath/JsonPath.g
blob: f32f98d185330013db55ba83a60473824af577de (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
// Combined (parser + lexer) ANTLR grammar for JsonPath expressions.
grammar JsonPath;

// Generate a C++ parser and enable memoization of rule results.
// NOTE(review): memoization is normally only useful together with backtracking or
// syntactic predicates, and none are visible in this file -- confirm it is still needed.
options {
    language = Cpp;
    memoize = true;
}

// Root rule. Input is an optional mode keyword (`strict` or `lax`) followed by
// a jsonpath expression. EOF is required, so the expression must consume the whole input.
jsonpath: (STRICT | LAX)? expr EOF;

// Generic jsonpath expression. Delegates to or_expr, the lowest precedence level
// of the operator rules.
expr: or_expr;

// Arithmetic and boolean operations
// Operator precedence:
// 1. Unary plus, minus and logical not
// 2. Multiplication, division, modulus
// 3. Addition, subtraction
// 4. Compare operators (<, <=, >, >=)
// 5. Equality operators (==, !=, <>)
// 6. Logical and
// 7. Logical or
// NOTE: We execute JsonPath using bottom up approach. Thus
// operations with higher precedence must be located "deeper" inside AST
//
// Each rule handles one precedence level and takes its operands from the next
// (higher precedence) level. Logical and arithmetic levels use `*` and can be
// chained (`a + b - c`). Equality and comparison use `?`, so each of these levels
// accepts at most one operator: `a < b < c` is not accepted without parentheses.
or_expr: and_expr (OR and_expr)*;
and_expr: equal_expr (AND equal_expr)*;
equal_expr: compare_expr ((EQUAL | NOT_EQUAL | NOT_EQUAL_SQL) compare_expr)?;
compare_expr: add_expr ((LESS | LESS_EQUAL | GREATER | GREATER_EQUAL) add_expr)?;
add_expr: mul_expr ((PLUS | MINUS) mul_expr)*;
mul_expr: unary_expr ((ASTERISK | SLASH | PERCENT) unary_expr)*;
// At most one prefix operator is accepted in front of a predicate expression.
unary_expr: (PLUS | MINUS | NOT)? predicate_expr;

// Predicates, `"string" starts with "str"`
// NOTE: `is unknown` predicate is defined separately in primary rule. This is done
// because if we add it as an alternative to predicate_expr, ANTLR would need backtracking.
// For example it would not be possible to tell if expression like `( ... ) is unknown` is
// related to `starts with` (and braces are part of plain_expr rule) or it is related to
// `is unknown` rule (and braces are not included in plain_expr).
// ("Braces" here means parentheses: the LBRACE and RBRACE tokens are `(` and `)`.)
//
// Alternatives:
// 1. A plain expression, optionally followed by `starts with` or `like_regex`
// 2. `exists ( expr )`
predicate_expr:
    (plain_expr (starts_with_expr | like_regex_expr)?)
    | (EXISTS LBRACE expr RBRACE);

// Right hand side of the `starts with` predicate. The prefix is an arbitrary plain expression.
starts_with_expr: STARTS WITH plain_expr;
// Right hand side of the `like_regex` predicate. Both the pattern and the optional
// `flag` argument must be string literals.
like_regex_expr: LIKE_REGEX STRING_VALUE (FLAG STRING_VALUE)?;

// Plain expression serves as an argument to binary and unary operators
plain_expr: accessor_expr;

// A primary followed by any number of accessor operations, `$.key[0].size()`
accessor_expr: primary accessor_op*;
// Single accessor operation: member access, array access, filter or method call
accessor_op: member_accessor | wildcard_member_accessor | array_accessor | wildcard_array_accessor | filter | method;

// Member accessors, `$.key` and `$.*`
// The key is either an identifier (keywords included, see identifier rule)
// or a quoted string, `$."some key"`
member_accessor: DOT (identifier | STRING_VALUE);
wildcard_member_accessor: DOT ASTERISK;

// Array accessors, `$[0, 1 to 3, last]` and `$[*]`
// A subscript is either a single index or an `a to b` range. Both bounds are
// arbitrary expressions. At least one subscript is required, `$[]` is not accepted.
array_subscript: expr (TO expr)?;
array_accessor: LBRACE_SQUARE array_subscript (COMMA array_subscript)* RBRACE_SQUARE;
wildcard_array_accessor: LBRACE_SQUARE ASTERISK RBRACE_SQUARE;

// Filters, `$ ? (@.age >= 18)`
// The filter condition is an arbitrary expression and must be parenthesized.
filter: QUESTION LBRACE expr RBRACE;

// Methods, `$.abs().ceiling()`
// Only the methods listed here are recognized. All of them are called with an
// empty argument list.
method: DOT (ABS_METHOD | FLOOR_METHOD | CEILING_METHOD | DOUBLE_METHOD | TYPE_METHOD | SIZE_METHOD | KEYVALUE_METHOD) LBRACE RBRACE;

// Primaries are objects to perform operations on:
// 1. All literals:
//   - Numbers, `1.23e-5`
//   - Bool, `false` and `true`
//   - Null, `null`
//   - Strings, `"привет"`, `\r\n\t`
// 2. Current object, `$`
// 3. Current filtering object, `@`
// 4. Variables, `$my_cool_variable`
// 5. Last array index, `last`
// 6. Parenthesized jsonpath expression, `($.key + $[0])`
// The parenthesized alternative also carries the optional `is unknown` suffix,
// `(@.a > 1) is unknown`, so this predicate can only follow a parenthesized expression.
primary:
    NUMBER
    | DOLLAR
    | LAST
    | (LBRACE expr RBRACE (IS UNKNOWN)?)
    | VARIABLE
    | TRUE
    | FALSE
    | NULL
    | STRING_VALUE
    | AT;

// Identifier for member accessors, `$.key`. Variable names such as `$variable_name`
// are not parsed by this rule, they are matched by the VARIABLE lexer token.
// JsonPath supports using keywords as identifiers. We need to mention keywords in
// identifier rule because otherwise ANTLR will treat them as a separate token.
// For instance input `$.to` without this modification will be treated as
// `DOLLAR DOT TO`, not `DOLLAR DOT IDENTIFIER`
identifier: IDENTIFIER | keyword;

// All keyword tokens that may also be used as a member name via identifier rule.
// This list mirrors the keyword tokens declared in the lexer section. A keyword token
// that is missing here cannot be used as a member name, `$.new_keyword`.
keyword:
    ABS_METHOD
    | CEILING_METHOD
    | DOUBLE_METHOD
    | EXISTS
    | FALSE
    | FLAG
    | FLOOR_METHOD
    | IS
    | KEYVALUE_METHOD
    | LAST
    | LAX
    | LIKE_REGEX
    | NULL
    | SIZE_METHOD
    | STARTS
    | STRICT
    | TO
    | TRUE
    | TYPE_METHOD
    | UNKNOWN
    | WITH;

//
// Lexer
//

// Punctuation and operator tokens.
// Despite the names, LBRACE and RBRACE are parentheses, LBRACE_SQUARE and
// RBRACE_SQUARE are square brackets.
// Longer operators are declared before the operators that are their prefixes
// (`>=` before `>`, `<=` before `<`, `!=` before `!`).
// NOTE(review): it is not clear from this file whether that order is required, keep it when editing.
AND:           '&&';
ASTERISK:      '*';
AT:            '@';
BACKSLASH:     '\\';
COMMA:         ',';
DOLLAR:        '$';
DOT:           '.';
EQUAL:         '==';
GREATER_EQUAL: '>=';
GREATER:       '>';
LBRACE_SQUARE: '[';
LBRACE:        '(';
LESS_EQUAL:    '<=';
LESS:          '<';
MINUS:         '-';
NOT_EQUAL_SQL: '<>';
NOT_EQUAL:     '!=';
NOT:           '!';
OR:            '||';
PERCENT:       '%';
PLUS:          '+';
QUESTION:      '?';
QUOTE_DOUBLE:  '"';
QUOTE_SINGLE:  '\'';
RBRACE_SQUARE: ']';
RBRACE:        ')';
SLASH:         '/';
UNDERSCORE:    '_';

// Keywords
// Keywords are matched in lower case only. Every token of this section is also listed
// in the keyword parser rule, which allows keywords to be used as member names.
// These tokens are declared before IDENTIFIER, which could otherwise match the same text.
ABS_METHOD: 'abs';
CEILING_METHOD: 'ceiling';
DOUBLE_METHOD: 'double';
EXISTS: 'exists';
FALSE: 'false';
FLAG: 'flag';
FLOOR_METHOD: 'floor';
IS: 'is';
KEYVALUE_METHOD: 'keyvalue';
LAST: 'last';
LAX: 'lax';
LIKE_REGEX: 'like_regex';
NULL: 'null';
SIZE_METHOD: 'size';
STARTS: 'starts';
STRICT: 'strict';
TO: 'to';
TRUE: 'true';
TYPE_METHOD: 'type';
UNKNOWN: 'unknown';
WITH: 'with';

// String literal
// Strings are enclosed either in single or in double quotes. String content is any
// sequence of characters other than the enclosing quote and backslash, or a backslash
// followed by any single character (this is how an escaped quote stays inside the string).
// The lexer only finds string boundaries here, the token text includes the quotes.
fragment STRING_CORE_SINGLE: ( ~(QUOTE_SINGLE | BACKSLASH) | (BACKSLASH .) )*;
fragment STRING_CORE_DOUBLE: ( ~(QUOTE_DOUBLE | BACKSLASH) | (BACKSLASH .) )*;
fragment STRING_SINGLE: (QUOTE_SINGLE STRING_CORE_SINGLE QUOTE_SINGLE);
fragment STRING_DOUBLE: (QUOTE_DOUBLE STRING_CORE_DOUBLE QUOTE_DOUBLE);

STRING_VALUE: (STRING_SINGLE | STRING_DOUBLE);

// Number literal
// Numbers are unsigned, the sign is handled by unary_expr rule in the parser.
// Integer part is required (`.5` is not a number). Fractional part, when present,
// must have at least one digit after the dot. Exponent may have an optional sign.
fragment DIGIT: '0'..'9';
fragment DIGITS: DIGIT+;
fragment REAL_PART: DOT DIGITS;
fragment EXP_PART: ('e' | 'E') (PLUS | MINUS)? DIGITS;

NUMBER: DIGITS REAL_PART? EXP_PART?;

// Javascript identifier
// Starts with an ASCII letter or underscore. Following characters may also be
// digits or `$`. Note that `$` is not accepted as the first character.
fragment ID_START: ('a'..'z' | 'A'..'Z' | UNDERSCORE);
fragment ID_CORE: (ID_START | DIGIT | DOLLAR);

IDENTIFIER: ID_START (ID_CORE)*;

// Jsonpath variable, `$my_cool_variable`
// Variable name follows `$` and consists of identifier characters (letters, digits,
// underscore and `$`). Unlike IDENTIFIER, the name may start with a digit.
// NOTE(review): `(ID_CORE)*` also matches a bare `$`, which overlaps with DOLLAR token.
// Presumably DOLLAR wins because it is declared earlier -- confirm, and consider
// `(ID_CORE)+` to make this explicit.
VARIABLE: DOLLAR (ID_CORE)*;

// Whitespace. Each token matches a single space, carriage return, tab or newline
// character and is sent to the hidden channel.
WS: (' '|'\r'|'\t'|'\n') {$channel=HIDDEN;};
// FIXME: WS and COMMENT tokens are currently required.
// FIXME: Since there are no comments in JSONPATH, we split whitespace characters between WS and COMMENT
// COMMENT is not an actual comment: it matches the form feed character (U+000C)
// and is sent to the hidden channel as well.
COMMENT: ('\u000C') {$channel=HIDDEN;};