aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/tools/python3/src/Parser/tokenizer.c
diff options
context:
space:
mode:
authorshadchin <shadchin@yandex-team.com>2024-02-12 07:53:52 +0300
committerDaniil Cherednik <dcherednik@ydb.tech>2024-02-14 14:26:16 +0000
commit31f2a419764a8ba77c2a970cfc80056c6cd06756 (patch)
treec1995d239eba8571cefc640f6648e1d5dd4ce9e2 /contrib/tools/python3/src/Parser/tokenizer.c
parentfe2ef02b38d9c85d80060963b265a1df9f38c3bb (diff)
downloadydb-31f2a419764a8ba77c2a970cfc80056c6cd06756.tar.gz
Update Python from 3.11.8 to 3.12.2
Diffstat (limited to 'contrib/tools/python3/src/Parser/tokenizer.c')
-rw-r--r--contrib/tools/python3/src/Parser/tokenizer.c1195
1 files changed, 1015 insertions, 180 deletions
diff --git a/contrib/tools/python3/src/Parser/tokenizer.c b/contrib/tools/python3/src/Parser/tokenizer.c
index 4bc72ae444..27d49c6f89 100644
--- a/contrib/tools/python3/src/Parser/tokenizer.c
+++ b/contrib/tools/python3/src/Parser/tokenizer.c
@@ -11,11 +11,6 @@
#include "tokenizer.h"
#include "errcode.h"
-#include "unicodeobject.h"
-#include "bytesobject.h"
-#include "fileobject.h"
-#include "abstract.h"
-
/* Alternate tab spacing */
#define ALTTABSIZE 1
@@ -36,6 +31,31 @@
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
+#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
+#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
+ type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
+/* Move to the next source line: increment tok->lineno and reset
+   tok->col_offset to 0.  Expects a variable named `tok` to be in scope at
+   the point of use.  The body is wrapped in do { } while (0) so that
+   `ADVANCE_LINENO();` expands to exactly one statement and stays correct
+   as the body of an unbraced if/else. */
+#define ADVANCE_LINENO() \
+    do { \
+        tok->lineno++; \
+        tok->col_offset = 0; \
+    } while (0)
+
+#define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0) /* true once the mode stack index has moved above entry 0 */
+#define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0) /* true while curly_bracket_expr_start_depth is non-negative */
+#ifdef Py_DEBUG /* debug builds: real functions so the stack index can be bounds-checked with assert() */
+static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { /* returns a pointer to the current (top) entry of tok_mode_stack */
+ assert(tok->tok_mode_stack_index >= 0);
+ assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL);
+ return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
+}
+static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { /* pushes: pre-increments the stack index and returns the new top entry */
+ assert(tok->tok_mode_stack_index >= 0);
+ assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL); /* the slot about to be used must exist */
+ return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
+}
+#else /* non-debug builds: same expressions as macros, with no bounds checking */
+#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
+#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index])) /* note: modifies tok->tok_mode_stack_index */
+#endif
+
/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
@@ -71,6 +91,8 @@ tok_new(void)
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
tok->lineno = 0;
+ tok->starting_col_offset = -1;
+ tok->col_offset = -1;
tok->level = 0;
tok->altindstack[0] = 0;
tok->decoding_state = STATE_INIT;
@@ -81,6 +103,7 @@ tok_new(void)
tok->filename = NULL;
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
+ tok->readline = NULL;
tok->type_comments = 0;
tok->async_hacks = 0;
tok->async_def = 0;
@@ -89,6 +112,14 @@ tok_new(void)
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
tok->report_warnings = 1;
+ tok->tok_extra_tokens = 0;
+ tok->comment_newline = 0;
+ tok->implicit_newline = 0;
+ tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
+ tok->tok_mode_stack_index = 0;
+#ifdef Py_DEBUG
+ tok->debug = _Py_GetConfig()->parser_debug;
+#endif
return tok;
}
@@ -109,8 +140,9 @@ static char *
error_ret(struct tok_state *tok) /* XXX */
{
tok->decoding_erred = 1;
- if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
+ if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */
PyMem_Free(tok->buf);
+ }
tok->buf = tok->cur = tok->inp = NULL;
tok->start = NULL;
tok->end = NULL;
@@ -323,16 +355,182 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
return -1;
}
strcpy(new_str + current_size, line);
+ tok->implicit_newline = 0;
if (last_char != '\n') {
/* Last line does not end in \n, fake one */
new_str[current_size + line_size - 1] = '\n';
new_str[current_size + line_size] = '\0';
+ tok->implicit_newline = 1;
}
tok->interactive_src_start = new_str;
tok->interactive_src_end = new_str + current_size + line_size;
return 0;
}
+/* Walk the mode stack from the top entry down to entry 0 and record each
+   f-string pointer as an offset from tok->buf, so that the pointers can be
+   rebuilt once tok->buf has been reallocated */
+static void
+remember_fstring_buffers(struct tok_state *tok)
+{
+    for (int depth = tok->tok_mode_stack_index; depth >= 0; depth--) {
+        tokenizer_mode *entry = &tok->tok_mode_stack[depth];
+
+        entry->f_string_start_offset = entry->f_string_start - tok->buf;
+        entry->f_string_multi_line_start_offset =
+            entry->f_string_multi_line_start - tok->buf;
+    }
+}
+
+/* Walk the mode stack from the top entry down to entry 0 and rebuild each
+   f-string pointer from its saved offset and the (possibly new) tok->buf */
+static void
+restore_fstring_buffers(struct tok_state *tok)
+{
+    for (int depth = tok->tok_mode_stack_index; depth >= 0; depth--) {
+        tokenizer_mode *entry = &tok->tok_mode_stack[depth];
+
+        entry->f_string_start = tok->buf + entry->f_string_start_offset;
+        entry->f_string_multi_line_start =
+            tok->buf + entry->f_string_multi_line_start_offset;
+    }
+}
+
+/* Store the text of the current mode's saved expression on TOKEN as its
+   metadata (a new str object decoded from UTF-8).
+
+   C is the character that ended the expression and must be '}', ':' or '!'.
+   Nothing is done when the current mode does not have f_string_debug set or
+   when TOKEN already has metadata.  The text used is the first
+   (last_expr_size - last_expr_end) bytes of last_expr_buffer; if it contains
+   a '#', everything from each '#' up to (but not including) the next newline
+   is dropped.
+
+   Returns 0 on success or when there is nothing to do, and -1 when the
+   scratch allocation or the UTF-8 decoding fails. */
+static int
+set_fstring_expr(struct tok_state* tok, struct token *token, char c) {
+    assert(token != NULL);
+    assert(c == '}' || c == ':' || c == '!');
+    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
+
+    if (!tok_mode->f_string_debug || token->metadata) {
+        return 0;
+    }
+
+    const char *expr = tok_mode->last_expr_buffer;
+    Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end;
+    PyObject *res = NULL;
+
+    // Check if there is a # character in the expression
+    int hash_detected = 0;
+    for (Py_ssize_t i = 0; i < input_length; i++) {
+        if (expr[i] == '#') {
+            hash_detected = 1;
+            break;
+        }
+    }
+
+    if (hash_detected) {
+        char *result = (char *)PyObject_Malloc((input_length + 1) * sizeof(char));
+        if (!result) {
+            return -1;
+        }
+
+        Py_ssize_t i = 0;
+        Py_ssize_t j = 0;
+
+        for (i = 0, j = 0; i < input_length; i++) {
+            if (expr[i] == '#') {
+                // Skip characters until newline or end of string.
+                // The bound is tested first so expr[i] is never read
+                // at i == input_length.
+                while (i < input_length && expr[i] != '\0') {
+                    if (expr[i] == '\n') {
+                        result[j++] = expr[i];
+                        break;
+                    }
+                    i++;
+                }
+            } else {
+                result[j++] = expr[i];
+            }
+        }
+
+        result[j] = '\0';  // Null-terminate the result string
+        res = PyUnicode_DecodeUTF8(result, j, NULL);
+        PyObject_Free(result);
+    } else {
+        res = PyUnicode_DecodeUTF8(expr, input_length, NULL);
+    }
+
+    if (!res) {
+        return -1;
+    }
+    token->metadata = res;
+    return 0;
+}
+
+/* Maintain the current mode's saved expression text (last_expr_buffer,
+   last_expr_size, last_expr_end).  CUR selects the action:
+
+     0             append the text at tok->cur to the buffer; does nothing
+                   when there is no buffer or last_expr_end is already >= 0.
+     '{'           discard any previous buffer and start a new one holding a
+                   copy of the text at tok->cur; last_expr_end becomes -1.
+     '}' '!' ':'   if last_expr_end is still -1, set it to
+                   strlen(tok->start).
+
+   The buffer is a raw byte copy and is NOT NUL-terminated; its length is
+   last_expr_size.
+
+   Returns 1 on success.  On allocation failure sets tok->done = E_NOMEM and
+   returns 0. */
+static int
+update_fstring_expr(struct tok_state *tok, char cur)
+{
+    assert(tok->cur != NULL);
+
+    Py_ssize_t size = strlen(tok->cur);
+    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
+
+    switch (cur) {
+        case 0: {
+            if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
+                return 1;
+            }
+            char *new_buffer = PyMem_Realloc(
+                tok_mode->last_expr_buffer,
+                tok_mode->last_expr_size + size
+            );
+            if (new_buffer == NULL) {
+                /* The old block is still allocated after a failed realloc.
+                   Free it and reset the fields, so that
+                   free_fstring_expressions() does not free the same
+                   pointer a second time. */
+                PyMem_Free(tok_mode->last_expr_buffer);
+                tok_mode->last_expr_buffer = NULL;
+                tok_mode->last_expr_size = 0;
+                tok_mode->last_expr_end = -1;
+                goto error;
+            }
+            tok_mode->last_expr_buffer = new_buffer;
+            /* size == strlen(tok->cur): copy the bytes without the NUL */
+            memcpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
+            tok_mode->last_expr_size += size;
+            break;
+        }
+        case '{':
+            /* PyMem_Free(NULL) is a no-op, so no guard is needed */
+            PyMem_Free(tok_mode->last_expr_buffer);
+            tok_mode->last_expr_buffer = PyMem_Malloc(size);
+            if (tok_mode->last_expr_buffer == NULL) {
+                goto error;
+            }
+            tok_mode->last_expr_size = size;
+            tok_mode->last_expr_end = -1;
+            memcpy(tok_mode->last_expr_buffer, tok->cur, size);
+            break;
+        case '}':
+        case '!':
+        case ':':
+            if (tok_mode->last_expr_end == -1) {
+                tok_mode->last_expr_end = strlen(tok->start);
+            }
+            break;
+        default:
+            Py_UNREACHABLE();
+    }
+    return 1;
+error:
+    tok->done = E_NOMEM;
+    return 0;
+}
+
+/* Free the saved expression buffer of every mode stack entry from the top
+   entry down to entry 0, and reset that entry's size/end bookkeeping.
+   Entries without a buffer are left untouched. */
+static void
+free_fstring_expressions(struct tok_state *tok)
+{
+    for (int depth = tok->tok_mode_stack_index; depth >= 0; depth--) {
+        tokenizer_mode *entry = &tok->tok_mode_stack[depth];
+
+        if (entry->last_expr_buffer == NULL) {
+            continue;
+        }
+        PyMem_Free(entry->last_expr_buffer);
+        entry->last_expr_buffer = NULL;
+        entry->last_expr_size = 0;
+        entry->last_expr_end = -1;
+    }
+}
/* Read a line of text from TOK into S, using the stream in TOK.
Return NULL on failure, else S.
@@ -360,6 +558,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
+ remember_fstring_buffers(tok);
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
if (newbuf == NULL) {
tok->done = E_NOMEM;
@@ -372,6 +571,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
tok->start = start < 0 ? NULL : tok->buf + start;
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
+ restore_fstring_buffers(tok);
}
return 1;
}
@@ -436,7 +636,7 @@ error:
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
- PyObject *readline, *io, *stream;
+ PyObject *readline, *open, *stream;
int fd;
long pos;
@@ -453,13 +653,13 @@ fp_setreadl(struct tok_state *tok, const char* enc)
return 0;
}
- io = PyImport_ImportModule("io");
- if (io == NULL) {
+ open = _PyImport_GetModuleAttrString("io", "open");
+ if (open == NULL) {
return 0;
}
- stream = _PyObject_CallMethod(io, &_Py_ID(open), "isisOOO",
+ stream = PyObject_CallFunction(open, "isisOOO",
fd, "r", -1, enc, Py_None, Py_None, Py_False);
- Py_DECREF(io);
+ Py_DECREF(open);
if (stream == NULL) {
return 0;
}
@@ -620,7 +820,8 @@ translate_into_utf8(const char* str, const char* enc) {
static char *
-translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
+translate_newlines(const char *s, int exec_input, int preserve_crlf,
+ struct tok_state *tok) {
int skip_next_lf = 0;
#if defined(__has_feature)
# if __has_feature(memory_sanitizer)
@@ -645,7 +846,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
break;
}
}
- if (c == '\r') {
+ if (!preserve_crlf && c == '\r') {
skip_next_lf = 1;
c = '\n';
}
@@ -653,7 +854,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
}
/* If this is exec input, add a newline to the end of the string if
there isn't one already. */
- if (exec_input && c != '\n') {
+ if (exec_input && c != '\n' && c != '\0') {
*current = '\n';
current++;
}
@@ -675,14 +876,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
inside TOK. */
static char *
-decode_str(const char *input, int single, struct tok_state *tok)
+decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
PyObject* utf8 = NULL;
char *str;
const char *s;
const char *newl[2] = {NULL, NULL};
int lineno = 0;
- tok->input = str = translate_newlines(input, single, tok);
+ tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
if (str == NULL)
return NULL;
tok->enc = NULL;
@@ -734,14 +935,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
/* Set up tokenizer for string */
struct tok_state *
-_PyTokenizer_FromString(const char *str, int exec_input)
+_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
struct tok_state *tok = tok_new();
char *decoded;
if (tok == NULL)
return NULL;
- decoded = decode_str(str, exec_input, tok);
+ decoded = decode_str(str, exec_input, tok, preserve_crlf);
if (decoded == NULL) {
_PyTokenizer_Free(tok);
return NULL;
@@ -752,16 +953,43 @@ _PyTokenizer_FromString(const char *str, int exec_input)
return tok;
}
+/* Set up tokenizer for input produced by a readline callable.
+
+   READLINE is called with no arguments each time more input is needed; the
+   tokenizer keeps its own strong reference to it.  When ENC is non-NULL a
+   copy is stored in tok->encoding (lines are then expected to be bytes and
+   are decoded with it); when ENC is NULL lines are expected to be str.
+   EXEC_INPUT and PRESERVE_CRLF are accepted but not used by this function.
+
+   Returns a new tok_state, or NULL on allocation failure. */
+struct tok_state *
+_PyTokenizer_FromReadline(PyObject* readline, const char* enc,
+                          int exec_input, int preserve_crlf)
+{
+    struct tok_state *tok = tok_new();
+    if (tok == NULL)
+        return NULL;
+    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
+        _PyTokenizer_Free(tok);
+        return NULL;
+    }
+    tok->cur = tok->inp = tok->buf;
+    tok->end = tok->buf + BUFSIZ;
+    tok->fp = NULL;
+    /* Store the callable before anything else can fail.
+       _PyTokenizer_Free() only releases tok->buf when tok->readline or
+       tok->fp is set, so bailing out below with both still NULL would leak
+       the buffer allocated above. */
+    Py_INCREF(readline);
+    tok->readline = readline;
+    if (enc != NULL) {
+        tok->encoding = new_string(enc, strlen(enc), tok);
+        if (!tok->encoding) {
+            _PyTokenizer_Free(tok);
+            return NULL;
+        }
+    }
+    tok->decoding_state = STATE_NORMAL;
+    return tok;
+}
+
/* Set up tokenizer for UTF-8 string */
struct tok_state *
-_PyTokenizer_FromUTF8(const char *str, int exec_input)
+_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
{
struct tok_state *tok = tok_new();
char *translated;
if (tok == NULL)
return NULL;
- tok->input = translated = translate_newlines(str, exec_input, tok);
+ tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
if (translated == NULL) {
_PyTokenizer_Free(tok);
return NULL;
@@ -821,8 +1049,9 @@ _PyTokenizer_Free(struct tok_state *tok)
}
Py_XDECREF(tok->decoding_readline);
Py_XDECREF(tok->decoding_buffer);
+ Py_XDECREF(tok->readline);
Py_XDECREF(tok->filename);
- if (tok->fp != NULL && tok->buf != NULL) {
+ if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
PyMem_Free(tok->buf);
}
if (tok->input) {
@@ -831,9 +1060,20 @@ _PyTokenizer_Free(struct tok_state *tok)
if (tok->interactive_src_start != NULL) {
PyMem_Free(tok->interactive_src_start);
}
+ free_fstring_expressions(tok);
PyMem_Free(tok);
}
+void
+_PyToken_Free(struct token *token) { /* Drops the token's reference to its metadata object, if any. The token struct itself is not freed and the field is not reset. */
+ Py_XDECREF(token->metadata);
+}
+
+void
+_PyToken_Init(struct token *token) { /* Marks the token as carrying no metadata. Only the metadata field is initialized here. */
+ token->metadata = NULL;
+}
+
static int
tok_readline_raw(struct tok_state *tok)
{
@@ -860,6 +1100,67 @@ tok_readline_raw(struct tok_state *tok)
}
static int
+tok_readline_string(struct tok_state* tok) { /* Calls tok->readline() once and appends the line, as UTF-8 followed by a NUL, at tok->inp. Returns 1 on success and 0 on error. */
+ PyObject* line = NULL;
+ PyObject* raw_line = PyObject_CallNoArgs(tok->readline);
+ if (raw_line == NULL) {
+ if (PyErr_ExceptionMatches(PyExc_StopIteration)) { /* StopIteration is treated as end of input, not as an error */
+ PyErr_Clear();
+ return 1; /* nothing was appended, so tok->inp is unchanged */
+ }
+ error_ret(tok);
+ goto error;
+ }
+ if(tok->encoding != NULL) { /* an encoding was given: the callable must return bytes */
+ if (!PyBytes_Check(raw_line)) {
+ PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object");
+ error_ret(tok);
+ goto error;
+ }
+ line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line),
+ tok->encoding, "replace"); /* decoded with the "replace" error handler */
+ Py_CLEAR(raw_line); /* the bytes object is no longer needed; also keeps the error path from releasing it twice */
+ if (line == NULL) {
+ error_ret(tok);
+ goto error;
+ }
+ } else { /* no encoding: the callable must return str */
+ if(!PyUnicode_Check(raw_line)) {
+ PyErr_Format(PyExc_TypeError, "readline() returned a non-string object");
+ error_ret(tok);
+ goto error;
+ }
+ line = raw_line; /* ownership moves from raw_line to line */
+ raw_line = NULL;
+ }
+ Py_ssize_t buflen;
+ const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
+ if (buf == NULL) {
+ error_ret(tok);
+ goto error;
+ }
+
+ // Make room for the null terminator *and* potentially
+ // an extra newline character that we may need to artificially
+ // add.
+ size_t buffer_size = buflen + 2;
+ if (!tok_reserve_buf(tok, buffer_size)) {
+ goto error;
+ }
+ memcpy(tok->inp, buf, buflen);
+ tok->inp += buflen;
+ *tok->inp = '\0';
+
+ tok->line_start = tok->cur;
+ Py_DECREF(line);
+ return 1;
+error: /* common exit for failures: release whichever of the two objects is still held */
+ Py_XDECREF(raw_line);
+ Py_XDECREF(line);
+ return 0;
+}
+
+static int
tok_underflow_string(struct tok_state *tok) {
char *end = strchr(tok->inp, '\n');
if (end != NULL) {
@@ -876,7 +1177,7 @@ tok_underflow_string(struct tok_state *tok) {
tok->buf = tok->cur;
}
tok->line_start = tok->cur;
- tok->lineno++;
+ ADVANCE_LINENO();
tok->inp = end;
return 1;
}
@@ -889,7 +1190,7 @@ tok_underflow_interactive(struct tok_state *tok) {
}
char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
if (newtok != NULL) {
- char *translated = translate_newlines(newtok, 0, tok);
+ char *translated = translate_newlines(newtok, 0, 0, tok);
PyMem_Free(newtok);
if (translated == NULL) {
return 0;
@@ -934,8 +1235,9 @@ tok_underflow_interactive(struct tok_state *tok) {
}
else if (tok->start != NULL) {
Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
+ remember_fstring_buffers(tok);
size_t size = strlen(newtok);
- tok->lineno++;
+ ADVANCE_LINENO();
if (!tok_reserve_buf(tok, size + 1)) {
PyMem_Free(tok->buf);
tok->buf = NULL;
@@ -946,15 +1248,18 @@ tok_underflow_interactive(struct tok_state *tok) {
PyMem_Free(newtok);
tok->inp += size;
tok->multi_line_start = tok->buf + cur_multi_line_start;
+ restore_fstring_buffers(tok);
}
else {
- tok->lineno++;
+ remember_fstring_buffers(tok);
+ ADVANCE_LINENO();
PyMem_Free(tok->buf);
tok->buf = newtok;
tok->cur = tok->buf;
tok->line_start = tok->buf;
tok->inp = strchr(tok->buf, '\0');
tok->end = tok->inp + 1;
+ restore_fstring_buffers(tok);
}
if (tok->done != E_OK) {
if (tok->prompt != NULL) {
@@ -962,12 +1267,16 @@ tok_underflow_interactive(struct tok_state *tok) {
}
return 0;
}
+
+ if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
+ return 0;
+ }
return 1;
}
static int
tok_underflow_file(struct tok_state *tok) {
- if (tok->start == NULL) {
+ if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
tok->cur = tok->inp = tok->buf;
}
if (tok->decoding_state == STATE_INIT) {
@@ -997,14 +1306,20 @@ tok_underflow_file(struct tok_state *tok) {
tok->done = E_EOF;
return 0;
}
+ tok->implicit_newline = 0;
if (tok->inp[-1] != '\n') {
assert(tok->inp + 1 < tok->end);
/* Last line does not end in \n, fake one */
*tok->inp++ = '\n';
*tok->inp = '\0';
+ tok->implicit_newline = 1;
+ }
+
+ if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
+ return 0;
}
- tok->lineno++;
+ ADVANCE_LINENO();
if (tok->decoding_state != STATE_NORMAL) {
if (tok->lineno > 2) {
tok->decoding_state = STATE_NORMAL;
@@ -1025,6 +1340,44 @@ tok_underflow_file(struct tok_state *tok) {
return tok->done == E_OK;
}
+static int
+tok_underflow_readline(struct tok_state* tok) { /* Refills the buffer with one line from tok->readline. Returns nonzero when a line is available, 0 on end of input or error. */
+ assert(tok->decoding_state == STATE_NORMAL);
+ assert(tok->fp == NULL && tok->input == NULL && tok->decoding_readline == NULL);
+ if (tok->start == NULL && !INSIDE_FSTRING(tok)) { /* no token in progress and not inside an f-string: reuse the buffer from its start */
+ tok->cur = tok->inp = tok->buf;
+ }
+ if (!tok_readline_string(tok)) {
+ return 0;
+ }
+ if (tok->inp == tok->cur) { /* nothing was appended: end of input */
+ tok->done = E_EOF;
+ return 0;
+ }
+ tok->implicit_newline = 0;
+ if (tok->inp[-1] != '\n') {
+ assert(tok->inp + 1 < tok->end); /* tok_readline_string reserved buflen + 2 bytes for this case */
+ /* Last line does not end in \n, fake one */
+ *tok->inp++ = '\n';
+ *tok->inp = '\0';
+ tok->implicit_newline = 1; /* records that the newline was added by the tokenizer */
+ }
+
+ if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { /* a mode is pushed: append the new line to the saved expression text */
+ return 0;
+ }
+
+ ADVANCE_LINENO();
+ /* The default encoding is UTF-8, so make sure we don't have any
+ non-UTF-8 sequences in it. */
+ if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
+ error_ret(tok);
+ return 0;
+ }
+ assert(tok->done == E_OK);
+ return tok->done == E_OK;
+}
+
#if defined(Py_DEBUG)
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
@@ -1062,16 +1415,20 @@ tok_nextc(struct tok_state *tok)
int rc;
for (;;) {
if (tok->cur != tok->inp) {
- if (tok->cur - tok->buf >= INT_MAX) {
+ if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
tok->done = E_COLUMNOVERFLOW;
return EOF;
}
+ tok->col_offset++;
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
if (tok->done != E_OK) {
- return EOF;
+ return EOF;
+ }
+ if (tok->readline) {
+ rc = tok_underflow_readline(tok);
}
- if (tok->fp == NULL) {
+ else if (tok->fp == NULL) {
rc = tok_underflow_string(tok);
}
else if (tok->prompt != NULL) {
@@ -1081,7 +1438,7 @@ tok_nextc(struct tok_state *tok)
rc = tok_underflow_file(tok);
}
#if defined(Py_DEBUG)
- if (Py_DebugFlag) {
+ if (tok->debug) {
fprintf(stderr, "line[%d] = ", tok->lineno);
print_escape(stderr, tok->cur, tok->inp - tok->cur);
fprintf(stderr, " tok->done = %d\n", tok->done);
@@ -1111,9 +1468,10 @@ tok_backup(struct tok_state *tok, int c)
if (--tok->cur < tok->buf) {
Py_FatalError("tokenizer beginning of buffer");
}
- if ((int)(unsigned char)*tok->cur != c) {
+ if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) {
Py_FatalError("tok_backup: wrong character");
}
+ tok->col_offset--;
}
}
@@ -1122,6 +1480,12 @@ _syntaxerror_range(struct tok_state *tok, const char *format,
int col_offset, int end_col_offset,
va_list vargs)
{
+ // In release builds, we don't want to overwrite a previous error, but in debug builds we
+ // want to fail if we are not doing it so we can fix it.
+ assert(tok->done != E_ERROR);
+ if (tok->done == E_ERROR) {
+ return ERRORTOKEN;
+ }
PyObject *errmsg, *errtext, *args;
errmsg = PyUnicode_FromFormatV(format, vargs);
if (!errmsg) {
@@ -1167,12 +1531,9 @@ error:
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
+ // These errors are cleaned on startup. TODO: fix it.
va_list vargs;
-#ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format);
-#else
- va_start(vargs);
-#endif
int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
va_end(vargs);
return ret;
@@ -1184,18 +1545,12 @@ syntaxerror_known_range(struct tok_state *tok,
const char *format, ...)
{
va_list vargs;
-#ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format);
-#else
- va_start(vargs);
-#endif
int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
va_end(vargs);
return ret;
}
-
-
static int
indenterror(struct tok_state *tok)
{
@@ -1213,11 +1568,7 @@ parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
PyObject *errmsg;
va_list vargs;
-#ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format);
-#else
- va_start(vargs);
-#endif
errmsg = PyUnicode_FromFormatV(format, vargs);
va_end(vargs);
if (!errmsg) {
@@ -1244,6 +1595,40 @@ error:
}
static int
+warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char)
+{
+    /* Emit a SyntaxWarning for the invalid escape sequence
+       '\<first_invalid_escape_char>' at tok->filename / tok->lineno.
+       Does nothing and returns 0 when tok->report_warnings is off.
+       Returns 0 when the warning was issued, -1 when building or issuing it
+       failed, and the result of syntaxerror() when the warning itself was
+       raised as a SyntaxWarning exception. */
+    if (!tok->report_warnings) {
+        return 0;
+    }
+
+    const char bad_char = (char) first_invalid_escape_char;
+    PyObject *msg = PyUnicode_FromFormat("invalid escape sequence '\\%c'", bad_char);
+    if (msg == NULL) {
+        return -1;
+    }
+
+    int warn_rc = PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename,
+                                           tok->lineno, NULL, NULL);
+    /* The message is not needed after the call, whatever the outcome */
+    Py_DECREF(msg);
+    if (warn_rc >= 0) {
+        return 0;
+    }
+    if (!PyErr_ExceptionMatches(PyExc_SyntaxWarning)) {
+        return -1;
+    }
+
+    /* Replace the SyntaxWarning exception with a SyntaxError
+       to get a more accurate error report */
+    PyErr_Clear();
+    return syntaxerror(tok, "invalid escape sequence '\\%c'", bad_char);
+}
+
+static int
lookahead(struct tok_state *tok, const char *test)
{
const char *s = test;
@@ -1267,8 +1652,12 @@ lookahead(struct tok_state *tok, const char *test)
}
static int
-verify_end_of_number(struct tok_state *tok, int c, const char *kind)
-{
+verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
+ if (tok->tok_extra_tokens) {
+ // When we are parsing extra tokens, we don't want to emit warnings
+ // about invalid literals, because we want to be a bit more liberal.
+ return 1;
+ }
/* Emit a deprecation warning only if the numeric literal is immediately
* followed by one of keywords which can occur after a numeric literal
* in valid code: "and", "else", "for", "if", "in", "is" and "or".
@@ -1326,6 +1715,9 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind)
static int
verify_identifier(struct tok_state *tok)
{
+ if (tok->tok_extra_tokens) {
+ return 1;
+ }
PyObject *s;
if (tok->decoding_erred)
return 0;
@@ -1361,14 +1753,11 @@ verify_identifier(struct tok_state *tok)
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
}
Py_DECREF(s);
- // PyUnicode_FromFormatV() does not support %X
- char hex[9];
- (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
if (Py_UNICODE_ISPRINTABLE(ch)) {
- syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+ syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
}
else {
- syntaxerror(tok, "invalid non-printable character U+%s", hex);
+ syntaxerror(tok, "invalid non-printable character U+%04X", ch);
}
return 0;
}
@@ -1398,11 +1787,13 @@ tok_decimal_tail(struct tok_state *tok)
return c;
}
-/* Get next token, after space stripping etc. */
static inline int
tok_continuation_line(struct tok_state *tok) {
int c = tok_nextc(tok);
+ if (c == '\r') {
+ c = tok_nextc(tok);
+ }
if (c != '\n') {
tok->done = E_LINECONT;
return -1;
@@ -1419,16 +1810,56 @@ tok_continuation_line(struct tok_state *tok) {
}
static int
-tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
+type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset, /* Fills TOKEN using column offsets supplied by the caller and returns TYPE unchanged. */
+ int end_col_offset, const char *start, const char *end)
+{
+ token->level = tok->level;
+ token->lineno = token->end_lineno = tok->lineno; /* start and end line are both the current line */
+ token->col_offset = col_offset;
+ token->end_col_offset = end_col_offset;
+ token->start = start;
+ token->end = end;
+ return type;
+}
+
+/* Fill TOKEN from the tokenizer state and return TYPE unchanged.
+   START and END must be both NULL or both non-NULL.  String-literal token
+   types take their starting line from tok->first_lineno, all others from
+   tok->lineno.  Column offsets are taken from the tokenizer only when a
+   START/END span is supplied; otherwise both are -1. */
+static int
+token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
+{
+    assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
+    const int has_span = (start != NULL && end != NULL);
+
+    token->level = tok->level;
+    token->lineno = ISSTRINGLIT(type) ? tok->first_lineno : tok->lineno;
+    token->end_lineno = tok->lineno;
+    token->start = start;
+    token->end = end;
+    token->col_offset = has_span ? tok->starting_col_offset : -1;
+    token->end_col_offset = has_span ? tok->col_offset : -1;
+    return type;
+}
+
+
+static int
+tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
int c;
int blankline, nonascii;
- *p_start = *p_end = NULL;
+ const char *p_start = NULL;
+ const char *p_end = NULL;
nextline:
tok->start = NULL;
+ tok->starting_col_offset = -1;
blankline = 0;
+
/* Get indentation level */
if (tok->atbol) {
int col = 0;
@@ -1454,7 +1885,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
// the level of indentation of whatever comes next.
cont_line_col = cont_line_col ? cont_line_col : col;
if ((c = tok_continuation_line(tok)) == -1) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else {
@@ -1462,7 +1893,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
}
tok_backup(tok, c);
- if (c == '#' || c == '\n') {
+ if (c == '#' || c == '\n' || c == '\r') {
/* Lines with only whitespace and/or comments
shouldn't affect the indentation and are
not passed to the parser as NEWLINE tokens,
@@ -1489,7 +1920,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (col == tok->indstack[tok->indent]) {
/* No change */
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
}
else if (col > tok->indstack[tok->indent]) {
@@ -1497,10 +1928,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (tok->indent+1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
if (altcol <= tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
tok->pendin++;
tok->indstack[++tok->indent] = col;
@@ -1516,26 +1947,35 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (col != tok->indstack[tok->indent]) {
tok->done = E_DEDENT;
tok->cur = tok->inp;
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
}
}
}
tok->start = tok->cur;
+ tok->starting_col_offset = tok->col_offset;
/* Return pending indents/dedents */
- if (tok->pendin != 0) {
+ if (tok->pendin != 0) {
if (tok->pendin < 0) {
+ if (tok->tok_extra_tokens) {
+ p_start = tok->cur;
+ p_end = tok->cur;
+ }
tok->pendin++;
- return DEDENT;
+ return MAKE_TOKEN(DEDENT);
}
else {
+ if (tok->tok_extra_tokens) {
+ p_start = tok->buf;
+ p_end = tok->cur;
+ }
tok->pendin--;
- return INDENT;
+ return MAKE_TOKEN(INDENT);
}
}
@@ -1573,25 +2013,36 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
/* Set start of current token */
tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
+ tok->starting_col_offset = tok->col_offset - 1;
/* Skip comment, unless it's a type comment */
if (c == '#') {
- const char *prefix, *p, *type_start;
- while (c != EOF && c != '\n') {
+ const char* p = NULL;
+ const char *prefix, *type_start;
+ int current_starting_col_offset;
+
+ while (c != EOF && c != '\n' && c != '\r') {
c = tok_nextc(tok);
}
+ if (tok->tok_extra_tokens) {
+ p = tok->start;
+ }
+
if (tok->type_comments) {
p = tok->start;
+ current_starting_col_offset = tok->starting_col_offset;
prefix = type_comment_prefix;
while (*prefix && p < tok->cur) {
if (*prefix == ' ') {
while (*p == ' ' || *p == '\t') {
p++;
+ current_starting_col_offset++;
}
} else if (*prefix == *p) {
p++;
+ current_starting_col_offset++;
} else {
break;
}
@@ -1602,7 +2053,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
/* This is a type comment if we matched all of type_comment_prefix. */
if (!*prefix) {
int is_type_ignore = 1;
+ // +6 in order to skip the word 'ignore'
const char *ignore_end = p + 6;
+ const int ignore_end_col_offset = current_starting_col_offset + 6;
tok_backup(tok, c); /* don't eat the newline or EOF */
type_start = p;
@@ -1615,34 +2068,41 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
&& ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
if (is_type_ignore) {
- *p_start = ignore_end;
- *p_end = tok->cur;
+ p_start = ignore_end;
+ p_end = tok->cur;
/* If this type ignore is the only thing on the line, consume the newline also. */
if (blankline) {
tok_nextc(tok);
tok->atbol = 1;
}
- return TYPE_IGNORE;
+ return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
} else {
- *p_start = type_start; /* after type_comment_prefix */
- *p_end = tok->cur;
- return TYPE_COMMENT;
+ p_start = type_start;
+ p_end = tok->cur;
+ return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
}
}
}
+ if (tok->tok_extra_tokens) {
+ tok_backup(tok, c); /* don't eat the newline or EOF */
+ p_start = p;
+ p_end = tok->cur;
+ tok->comment_newline = blankline;
+ return MAKE_TOKEN(COMMENT);
+ }
}
if (tok->done == E_INTERACT_STOP) {
- return ENDMARKER;
+ return MAKE_TOKEN(ENDMARKER);
}
/* Check for EOF and errors now */
if (c == EOF) {
if (tok->level) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
- return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
+ return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
}
/* Identifier (most frequent token!) */
@@ -1671,6 +2131,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
c = tok_nextc(tok);
if (c == '"' || c == '\'') {
+ if (saw_f) {
+ goto f_string_quote;
+ }
goto letter_quote;
}
}
@@ -1682,11 +2145,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
tok_backup(tok, c);
if (nonascii && !verify_identifier(tok)) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
- *p_start = tok->start;
- *p_end = tok->cur;
+ p_start = tok->start;
+ p_end = tok->cur;
/* async/await parsing block. */
if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
@@ -1701,10 +2164,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (!tok->async_hacks || tok->async_def) {
/* Always recognize the keywords. */
if (memcmp(tok->start, "async", 5) == 0) {
- return ASYNC;
+ return MAKE_TOKEN(ASYNC);
}
if (memcmp(tok->start, "await", 5) == 0) {
- return AWAIT;
+ return MAKE_TOKEN(AWAIT);
}
}
else if (memcmp(tok->start, "async", 5) == 0) {
@@ -1712,13 +2175,14 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Look ahead one token to see if that is 'def'. */
struct tok_state ahead_tok;
- const char *ahead_tok_start = NULL;
- const char *ahead_tok_end = NULL;
+ struct token ahead_token;
+ _PyToken_Init(&ahead_token);
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
- ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
- &ahead_tok_end);
+ ahead_tok_kind = tok_get_normal_mode(&ahead_tok,
+ current_tok,
+ &ahead_token);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
@@ -1728,29 +2192,49 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
returning a plain NAME token, return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
- return ASYNC;
+ _PyToken_Free(&ahead_token);
+ return MAKE_TOKEN(ASYNC);
}
+ _PyToken_Free(&ahead_token);
}
}
- return NAME;
+ return MAKE_TOKEN(NAME);
+ }
+
+ if (c == '\r') {
+ c = tok_nextc(tok);
}
/* Newline */
if (c == '\n') {
tok->atbol = 1;
if (blankline || tok->level > 0) {
+ if (tok->tok_extra_tokens) {
+ if (tok->comment_newline) {
+ tok->comment_newline = 0;
+ }
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NL);
+ }
goto nextline;
}
- *p_start = tok->start;
- *p_end = tok->cur - 1; /* Leave '\n' out of the string */
+ if (tok->comment_newline && tok->tok_extra_tokens) {
+ tok->comment_newline = 0;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NL);
+ }
+ p_start = tok->start;
+ p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
if (tok->async_def) {
/* We're somewhere inside an 'async def' function, and
we've encountered a NEWLINE after its signature. */
tok->async_def_nl = 1;
}
- return NEWLINE;
+ return MAKE_TOKEN(NEWLINE);
}
/* Period or number starting with period? */
@@ -1761,9 +2245,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
} else if (c == '.') {
c = tok_nextc(tok);
if (c == '.') {
- *p_start = tok->start;
- *p_end = tok->cur;
- return ELLIPSIS;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(ELLIPSIS);
}
else {
tok_backup(tok, c);
@@ -1773,9 +2257,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
else {
tok_backup(tok, c);
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return DOT;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(DOT);
}
/* Number */
@@ -1792,14 +2276,14 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
if (!isxdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid hexadecimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal"));
}
do {
c = tok_nextc(tok);
} while (isxdigit(c));
} while (c == '_');
if (!verify_end_of_number(tok, c, "hexadecimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (c == 'o' || c == 'O') {
@@ -1811,12 +2295,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
if (c < '0' || c >= '8') {
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ return MAKE_TOKEN(syntaxerror(tok,
+ "invalid digit '%c' in octal literal", c));
}
else {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid octal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal"));
}
}
do {
@@ -1824,11 +2308,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
} while ('0' <= c && c < '8');
} while (c == '_');
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ return MAKE_TOKEN(syntaxerror(tok,
+ "invalid digit '%c' in octal literal", c));
}
if (!verify_end_of_number(tok, c, "octal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (c == 'b' || c == 'B') {
@@ -1840,12 +2324,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
if (c != '0' && c != '1') {
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
}
else {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid binary literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal"));
}
}
do {
@@ -1853,11 +2336,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
} while (c == '0' || c == '1');
} while (c == '_');
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
}
if (!verify_end_of_number(tok, c, "binary")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else {
@@ -1869,7 +2351,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
c = tok_nextc(tok);
if (!isdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
}
}
if (c != '0') {
@@ -1882,7 +2364,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
nonzero = 1;
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == '.') {
@@ -1895,18 +2377,18 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
else if (c == 'j' || c == 'J') {
goto imaginary;
}
- else if (nonzero) {
+ else if (nonzero && !tok->tok_extra_tokens) {
/* Old-style octal: now disallowed. */
tok_backup(tok, c);
- return syntaxerror_known_range(
+ return MAKE_TOKEN(syntaxerror_known_range(
tok, (int)(tok->start + 1 - tok->line_start),
(int)(zeros_end - tok->line_start),
"leading zeros in decimal integer "
"literals are not permitted; "
- "use an 0o prefix for octal integers");
+ "use an 0o prefix for octal integers"));
}
if (!verify_end_of_number(tok, c, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
@@ -1914,7 +2396,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
/* Decimal */
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
{
/* Accept floating point numbers. */
@@ -1925,7 +2407,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (isdigit(c)) {
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
@@ -1939,21 +2421,21 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
c = tok_nextc(tok);
if (!isdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
}
} else if (!isdigit(c)) {
tok_backup(tok, c);
if (!verify_end_of_number(tok, e, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
tok_backup(tok, e);
- *p_start = tok->start;
- *p_end = tok->cur;
- return NUMBER;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NUMBER);
}
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == 'j' || c == 'J') {
@@ -1961,18 +2443,85 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
imaginary:
c = tok_nextc(tok);
if (!verify_end_of_number(tok, c, "imaginary")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (!verify_end_of_number(tok, c, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
tok_backup(tok, c);
- *p_start = tok->start;
- *p_end = tok->cur;
- return NUMBER;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NUMBER);
+ }
+
+ f_string_quote:
+ if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) {
+ int quote = c;
+ int quote_size = 1; /* 1 or 3 */
+
+ /* Nodes of type STRING, especially multi line strings
+ must be handled differently in order to get both
+ the starting line number and the column offset right.
+ (cf. issue 16806) */
+ tok->first_lineno = tok->lineno;
+ tok->multi_line_start = tok->line_start;
+
+ /* Find the quote size and start of string */
+ int after_quote = tok_nextc(tok);
+ if (after_quote == quote) {
+ int after_after_quote = tok_nextc(tok);
+ if (after_after_quote == quote) {
+ quote_size = 3;
+ }
+ else {
+ // TODO: Check this
+ tok_backup(tok, after_after_quote);
+ tok_backup(tok, after_quote);
+ }
+ }
+ if (after_quote != quote) {
+ tok_backup(tok, after_quote);
+ }
+
+
+ p_start = tok->start;
+ p_end = tok->cur;
+ if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) {
+ return MAKE_TOKEN(syntaxerror(tok, "too many nested f-strings"));
+ }
+ tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok);
+ the_current_tok->kind = TOK_FSTRING_MODE;
+ the_current_tok->f_string_quote = quote;
+ the_current_tok->f_string_quote_size = quote_size;
+ the_current_tok->f_string_start = tok->start;
+ the_current_tok->f_string_multi_line_start = tok->line_start;
+ the_current_tok->f_string_line_start = tok->lineno;
+ the_current_tok->f_string_start_offset = -1;
+ the_current_tok->f_string_multi_line_start_offset = -1;
+ the_current_tok->last_expr_buffer = NULL;
+ the_current_tok->last_expr_size = 0;
+ the_current_tok->last_expr_end = -1;
+ the_current_tok->f_string_debug = 0;
+
+ switch (*tok->start) {
+ case 'F':
+ case 'f':
+ the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r';
+ break;
+ case 'R':
+ case 'r':
+ the_current_tok->f_string_raw = 1;
+ break;
+ default:
+ Py_UNREACHABLE();
+ }
+
+ the_current_tok->curly_bracket_depth = 0;
+ the_current_tok->curly_bracket_expr_start_depth = -1;
+ return MAKE_TOKEN(FSTRING_START);
}
letter_quote:
@@ -2008,7 +2557,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
while (end_quote_size != quote_size) {
c = tok_nextc(tok);
if (tok->done == E_ERROR) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
if (tok->done == E_DECODE) {
break;
@@ -2023,13 +2572,27 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
tok->line_start = tok->multi_line_start;
int start = tok->lineno;
tok->lineno = tok->first_lineno;
+
+ if (INSIDE_FSTRING(tok)) {
+ /* When we are in an f-string, before raising the
+ * unterminated string literal error, check whether
+ * does the initial quote matches with f-strings quotes
+ * and if it is, then this must be a missing '}' token
+ * so raise the proper error */
+ tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
+ if (the_current_tok->f_string_quote == quote &&
+ the_current_tok->f_string_quote_size == quote_size) {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start));
+ }
+ }
+
if (quote_size == 3) {
syntaxerror(tok, "unterminated triple-quoted string literal"
" (detected at line %d)", start);
if (c != '\n') {
tok->done = E_EOFS;
}
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
else {
syntaxerror(tok, "unterminated string literal (detected at"
@@ -2037,7 +2600,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (c != '\n') {
tok->done = E_EOLS;
}
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == quote) {
@@ -2046,41 +2609,66 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
else {
end_quote_size = 0;
if (c == '\\') {
- tok_nextc(tok); /* skip escaped char */
+ c = tok_nextc(tok); /* skip escaped char */
+ if (c == '\r') {
+ c = tok_nextc(tok);
+ }
}
}
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return STRING;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(STRING);
}
/* Line continuation */
if (c == '\\') {
if ((c = tok_continuation_line(tok)) == -1) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
tok->cont_line = 1;
goto again; /* Read next line */
}
+ /* Punctuation character */
+ int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
+ if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) {
+ /* This code block gets executed before the curly_bracket_depth is incremented
+ * by the `{` case, so for ensuring that we are on the 0th level, we need
+ * to adjust it manually */
+ int cursor = current_tok->curly_bracket_depth - (c != '{');
+ if (cursor == 0 && !update_fstring_expr(tok, c)) {
+ return MAKE_TOKEN(ENDMARKER);
+ }
+ if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) {
+ return MAKE_TOKEN(ERRORTOKEN);
+ }
+
+ if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
+ current_tok->kind = TOK_FSTRING_MODE;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(_PyToken_OneChar(c));
+ }
+ }
+
/* Check for two-character token */
{
int c2 = tok_nextc(tok);
- int token = PyToken_TwoChars(c, c2);
- if (token != OP) {
+ int current_token = _PyToken_TwoChars(c, c2);
+ if (current_token != OP) {
int c3 = tok_nextc(tok);
- int token3 = PyToken_ThreeChars(c, c2, c3);
- if (token3 != OP) {
- token = token3;
+ int current_token3 = _PyToken_ThreeChars(c, c2, c3);
+ if (current_token3 != OP) {
+ current_token = current_token3;
}
else {
tok_backup(tok, c3);
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return token;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(current_token);
}
tok_backup(tok, c2);
}
@@ -2091,58 +2679,305 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
case '[':
case '{':
if (tok->level >= MAXLEVEL) {
- return syntaxerror(tok, "too many nested parentheses");
+ return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses"));
}
tok->parenstack[tok->level] = c;
tok->parenlinenostack[tok->level] = tok->lineno;
tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
tok->level++;
+ if (INSIDE_FSTRING(tok)) {
+ current_tok->curly_bracket_depth++;
+ }
break;
case ')':
case ']':
case '}':
- if (!tok->level) {
- return syntaxerror(tok, "unmatched '%c'", c);
- }
- tok->level--;
- int opening = tok->parenstack[tok->level];
- if (!((opening == '(' && c == ')') ||
- (opening == '[' && c == ']') ||
- (opening == '{' && c == '}')))
- {
- if (tok->parenlinenostack[tok->level] != tok->lineno) {
- return syntaxerror(tok,
- "closing parenthesis '%c' does not match "
- "opening parenthesis '%c' on line %d",
- c, opening, tok->parenlinenostack[tok->level]);
+ if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed"));
+ }
+ if (!tok->tok_extra_tokens && !tok->level) {
+ return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
+ }
+ if (tok->level > 0) {
+ tok->level--;
+ int opening = tok->parenstack[tok->level];
+ if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') ||
+ (opening == '[' && c == ']') ||
+ (opening == '{' && c == '}'))) {
+ /* If the opening bracket belongs to an f-string's expression
+ part (e.g. f"{)}") and the closing bracket is an arbitrary
+ nested expression, then instead of matching a different
+ syntactical construct with it; we'll throw an unmatched
+ parentheses error. */
+ if (INSIDE_FSTRING(tok) && opening == '{') {
+ assert(current_tok->curly_bracket_depth >= 0);
+ int previous_bracket = current_tok->curly_bracket_depth - 1;
+ if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c));
+ }
+ }
+ if (tok->parenlinenostack[tok->level] != tok->lineno) {
+ return MAKE_TOKEN(syntaxerror(tok,
+ "closing parenthesis '%c' does not match "
+ "opening parenthesis '%c' on line %d",
+ c, opening, tok->parenlinenostack[tok->level]));
+ }
+ else {
+ return MAKE_TOKEN(syntaxerror(tok,
+ "closing parenthesis '%c' does not match "
+ "opening parenthesis '%c'",
+ c, opening));
+ }
}
- else {
- return syntaxerror(tok,
- "closing parenthesis '%c' does not match "
- "opening parenthesis '%c'",
- c, opening);
+ }
+
+ if (INSIDE_FSTRING(tok)) {
+ current_tok->curly_bracket_depth--;
+ if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
+ current_tok->curly_bracket_expr_start_depth--;
+ current_tok->kind = TOK_FSTRING_MODE;
+ current_tok->f_string_debug = 0;
}
}
break;
+ default:
+ break;
}
if (!Py_UNICODE_ISPRINTABLE(c)) {
- char hex[9];
- (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
- return syntaxerror(tok, "invalid non-printable character U+%s", hex);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%04X", c));
+ }
+
+ if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
+ current_tok->f_string_debug = 1;
}
/* Punctuation character */
- *p_start = tok->start;
- *p_end = tok->cur;
- return PyToken_OneChar(c);
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(_PyToken_OneChar(c));
+}
+
+static int
+tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
+{ /* Scan the next token while positioned in the literal part of an f-string: yields FSTRING_MIDDLE, FSTRING_END or an error token through MAKE_TOKEN, or defers to tok_get_normal_mode() when a replacement field starts at the cursor. */
+ const char *p_start = NULL; /* start of the token text handed to MAKE_TOKEN */
+ const char *p_end = NULL; /* end of the token text handed to MAKE_TOKEN */
+ int end_quote_size = 0; /* consecutive closing-quote characters seen so far */
+ int unicode_escape = 0; /* set once "\N{" is seen in a non-raw f-string */
+
+ tok->start = tok->cur; /* the token begins at the current cursor */
+ tok->first_lineno = tok->lineno;
+ tok->starting_col_offset = tok->col_offset;
+
+ // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
+ // before it.
+ int start_char = tok_nextc(tok);
+ if (start_char == '{') {
+ int peek1 = tok_nextc(tok); /* look one character further; both are pushed back below */
+ tok_backup(tok, peek1);
+ tok_backup(tok, start_char);
+ if (peek1 != '{') { /* a lone '{' opens a replacement field; "{{" falls through to the literal scan */
+ current_tok->curly_bracket_expr_start_depth++;
+ if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
+ }
+ TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; /* the expression itself is tokenized in regular mode */
+ return tok_get_normal_mode(tok, current_tok, token);
+ }
+ }
+ else {
+ tok_backup(tok, start_char);
+ }
+
+ // Check if we are at the end of the string
+ for (int i = 0; i < current_tok->f_string_quote_size; i++) { /* try to read a full closing-quote sequence */
+ int quote = tok_nextc(tok);
+ if (quote != current_tok->f_string_quote) {
+ tok_backup(tok, quote); /* only the mismatching character is pushed back; quotes already read stay in the literal */
+ goto f_string_middle;
+ }
+ }
+
+ if (current_tok->last_expr_buffer != NULL) { /* closing quotes found: release the saved expression buffer */
+ PyMem_Free(current_tok->last_expr_buffer);
+ current_tok->last_expr_buffer = NULL;
+ current_tok->last_expr_size = 0;
+ current_tok->last_expr_end = -1;
+ }
+
+ p_start = tok->start;
+ p_end = tok->cur;
+ tok->tok_mode_stack_index--; /* pop this f-string's entry off the mode stack */
+ return MAKE_TOKEN(FSTRING_END);
+
+f_string_middle:
+
+ // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
+ // this.
+ tok->multi_line_start = tok->line_start;
+ while (end_quote_size != current_tok->f_string_quote_size) { /* scan literal text until a full closing-quote sequence has been read */
+ int c = tok_nextc(tok);
+ if (tok->done == E_ERROR) {
+ return MAKE_TOKEN(ERRORTOKEN);
+ }
+ int in_format_spec = ( /* true when an expression end has been recorded and a replacement field is still open */
+ current_tok->last_expr_end != -1
+ &&
+ INSIDE_FSTRING_EXPR(current_tok)
+ );
+
+ if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) { /* input ended, or a newline was hit inside a single-quoted f-string */
+ if (tok->decoding_erred) {
+ return MAKE_TOKEN(ERRORTOKEN);
+ }
+
+ // If we are in a format spec and we found a newline,
+ // it means that the format spec ends here and we should
+ // return to the regular mode.
+ if (in_format_spec && c == '\n') {
+ tok_backup(tok, c); /* the newline is left for regular mode to read */
+ TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(FSTRING_MIDDLE);
+ }
+
+ assert(tok->multi_line_start != NULL);
+ // shift the tok_state's location into
+ // the start of string, and report the error
+ // from the initial quote character
+ tok->cur = (char *)current_tok->f_string_start;
+ tok->cur++;
+ tok->line_start = current_tok->f_string_multi_line_start;
+ int start = tok->lineno; /* line where the problem was detected; used in the message */
+
+ tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); /* NOTE(review): tok_get() passes TOK_GET_MODE(tok) as current_tok, so this is presumably the same entry -- confirm for other callers */
+ tok->lineno = the_current_tok->f_string_line_start; /* report the error at the line recorded when the f-string started */
+
+ if (current_tok->f_string_quote_size == 3) {
+ syntaxerror(tok,
+ "unterminated triple-quoted f-string literal"
+ " (detected at line %d)", start);
+ if (c != '\n') {
+ tok->done = E_EOFS;
+ }
+ return MAKE_TOKEN(ERRORTOKEN);
+ }
+ else {
+ return MAKE_TOKEN(syntaxerror(tok,
+ "unterminated f-string literal (detected at"
+ " line %d)", start));
+ }
+ }
+
+ if (c == current_tok->f_string_quote) { /* count consecutive quote characters */
+ end_quote_size += 1;
+ continue;
+ } else {
+ end_quote_size = 0; /* any other character breaks the run */
+ }
+
+ if (c == '{') {
+ int peek = tok_nextc(tok);
+ if (peek != '{' || in_format_spec) { /* start of a replacement field: push both characters back and emit the text before it */
+ tok_backup(tok, peek);
+ tok_backup(tok, c);
+ current_tok->curly_bracket_expr_start_depth++;
+ if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
+ }
+ TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+ p_start = tok->start;
+ p_end = tok->cur;
+ } else { /* "{{" outside a format spec */
+ p_start = tok->start;
+ p_end = tok->cur - 1; /* the second '{' is consumed but left out of the token text */
+ }
+ return MAKE_TOKEN(FSTRING_MIDDLE);
+ } else if (c == '}') {
+ if (unicode_escape) { /* a "\N{" was seen earlier in this scan: emit the text up to and including this '}' */
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(FSTRING_MIDDLE);
+ }
+ int peek = tok_nextc(tok);
+
+ // The tokenizer can only be in the format spec if we have already completed the expression
+ // scanning (indicated by the end of the expression being set) and we are not at the top level
+ // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
+ // brackets, we can bypass it here.
+ if (peek == '}' && !in_format_spec) { /* "}}" outside a format spec */
+ p_start = tok->start;
+ p_end = tok->cur - 1; /* the second '}' is consumed but left out of the token text */
+ } else { /* lone '}': push it back and switch to regular mode */
+ tok_backup(tok, peek);
+ tok_backup(tok, c);
+ TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+ p_start = tok->start;
+ p_end = tok->cur;
+ }
+ return MAKE_TOKEN(FSTRING_MIDDLE);
+ } else if (c == '\\') {
+ int peek = tok_nextc(tok);
+ if (peek == '\r') { /* read past a '\r' that directly follows the backslash */
+ peek = tok_nextc(tok);
+ }
+ // Special case when the backslash is right before a curly
+ // brace. We have to restore and return the control back
+ // to the loop for the next iteration.
+ if (peek == '{' || peek == '}') {
+ if (!current_tok->f_string_raw) { /* non-raw only: a nonzero result from warn_invalid_escape_sequence() aborts with ERRORTOKEN */
+ if (warn_invalid_escape_sequence(tok, peek)) {
+ return MAKE_TOKEN(ERRORTOKEN);
+ }
+ }
+ tok_backup(tok, peek); /* the brace is re-read by the next loop iteration */
+ continue;
+ }
+
+ if (!current_tok->f_string_raw) {
+ if (peek == 'N') {
+ /* Handle named unicode escapes (\N{BULLET}) */
+ peek = tok_nextc(tok);
+ if (peek == '{') {
+ unicode_escape = 1;
+ } else {
+ tok_backup(tok, peek);
+ }
+ }
+ } /* else {
+ skip the escaped character
+ }*/
+ }
+ }
+
+ // Backup the f-string quotes to emit a final FSTRING_MIDDLE and
+ // add the quotes to the FSTRING_END in the next tokenizer iteration.
+ for (int i = 0; i < current_tok->f_string_quote_size; i++) {
+ tok_backup(tok, current_tok->f_string_quote);
+ }
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(FSTRING_MIDDLE);
+}
+
+
+static int
+tok_get(struct tok_state *tok, struct token *token)
+{ /* Fetch the next token by dispatching on the kind of the tokenizer's current mode entry; returns whatever the selected scanner returns. */
+ tokenizer_mode *current_tok = TOK_GET_MODE(tok); /* mode entry selected by tok->tok_mode_stack_index */
+ if (current_tok->kind == TOK_REGULAR_MODE) {
+ return tok_get_normal_mode(tok, current_tok, token);
+ } else { /* every other kind goes to the f-string scanner */
+ return tok_get_fstring_mode(tok, current_tok, token);
+ }
 }
int
-_PyTokenizer_Get(struct tok_state *tok,
- const char **p_start, const char **p_end)
+_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{
- int result = tok_get(tok, p_start, p_end);
+ int result = tok_get(tok, token);
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
@@ -2198,8 +3033,6 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
struct tok_state *tok;
FILE *fp;
- const char *p_start = NULL;
- const char *p_end = NULL;
char *encoding = NULL;
fp = fdopen_borrow(fd);
@@ -2212,8 +3045,7 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
return NULL;
}
if (filename != NULL) {
- Py_INCREF(filename);
- tok->filename = filename;
+ tok->filename = Py_NewRef(filename);
}
else {
tok->filename = PyUnicode_FromString("<string>");
@@ -2223,11 +3055,14 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
return encoding;
}
}
+ struct token token;
// We don't want to report warnings here because it could cause infinite recursion
// if fetching the encoding shows a warning.
tok->report_warnings = 0;
while (tok->lineno < 2 && tok->done == E_OK) {
- _PyTokenizer_Get(tok, &p_start, &p_end);
+ _PyToken_Init(&token);
+ _PyTokenizer_Get(tok, &token);
+ _PyToken_Free(&token);
}
fclose(fp);
if (tok->encoding) {