diff options
author | shadchin <shadchin@yandex-team.com> | 2024-02-12 07:53:52 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@ydb.tech> | 2024-02-14 14:26:16 +0000 |
commit | 31f2a419764a8ba77c2a970cfc80056c6cd06756 (patch) | |
tree | c1995d239eba8571cefc640f6648e1d5dd4ce9e2 /contrib/tools/python3/src/Parser/tokenizer.c | |
parent | fe2ef02b38d9c85d80060963b265a1df9f38c3bb (diff) | |
download | ydb-31f2a419764a8ba77c2a970cfc80056c6cd06756.tar.gz |
Update Python from 3.11.8 to 3.12.2
Diffstat (limited to 'contrib/tools/python3/src/Parser/tokenizer.c')
-rw-r--r-- | contrib/tools/python3/src/Parser/tokenizer.c | 1195 |
1 files changed, 1015 insertions, 180 deletions
diff --git a/contrib/tools/python3/src/Parser/tokenizer.c b/contrib/tools/python3/src/Parser/tokenizer.c index 4bc72ae444..27d49c6f89 100644 --- a/contrib/tools/python3/src/Parser/tokenizer.c +++ b/contrib/tools/python3/src/Parser/tokenizer.c @@ -11,11 +11,6 @@ #include "tokenizer.h" #include "errcode.h" -#include "unicodeobject.h" -#include "bytesobject.h" -#include "fileobject.h" -#include "abstract.h" - /* Alternate tab spacing */ #define ALTTABSIZE 1 @@ -36,6 +31,31 @@ /* Don't ever change this -- it would break the portability of Python code */ #define TABSIZE 8 +#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) +#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ + type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) +#define ADVANCE_LINENO() \ + tok->lineno++; \ + tok->col_offset = 0; + +#define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0) +#define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0) +#ifdef Py_DEBUG +static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { + assert(tok->tok_mode_stack_index >= 0); + assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL); + return &(tok->tok_mode_stack[tok->tok_mode_stack_index]); +} +static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { + assert(tok->tok_mode_stack_index >= 0); + assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL); + return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); +} +#else +#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index])) +#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index])) +#endif + /* Forward */ static struct tok_state *tok_new(void); static int tok_nextc(struct tok_state *tok); @@ -71,6 +91,8 @@ tok_new(void) tok->pendin = 0; tok->prompt = tok->nextprompt = NULL; tok->lineno = 0; + tok->starting_col_offset = -1; + tok->col_offset = -1; tok->level = 0; tok->altindstack[0] = 0; tok->decoding_state = STATE_INIT; @@ -81,6 +103,7 @@ tok_new(void) tok->filename = NULL; tok->decoding_readline = NULL; tok->decoding_buffer = NULL; + tok->readline = NULL; tok->type_comments = 0; tok->async_hacks = 0; tok->async_def = 0; @@ -89,6 +112,14 @@ tok_new(void) tok->interactive_underflow = IUNDERFLOW_NORMAL; tok->str = NULL; tok->report_warnings = 1; + tok->tok_extra_tokens = 0; + tok->comment_newline = 0; + tok->implicit_newline = 0; + tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0}; + tok->tok_mode_stack_index = 0; +#ifdef Py_DEBUG + tok->debug = _Py_GetConfig()->parser_debug; +#endif return tok; } @@ -109,8 +140,9 @@ static char * error_ret(struct tok_state *tok) /* XXX */ { tok->decoding_erred = 1; - if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */ + if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */ PyMem_Free(tok->buf); + } tok->buf = tok->cur = tok->inp = NULL; tok->start = NULL; tok->end = NULL; @@ -323,16 +355,182 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { return -1; } strcpy(new_str + current_size, line); + tok->implicit_newline = 0; if (last_char != '\n') { /* Last line does not end in \n, fake one */ new_str[current_size + line_size - 1] = '\n'; new_str[current_size + line_size] = '\0'; + tok->implicit_newline = 1; } tok->interactive_src_start = new_str; tok->interactive_src_end = new_str + current_size + line_size; return 0; } +/* Traverse and remember all f-string buffers, in order to be able to restore + them after reallocating tok->buf */ +static void +remember_fstring_buffers(struct tok_state *tok) +{ + int index; + tokenizer_mode *mode; + + for (index = tok->tok_mode_stack_index; index >= 0; --index) { + mode = &(tok->tok_mode_stack[index]); + mode->f_string_start_offset = mode->f_string_start - tok->buf; + mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf; + } +} + +/* Traverse and restore all f-string buffers after reallocating tok->buf */ +static void +restore_fstring_buffers(struct tok_state *tok) +{ + int index; + tokenizer_mode *mode; + + for (index = tok->tok_mode_stack_index; index >= 0; --index) { + mode = &(tok->tok_mode_stack[index]); + mode->f_string_start = tok->buf + mode->f_string_start_offset; + mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset; + } +} + +static int +set_fstring_expr(struct tok_state* tok, struct token *token, char c) { + assert(token != NULL); + assert(c == '}' || c == ':' || c == '!'); + tokenizer_mode *tok_mode = TOK_GET_MODE(tok); + + if (!tok_mode->f_string_debug || token->metadata) { + return 0; + } + + PyObject *res = NULL; + + // Check if there is a # character in the expression + int hash_detected = 0; + for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { + if (tok_mode->last_expr_buffer[i] == '#') { + hash_detected = 1; + break; + } + } + + if (hash_detected) { + Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end; + char *result = (char *)PyObject_Malloc((input_length + 1) * sizeof(char)); + if (!result) { + return -1; + } + + Py_ssize_t i = 0; + Py_ssize_t j = 0; + + for (i = 0, j = 0; i < input_length; i++) { + if (tok_mode->last_expr_buffer[i] == '#') { + // Skip characters until newline or end of string + while (tok_mode->last_expr_buffer[i] != '\0' && i < input_length) { + if (tok_mode->last_expr_buffer[i] == '\n') { + result[j++] = tok_mode->last_expr_buffer[i]; + break; + } + i++; + } + } else { + result[j++] = tok_mode->last_expr_buffer[i]; + } + } + + result[j] = '\0'; // Null-terminate the result string + res = PyUnicode_DecodeUTF8(result, j, NULL); + PyObject_Free(result); + } else { + res = PyUnicode_DecodeUTF8( + tok_mode->last_expr_buffer, + tok_mode->last_expr_size - tok_mode->last_expr_end, + NULL + ); + + } + + + if (!res) { + return -1; + } + token->metadata = res; + return 0; +} + +static int +update_fstring_expr(struct tok_state *tok, char cur) +{ + assert(tok->cur != NULL); + + Py_ssize_t size = strlen(tok->cur); + tokenizer_mode *tok_mode = TOK_GET_MODE(tok); + + switch (cur) { + case 0: + if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { + return 1; + } + char *new_buffer = PyMem_Realloc( + tok_mode->last_expr_buffer, + tok_mode->last_expr_size + size + ); + if (new_buffer == NULL) { + PyMem_Free(tok_mode->last_expr_buffer); + goto error; + } + tok_mode->last_expr_buffer = new_buffer; + strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); + tok_mode->last_expr_size += size; + break; + case '{': + if (tok_mode->last_expr_buffer != NULL) { + PyMem_Free(tok_mode->last_expr_buffer); + } + tok_mode->last_expr_buffer = PyMem_Malloc(size); + if (tok_mode->last_expr_buffer == NULL) { + goto error; + } + tok_mode->last_expr_size = size; + tok_mode->last_expr_end = -1; + strncpy(tok_mode->last_expr_buffer, tok->cur, size); + break; + case '}': + case '!': + case ':': + if (tok_mode->last_expr_end == -1) { + tok_mode->last_expr_end = strlen(tok->start); + } + break; + default: + Py_UNREACHABLE(); + } + return 1; +error: + tok->done = E_NOMEM; + return 0; +} + +static void +free_fstring_expressions(struct tok_state *tok) +{ + int index; + tokenizer_mode *mode; + + for (index = tok->tok_mode_stack_index; index >= 0; --index) { + mode = &(tok->tok_mode_stack[index]); + if (mode->last_expr_buffer != NULL) { + PyMem_Free(mode->last_expr_buffer); + mode->last_expr_buffer = NULL; + mode->last_expr_size = 0; + mode->last_expr_end = -1; + } + } +} /* Read a line of text from TOK into S, using the stream in TOK. Return NULL on failure, else S. @@ -360,6 +558,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf; Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf; Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf; + remember_fstring_buffers(tok); newbuf = (char *)PyMem_Realloc(newbuf, newsize); if (newbuf == NULL) { tok->done = E_NOMEM; @@ -372,6 +571,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) tok->start = start < 0 ? NULL : tok->buf + start; tok->line_start = line_start < 0 ? NULL : tok->buf + line_start; tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start; + restore_fstring_buffers(tok); } return 1; } @@ -436,7 +636,7 @@ error: static int fp_setreadl(struct tok_state *tok, const char* enc) { - PyObject *readline, *io, *stream; + PyObject *readline, *open, *stream; int fd; long pos; @@ -453,13 +653,13 @@ fp_setreadl(struct tok_state *tok, const char* enc) return 0; } - io = PyImport_ImportModule("io"); - if (io == NULL) { + open = _PyImport_GetModuleAttrString("io", "open"); + if (open == NULL) { return 0; } - stream = _PyObject_CallMethod(io, &_Py_ID(open), "isisOOO", + stream = PyObject_CallFunction(open, "isisOOO", fd, "r", -1, enc, Py_None, Py_None, Py_False); - Py_DECREF(io); + Py_DECREF(open); if (stream == NULL) { return 0; } @@ -620,7 +820,8 @@ translate_into_utf8(const char* str, const char* enc) { static char * -translate_newlines(const char *s, int exec_input, struct tok_state *tok) { +translate_newlines(const char *s, int exec_input, int preserve_crlf, + struct tok_state *tok) { int skip_next_lf = 0; #if defined(__has_feature) # if __has_feature(memory_sanitizer) @@ -645,7 +846,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) { break; } } - if (c == '\r') { + if (!preserve_crlf && c == '\r') { skip_next_lf = 1; c = '\n'; } @@ -653,7 +854,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) { } /* If this is exec input, add a newline to the end of the string if there isn't one already. */ - if (exec_input && c != '\n') { + if (exec_input && c != '\n' && c != '\0') { *current = '\n'; current++; } @@ -675,14 +876,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) { inside TOK. */ static char * -decode_str(const char *input, int single, struct tok_state *tok) +decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf) { PyObject* utf8 = NULL; char *str; const char *s; const char *newl[2] = {NULL, NULL}; int lineno = 0; - tok->input = str = translate_newlines(input, single, tok); + tok->input = str = translate_newlines(input, single, preserve_crlf, tok); if (str == NULL) return NULL; tok->enc = NULL; @@ -734,14 +935,14 @@ decode_str(const char *input, int single, struct tok_state *tok) /* Set up tokenizer for string */ struct tok_state * -_PyTokenizer_FromString(const char *str, int exec_input) +_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf) { struct tok_state *tok = tok_new(); char *decoded; if (tok == NULL) return NULL; - decoded = decode_str(str, exec_input, tok); + decoded = decode_str(str, exec_input, tok, preserve_crlf); if (decoded == NULL) { _PyTokenizer_Free(tok); return NULL; @@ -752,16 +953,43 @@ _PyTokenizer_FromString(const char *str, int exec_input) return tok; } +struct tok_state * +_PyTokenizer_FromReadline(PyObject* readline, const char* enc, + int exec_input, int preserve_crlf) +{ + struct tok_state *tok = tok_new(); + if (tok == NULL) + return NULL; + if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { + _PyTokenizer_Free(tok); + return NULL; + } + tok->cur = tok->inp = tok->buf; + tok->end = tok->buf + BUFSIZ; + tok->fp = NULL; + if (enc != NULL) { + tok->encoding = new_string(enc, strlen(enc), tok); + if (!tok->encoding) { + _PyTokenizer_Free(tok); + return NULL; + } + } + tok->decoding_state = STATE_NORMAL; + Py_INCREF(readline); + tok->readline = readline; + return tok; +} + /* Set up tokenizer for UTF-8 string */ struct tok_state * -_PyTokenizer_FromUTF8(const char *str, int exec_input) +_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf) { struct tok_state *tok = tok_new(); char *translated; if (tok == NULL) return NULL; - tok->input = translated = translate_newlines(str, exec_input, tok); + tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok); if (translated == NULL) { _PyTokenizer_Free(tok); return NULL; @@ -821,8 +1049,9 @@ _PyTokenizer_Free(struct tok_state *tok) } Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_buffer); + Py_XDECREF(tok->readline); Py_XDECREF(tok->filename); - if (tok->fp != NULL && tok->buf != NULL) { + if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) { PyMem_Free(tok->buf); } if (tok->input) { @@ -831,9 +1060,20 @@ _PyTokenizer_Free(struct tok_state *tok) if (tok->interactive_src_start != NULL) { PyMem_Free(tok->interactive_src_start); } + free_fstring_expressions(tok); PyMem_Free(tok); } +void +_PyToken_Free(struct token *token) { + Py_XDECREF(token->metadata); +} + +void +_PyToken_Init(struct token *token) { + token->metadata = NULL; +} + static int tok_readline_raw(struct tok_state *tok) { @@ -860,6 +1100,67 @@ tok_readline_raw(struct tok_state *tok) } static int +tok_readline_string(struct tok_state* tok) { + PyObject* line = NULL; + PyObject* raw_line = PyObject_CallNoArgs(tok->readline); + if (raw_line == NULL) { + if (PyErr_ExceptionMatches(PyExc_StopIteration)) { + PyErr_Clear(); + return 1; + } + error_ret(tok); + goto error; + } + if(tok->encoding != NULL) { + if (!PyBytes_Check(raw_line)) { + PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object"); + error_ret(tok); + goto error; + } + line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line), + tok->encoding, "replace"); + Py_CLEAR(raw_line); + if (line == NULL) { + error_ret(tok); + goto error; + } + } else { + if(!PyUnicode_Check(raw_line)) { + PyErr_Format(PyExc_TypeError, "readline() returned a non-string object"); + error_ret(tok); + goto error; + } + line = raw_line; + raw_line = NULL; + } + Py_ssize_t buflen; + const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen); + if (buf == NULL) { + error_ret(tok); + goto error; + } + + // Make room for the null terminator *and* potentially + // an extra newline character that we may need to artificially + // add. + size_t buffer_size = buflen + 2; + if (!tok_reserve_buf(tok, buffer_size)) { + goto error; + } + memcpy(tok->inp, buf, buflen); + tok->inp += buflen; + *tok->inp = '\0'; + + tok->line_start = tok->cur; + Py_DECREF(line); + return 1; +error: + Py_XDECREF(raw_line); + Py_XDECREF(line); + return 0; +} + +static int tok_underflow_string(struct tok_state *tok) { char *end = strchr(tok->inp, '\n'); if (end != NULL) { @@ -876,7 +1177,7 @@ tok_underflow_string(struct tok_state *tok) { tok->buf = tok->cur; } tok->line_start = tok->cur; - tok->lineno++; + ADVANCE_LINENO(); tok->inp = end; return 1; } @@ -889,7 +1190,7 @@ tok_underflow_interactive(struct tok_state *tok) { } char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt); if (newtok != NULL) { - char *translated = translate_newlines(newtok, 0, tok); + char *translated = translate_newlines(newtok, 0, 0, tok); PyMem_Free(newtok); if (translated == NULL) { return 0; @@ -934,8 +1235,9 @@ tok_underflow_interactive(struct tok_state *tok) { } else if (tok->start != NULL) { Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; + remember_fstring_buffers(tok); size_t size = strlen(newtok); - tok->lineno++; + ADVANCE_LINENO(); if (!tok_reserve_buf(tok, size + 1)) { PyMem_Free(tok->buf); tok->buf = NULL; @@ -946,15 +1248,18 @@ tok_underflow_interactive(struct tok_state *tok) { PyMem_Free(newtok); tok->inp += size; tok->multi_line_start = tok->buf + cur_multi_line_start; + restore_fstring_buffers(tok); } else { - tok->lineno++; + remember_fstring_buffers(tok); + ADVANCE_LINENO(); PyMem_Free(tok->buf); tok->buf = newtok; tok->cur = tok->buf; tok->line_start = tok->buf; tok->inp = strchr(tok->buf, '\0'); tok->end = tok->inp + 1; + restore_fstring_buffers(tok); } if (tok->done != E_OK) { if (tok->prompt != NULL) { @@ -962,12 +1267,16 @@ tok_underflow_interactive(struct tok_state *tok) { } return 0; } + + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { + return 0; + } return 1; } static int tok_underflow_file(struct tok_state *tok) { - if (tok->start == NULL) { + if (tok->start == NULL && !INSIDE_FSTRING(tok)) { tok->cur = tok->inp = tok->buf; } if (tok->decoding_state == STATE_INIT) { @@ -997,14 +1306,20 @@ tok_underflow_file(struct tok_state *tok) { tok->done = E_EOF; return 0; } + tok->implicit_newline = 0; if (tok->inp[-1] != '\n') { assert(tok->inp + 1 < tok->end); /* Last line does not end in \n, fake one */ *tok->inp++ = '\n'; *tok->inp = '\0'; + tok->implicit_newline = 1; + } + + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { + return 0; } - tok->lineno++; + ADVANCE_LINENO(); if (tok->decoding_state != STATE_NORMAL) { if (tok->lineno > 2) { tok->decoding_state = STATE_NORMAL; @@ -1025,6 +1340,44 @@ tok_underflow_file(struct tok_state *tok) { return tok->done == E_OK; } +static int +tok_underflow_readline(struct tok_state* tok) { + assert(tok->decoding_state == STATE_NORMAL); + assert(tok->fp == NULL && tok->input == NULL && tok->decoding_readline == NULL); + if (tok->start == NULL && !INSIDE_FSTRING(tok)) { + tok->cur = tok->inp = tok->buf; + } + if (!tok_readline_string(tok)) { + return 0; + } + if (tok->inp == tok->cur) { + tok->done = E_EOF; + return 0; + } + tok->implicit_newline = 0; + if (tok->inp[-1] != '\n') { + assert(tok->inp + 1 < tok->end); + /* Last line does not end in \n, fake one */ + *tok->inp++ = '\n'; + *tok->inp = '\0'; + tok->implicit_newline = 1; + } + + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { + return 0; + } + + ADVANCE_LINENO(); + /* The default encoding is UTF-8, so make sure we don't have any + non-UTF-8 sequences in it. */ + if (!tok->encoding && !ensure_utf8(tok->cur, tok)) { + error_ret(tok); + return 0; + } + assert(tok->done == E_OK); + return tok->done == E_OK; +} + #if defined(Py_DEBUG) static void print_escape(FILE *f, const char *s, Py_ssize_t size) @@ -1062,16 +1415,20 @@ tok_nextc(struct tok_state *tok) int rc; for (;;) { if (tok->cur != tok->inp) { - if (tok->cur - tok->buf >= INT_MAX) { + if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) { tok->done = E_COLUMNOVERFLOW; return EOF; } + tok->col_offset++; return Py_CHARMASK(*tok->cur++); /* Fast path */ } if (tok->done != E_OK) { - return EOF; + return EOF; + } + if (tok->readline) { + rc = tok_underflow_readline(tok); } - if (tok->fp == NULL) { + else if (tok->fp == NULL) { rc = tok_underflow_string(tok); } else if (tok->prompt != NULL) { @@ -1081,7 +1438,7 @@ tok_nextc(struct tok_state *tok) rc = tok_underflow_file(tok); } #if defined(Py_DEBUG) - if (Py_DebugFlag) { + if (tok->debug) { fprintf(stderr, "line[%d] = ", tok->lineno); print_escape(stderr, tok->cur, tok->inp - tok->cur); fprintf(stderr, " tok->done = %d\n", tok->done); @@ -1111,9 +1468,10 @@ tok_backup(struct tok_state *tok, int c) if (--tok->cur < tok->buf) { Py_FatalError("tokenizer beginning of buffer"); } - if ((int)(unsigned char)*tok->cur != c) { + if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) { Py_FatalError("tok_backup: wrong character"); } + tok->col_offset--; } } @@ -1122,6 +1480,12 @@ _syntaxerror_range(struct tok_state *tok, const char *format, int col_offset, int end_col_offset, va_list vargs) { + // In release builds, we don't want to overwrite a previous error, but in debug builds we + // want to fail if we are not doing it so we can fix it. + assert(tok->done != E_ERROR); + if (tok->done == E_ERROR) { + return ERRORTOKEN; + } PyObject *errmsg, *errtext, *args; errmsg = PyUnicode_FromFormatV(format, vargs); if (!errmsg) { @@ -1167,12 +1531,9 @@ error: static int syntaxerror(struct tok_state *tok, const char *format, ...) { + // This errors are cleaned on startup. Todo: Fix it. va_list vargs; -#ifdef HAVE_STDARG_PROTOTYPES va_start(vargs, format); -#else - va_start(vargs); -#endif int ret = _syntaxerror_range(tok, format, -1, -1, vargs); va_end(vargs); return ret; @@ -1184,18 +1545,12 @@ syntaxerror_known_range(struct tok_state *tok, const char *format, ...) { va_list vargs; -#ifdef HAVE_STDARG_PROTOTYPES va_start(vargs, format); -#else - va_start(vargs); -#endif int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs); va_end(vargs); return ret; } - - static int indenterror(struct tok_state *tok) { @@ -1213,11 +1568,7 @@ parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) PyObject *errmsg; va_list vargs; -#ifdef HAVE_STDARG_PROTOTYPES va_start(vargs, format); -#else - va_start(vargs); -#endif errmsg = PyUnicode_FromFormatV(format, vargs); va_end(vargs); if (!errmsg) { @@ -1244,6 +1595,40 @@ error: } static int +warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char) +{ + if (!tok->report_warnings) { + return 0; + } + + PyObject *msg = PyUnicode_FromFormat( + "invalid escape sequence '\\%c'", + (char) first_invalid_escape_char + ); + + if (msg == NULL) { + return -1; + } + + if (PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename, + tok->lineno, NULL, NULL) < 0) { + Py_DECREF(msg); + + if (PyErr_ExceptionMatches(PyExc_SyntaxWarning)) { + /* Replace the SyntaxWarning exception with a SyntaxError + to get a more accurate error report */ + PyErr_Clear(); + return syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char); + } + + return -1; + } + + Py_DECREF(msg); + return 0; +} + +static int lookahead(struct tok_state *tok, const char *test) { const char *s = test; @@ -1267,8 +1652,12 @@ lookahead(struct tok_state *tok, const char *test) } static int -verify_end_of_number(struct tok_state *tok, int c, const char *kind) -{ +verify_end_of_number(struct tok_state *tok, int c, const char *kind) { + if (tok->tok_extra_tokens) { + // When we are parsing extra tokens, we don't want to emit warnings + // about invalid literals, because we want to be a bit more liberal. + return 1; + } /* Emit a deprecation warning only if the numeric literal is immediately * followed by one of keywords which can occur after a numeric literal * in valid code: "and", "else", "for", "if", "in", "is" and "or". @@ -1326,6 +1715,9 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind) static int verify_identifier(struct tok_state *tok) { + if (tok->tok_extra_tokens) { + return 1; + } PyObject *s; if (tok->decoding_erred) return 0; @@ -1361,14 +1753,11 @@ verify_identifier(struct tok_state *tok) tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); } Py_DECREF(s); - // PyUnicode_FromFormatV() does not support %X - char hex[9]; - (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch); if (Py_UNICODE_ISPRINTABLE(ch)) { - syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch); } else { - syntaxerror(tok, "invalid non-printable character U+%s", hex); + syntaxerror(tok, "invalid non-printable character U+%04X", ch); } return 0; } @@ -1398,11 +1787,13 @@ tok_decimal_tail(struct tok_state *tok) return c; } -/* Get next token, after space stripping etc. */ static inline int tok_continuation_line(struct tok_state *tok) { int c = tok_nextc(tok); + if (c == '\r') { + c = tok_nextc(tok); + } if (c != '\n') { tok->done = E_LINECONT; return -1; @@ -1419,16 +1810,56 @@ tok_continuation_line(struct tok_state *tok) { } static int -tok_get(struct tok_state *tok, const char **p_start, const char **p_end) +type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset, + int end_col_offset, const char *start, const char *end) +{ + token->level = tok->level; + token->lineno = token->end_lineno = tok->lineno; + token->col_offset = col_offset; + token->end_col_offset = end_col_offset; + token->start = start; + token->end = end; + return type; +} + +static int +token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end) +{ + assert((start == NULL && end == NULL) || (start != NULL && end != NULL)); + token->level = tok->level; + if (ISSTRINGLIT(type)) { + token->lineno = tok->first_lineno; + } + else { + token->lineno = tok->lineno; + } + token->end_lineno = tok->lineno; + token->col_offset = token->end_col_offset = -1; + token->start = start; + token->end = end; + + if (start != NULL && end != NULL) { + token->col_offset = tok->starting_col_offset; + token->end_col_offset = tok->col_offset; + } + return type; +} + + +static int +tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) { int c; int blankline, nonascii; - *p_start = *p_end = NULL; + const char *p_start = NULL; + const char *p_end = NULL; nextline: tok->start = NULL; + tok->starting_col_offset = -1; blankline = 0; + /* Get indentation level */ if (tok->atbol) { int col = 0; @@ -1454,7 +1885,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) // the level of indentation of whatever comes next. cont_line_col = cont_line_col ? cont_line_col : col; if ((c = tok_continuation_line(tok)) == -1) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else { @@ -1462,7 +1893,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } } tok_backup(tok, c); - if (c == '#' || c == '\n') { + if (c == '#' || c == '\n' || c == '\r') { /* Lines with only whitespace and/or comments shouldn't affect the indentation and are not passed to the parser as NEWLINE tokens, @@ -1489,7 +1920,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + return MAKE_TOKEN(indenterror(tok)); } } else if (col > tok->indstack[tok->indent]) { @@ -1497,10 +1928,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (tok->indent+1 >= MAXINDENT) { tok->done = E_TOODEEP; tok->cur = tok->inp; - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } if (altcol <= tok->altindstack[tok->indent]) { - return indenterror(tok); + return MAKE_TOKEN(indenterror(tok)); } tok->pendin++; tok->indstack[++tok->indent] = col; @@ -1516,26 +1947,35 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (col != tok->indstack[tok->indent]) { tok->done = E_DEDENT; tok->cur = tok->inp; - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + return MAKE_TOKEN(indenterror(tok)); } } } } tok->start = tok->cur; + tok->starting_col_offset = tok->col_offset; /* Return pending indents/dedents */ - if (tok->pendin != 0) { + if (tok->pendin != 0) { if (tok->pendin < 0) { + if (tok->tok_extra_tokens) { + p_start = tok->cur; + p_end = tok->cur; + } tok->pendin++; - return DEDENT; + return MAKE_TOKEN(DEDENT); } else { + if (tok->tok_extra_tokens) { + p_start = tok->buf; + p_end = tok->cur; + } tok->pendin--; - return INDENT; + return MAKE_TOKEN(INDENT); } } @@ -1573,25 +2013,36 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Set start of current token */ tok->start = tok->cur == NULL ? NULL : tok->cur - 1; + tok->starting_col_offset = tok->col_offset - 1; /* Skip comment, unless it's a type comment */ if (c == '#') { - const char *prefix, *p, *type_start; - while (c != EOF && c != '\n') { + const char* p = NULL; + const char *prefix, *type_start; + int current_starting_col_offset; + + while (c != EOF && c != '\n' && c != '\r') { c = tok_nextc(tok); } + if (tok->tok_extra_tokens) { + p = tok->start; + } + if (tok->type_comments) { p = tok->start; + current_starting_col_offset = tok->starting_col_offset; prefix = type_comment_prefix; while (*prefix && p < tok->cur) { if (*prefix == ' ') { while (*p == ' ' || *p == '\t') { p++; + current_starting_col_offset++; } } else if (*prefix == *p) { p++; + current_starting_col_offset++; } else { break; } @@ -1602,7 +2053,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* This is a type comment if we matched all of type_comment_prefix. */ if (!*prefix) { int is_type_ignore = 1; + // +6 in order to skip the word 'ignore' const char *ignore_end = p + 6; + const int ignore_end_col_offset = current_starting_col_offset + 6; tok_backup(tok, c); /* don't eat the newline or EOF */ type_start = p; @@ -1615,34 +2068,41 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); if (is_type_ignore) { - *p_start = ignore_end; - *p_end = tok->cur; + p_start = ignore_end; + p_end = tok->cur; /* If this type ignore is the only thing on the line, consume the newline also. */ if (blankline) { tok_nextc(tok); tok->atbol = 1; } - return TYPE_IGNORE; + return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); } else { - *p_start = type_start; /* after type_comment_prefix */ - *p_end = tok->cur; - return TYPE_COMMENT; + p_start = type_start; + p_end = tok->cur; + return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); } } } + if (tok->tok_extra_tokens) { + tok_backup(tok, c); /* don't eat the newline or EOF */ + p_start = p; + p_end = tok->cur; + tok->comment_newline = blankline; + return MAKE_TOKEN(COMMENT); + } } if (tok->done == E_INTERACT_STOP) { - return ENDMARKER; + return MAKE_TOKEN(ENDMARKER); } /* Check for EOF and errors now */ if (c == EOF) { if (tok->level) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } - return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; + return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); } /* Identifier (most frequent token!) */ @@ -1671,6 +2131,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } c = tok_nextc(tok); if (c == '"' || c == '\'') { + if (saw_f) { + goto f_string_quote; + } goto letter_quote; } } @@ -1682,11 +2145,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } tok_backup(tok, c); if (nonascii && !verify_identifier(tok)) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } - *p_start = tok->start; - *p_end = tok->cur; + p_start = tok->start; + p_end = tok->cur; /* async/await parsing block. */ if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { @@ -1701,10 +2164,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (!tok->async_hacks || tok->async_def) { /* Always recognize the keywords. */ if (memcmp(tok->start, "async", 5) == 0) { - return ASYNC; + return MAKE_TOKEN(ASYNC); } if (memcmp(tok->start, "await", 5) == 0) { - return AWAIT; + return MAKE_TOKEN(AWAIT); } } else if (memcmp(tok->start, "async", 5) == 0) { @@ -1712,13 +2175,14 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) Look ahead one token to see if that is 'def'. */ struct tok_state ahead_tok; - const char *ahead_tok_start = NULL; - const char *ahead_tok_end = NULL; + struct token ahead_token; + _PyToken_Init(&ahead_token); int ahead_tok_kind; memcpy(&ahead_tok, tok, sizeof(ahead_tok)); - ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, - &ahead_tok_end); + ahead_tok_kind = tok_get_normal_mode(&ahead_tok, + current_tok, + &ahead_token); if (ahead_tok_kind == NAME && ahead_tok.cur - ahead_tok.start == 3 @@ -1728,29 +2192,49 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) returning a plain NAME token, return ASYNC. */ tok->async_def_indent = tok->indent; tok->async_def = 1; - return ASYNC; + _PyToken_Free(&ahead_token); + return MAKE_TOKEN(ASYNC); } + _PyToken_Free(&ahead_token); } } - return NAME; + return MAKE_TOKEN(NAME); + } + + if (c == '\r') { + c = tok_nextc(tok); } /* Newline */ if (c == '\n') { tok->atbol = 1; if (blankline || tok->level > 0) { + if (tok->tok_extra_tokens) { + if (tok->comment_newline) { + tok->comment_newline = 0; + } + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NL); + } goto nextline; } - *p_start = tok->start; - *p_end = tok->cur - 1; /* Leave '\n' out of the string */ + if (tok->comment_newline && tok->tok_extra_tokens) { + tok->comment_newline = 0; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NL); + } + p_start = tok->start; + p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; if (tok->async_def) { /* We're somewhere inside an 'async def' function, and we've encountered a NEWLINE after its signature. */ tok->async_def_nl = 1; } - return NEWLINE; + return MAKE_TOKEN(NEWLINE); } /* Period or number starting with period? */ @@ -1761,9 +2245,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } else if (c == '.') { c = tok_nextc(tok); if (c == '.') { - *p_start = tok->start; - *p_end = tok->cur; - return ELLIPSIS; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(ELLIPSIS); } else { tok_backup(tok, c); @@ -1773,9 +2257,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) else { tok_backup(tok, c); } - *p_start = tok->start; - *p_end = tok->cur; - return DOT; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(DOT); } /* Number */ @@ -1792,14 +2276,14 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } if (!isxdigit(c)) { tok_backup(tok, c); - return syntaxerror(tok, "invalid hexadecimal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal")); } do { c = tok_nextc(tok); } while (isxdigit(c)); } while (c == '_'); if (!verify_end_of_number(tok, c, "hexadecimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'o' || c == 'O') { @@ -1811,12 +2295,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } if (c < '0' || c >= '8') { if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); + return MAKE_TOKEN(syntaxerror(tok, + "invalid digit '%c' in octal literal", c)); } else { tok_backup(tok, c); - return syntaxerror(tok, "invalid octal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal")); } } do { @@ -1824,11 +2308,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } while ('0' <= c && c < '8'); } while (c == '_'); if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); + return MAKE_TOKEN(syntaxerror(tok, + "invalid digit '%c' in octal literal", c)); } if (!verify_end_of_number(tok, c, "octal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'b' || c == 'B') { @@ -1840,12 +2324,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } if (c != '0' && c != '1') { if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); + return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } else { tok_backup(tok, c); - return syntaxerror(tok, "invalid binary literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal")); } } do { @@ -1853,11 +2336,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } while (c == '0' || c == '1'); } while (c == '_'); if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); + return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } if (!verify_end_of_number(tok, c, "binary")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else { @@ -1869,7 +2351,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) c = tok_nextc(tok); if (!isdigit(c)) { tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } if (c != '0') { @@ -1882,7 +2364,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) nonzero = 1; c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } if (c == '.') { @@ -1895,18 +2377,18 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) else if (c == 'j' || c == 'J') { goto imaginary; } - else if (nonzero) { + else if (nonzero && !tok->tok_extra_tokens) { /* Old-style octal: now disallowed. */ tok_backup(tok, c); - return syntaxerror_known_range( + return MAKE_TOKEN(syntaxerror_known_range( tok, (int)(tok->start + 1 - tok->line_start), (int)(zeros_end - tok->line_start), "leading zeros in decimal integer " "literals are not permitted; " - "use an 0o prefix for octal integers"); + "use an 0o prefix for octal integers")); } if (!verify_end_of_number(tok, c, "decimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } } @@ -1914,7 +2396,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Decimal */ c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } { /* Accept floating point numbers. */ @@ -1925,7 +2407,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (isdigit(c)) { c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } } @@ -1939,21 +2421,21 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) c = tok_nextc(tok); if (!isdigit(c)) { tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } else if (!isdigit(c)) { tok_backup(tok, c); if (!verify_end_of_number(tok, e, "decimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } tok_backup(tok, e); - *p_start = tok->start; - *p_end = tok->cur; - return NUMBER; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NUMBER); } c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } if (c == 'j' || c == 'J') { @@ -1961,18 +2443,85 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) imaginary: c = tok_nextc(tok); if (!verify_end_of_number(tok, c, "imaginary")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else if (!verify_end_of_number(tok, c, "decimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } } tok_backup(tok, c); - *p_start = tok->start; - *p_end = tok->cur; - return NUMBER; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NUMBER); + } + + f_string_quote: + if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) { + int quote = c; + int quote_size = 1; /* 1 or 3 */ + + /* Nodes of type STRING, especially multi line strings + must be handled differently in order to get both + the starting line number and the column offset right. + (cf. issue 16806) */ + tok->first_lineno = tok->lineno; + tok->multi_line_start = tok->line_start; + + /* Find the quote size and start of string */ + int after_quote = tok_nextc(tok); + if (after_quote == quote) { + int after_after_quote = tok_nextc(tok); + if (after_after_quote == quote) { + quote_size = 3; + } + else { + // TODO: Check this + tok_backup(tok, after_after_quote); + tok_backup(tok, after_quote); + } + } + if (after_quote != quote) { + tok_backup(tok, after_quote); + } + + + p_start = tok->start; + p_end = tok->cur; + if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) { + return MAKE_TOKEN(syntaxerror(tok, "too many nested f-strings")); + } + tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); + the_current_tok->kind = TOK_FSTRING_MODE; + the_current_tok->f_string_quote = quote; + the_current_tok->f_string_quote_size = quote_size; + the_current_tok->f_string_start = tok->start; + the_current_tok->f_string_multi_line_start = tok->line_start; + the_current_tok->f_string_line_start = tok->lineno; + the_current_tok->f_string_start_offset = -1; + the_current_tok->f_string_multi_line_start_offset = -1; + the_current_tok->last_expr_buffer = NULL; + the_current_tok->last_expr_size = 0; + the_current_tok->last_expr_end = -1; + the_current_tok->f_string_debug = 0; + + switch (*tok->start) { + case 'F': + case 'f': + the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r'; + break; + case 'R': + case 'r': + the_current_tok->f_string_raw = 1; + break; + default: + Py_UNREACHABLE(); + } + + the_current_tok->curly_bracket_depth = 0; + the_current_tok->curly_bracket_expr_start_depth = -1; + return MAKE_TOKEN(FSTRING_START); } letter_quote: @@ -2008,7 +2557,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) while (end_quote_size != quote_size) { c = tok_nextc(tok); if (tok->done == E_ERROR) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } if (tok->done == E_DECODE) { break; @@ -2023,13 +2572,27 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) tok->line_start = tok->multi_line_start; int start = tok->lineno; tok->lineno = tok->first_lineno; + + if (INSIDE_FSTRING(tok)) { + /* When we are in an f-string, before raising the + * unterminated string literal error, check whether + * does the initial quote matches with f-strings quotes + * and if it is, then this must be a missing '}' token + * so raise the proper error */ + tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); + if (the_current_tok->f_string_quote == quote && + the_current_tok->f_string_quote_size == quote_size) { + return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start)); + } + } + if (quote_size == 3) { syntaxerror(tok, "unterminated triple-quoted string literal" " (detected at line %d)", start); if (c != '\n') { tok->done = E_EOFS; } - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } else { syntaxerror(tok, "unterminated string literal (detected at" @@ -2037,7 +2600,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (c != '\n') { tok->done = E_EOLS; } - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } if (c == quote) { @@ -2046,41 +2609,66 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) else { end_quote_size = 0; if (c == '\\') { - tok_nextc(tok); /* skip escaped char */ + c = tok_nextc(tok); /* skip escaped char */ + if (c == '\r') { + c = tok_nextc(tok); + } } } } - *p_start = tok->start; - *p_end = tok->cur; - return STRING; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(STRING); } /* Line continuation */ if (c == '\\') { if ((c = tok_continuation_line(tok)) == -1) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } tok->cont_line = 1; goto again; /* Read next line */ } + /* Punctuation character */ + int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); + if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { + /* This code block gets executed before the curly_bracket_depth is incremented + * by the `{` case, so for ensuring that we are on the 0th level, we need + * to adjust it manually */ + int cursor = current_tok->curly_bracket_depth - (c != '{'); + if (cursor == 0 && !update_fstring_expr(tok, c)) { + return MAKE_TOKEN(ENDMARKER); + } + if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) { + return MAKE_TOKEN(ERRORTOKEN); + } + + if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { + current_tok->kind = TOK_FSTRING_MODE; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(_PyToken_OneChar(c)); + } + } + /* Check for two-character token */ { int c2 = tok_nextc(tok); - int token = PyToken_TwoChars(c, c2); - if (token != OP) { + int current_token = _PyToken_TwoChars(c, c2); + if (current_token != OP) { int c3 = tok_nextc(tok); - int token3 = PyToken_ThreeChars(c, c2, c3); - if (token3 != OP) { - token = token3; + int current_token3 = _PyToken_ThreeChars(c, c2, c3); + if (current_token3 != OP) { + current_token = current_token3; } else { tok_backup(tok, c3); } - *p_start = tok->start; - *p_end = tok->cur; - return token; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(current_token); } tok_backup(tok, c2); } @@ -2091,58 +2679,305 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) case '[': case '{': if (tok->level >= MAXLEVEL) { - return syntaxerror(tok, "too many nested parentheses"); + return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses")); } tok->parenstack[tok->level] = c; tok->parenlinenostack[tok->level] = tok->lineno; tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); tok->level++; + if (INSIDE_FSTRING(tok)) { + current_tok->curly_bracket_depth++; + } break; case ')': case ']': case '}': - if (!tok->level) { - return syntaxerror(tok, "unmatched '%c'", c); - } - tok->level--; - int opening = tok->parenstack[tok->level]; - if (!((opening == '(' && c == ')') || - (opening == '[' && c == ']') || - (opening == '{' && c == '}'))) - { - if (tok->parenlinenostack[tok->level] != tok->lineno) { - return syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c' on line %d", - c, opening, tok->parenlinenostack[tok->level]); + if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { + return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed")); + } + if (!tok->tok_extra_tokens && !tok->level) { + return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c)); + } + if (tok->level > 0) { + tok->level--; + int opening = tok->parenstack[tok->level]; + if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') || + (opening == '[' && c == ']') || + (opening == '{' && c == '}'))) { + /* If the opening bracket belongs to an f-string's expression + part (e.g. f"{)}") and the closing bracket is an arbitrary + nested expression, then instead of matching a different + syntactical construct with it; we'll throw an unmatched + parentheses error. */ + if (INSIDE_FSTRING(tok) && opening == '{') { + assert(current_tok->curly_bracket_depth >= 0); + int previous_bracket = current_tok->curly_bracket_depth - 1; + if (previous_bracket == current_tok->curly_bracket_expr_start_depth) { + return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c)); + } + } + if (tok->parenlinenostack[tok->level] != tok->lineno) { + return MAKE_TOKEN(syntaxerror(tok, + "closing parenthesis '%c' does not match " + "opening parenthesis '%c' on line %d", + c, opening, tok->parenlinenostack[tok->level])); + } + else { + return MAKE_TOKEN(syntaxerror(tok, + "closing parenthesis '%c' does not match " + "opening parenthesis '%c'", + c, opening)); + } } - else { - return syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c'", - c, opening); + } + + if (INSIDE_FSTRING(tok)) { + current_tok->curly_bracket_depth--; + if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { + current_tok->curly_bracket_expr_start_depth--; + current_tok->kind = TOK_FSTRING_MODE; + current_tok->f_string_debug = 0; } } break; + default: + break; } if (!Py_UNICODE_ISPRINTABLE(c)) { - char hex[9]; - (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c); - return syntaxerror(tok, "invalid non-printable character U+%s", hex); + return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%04X", c)); + } + + if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) { + current_tok->f_string_debug = 1; } /* Punctuation character */ - *p_start = tok->start; - *p_end = tok->cur; - return PyToken_OneChar(c); + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(_PyToken_OneChar(c)); +} + +static int +tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) +{ + const char *p_start = NULL; + const char *p_end = NULL; + int end_quote_size = 0; + int unicode_escape = 0; + + tok->start = tok->cur; + tok->first_lineno = tok->lineno; + tok->starting_col_offset = tok->col_offset; + + // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize + // before it. + int start_char = tok_nextc(tok); + if (start_char == '{') { + int peek1 = tok_nextc(tok); + tok_backup(tok, peek1); + tok_backup(tok, start_char); + if (peek1 != '{') { + current_tok->curly_bracket_expr_start_depth++; + if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { + return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); + } + TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; + return tok_get_normal_mode(tok, current_tok, token); + } + } + else { + tok_backup(tok, start_char); + } + + // Check if we are at the end of the string + for (int i = 0; i < current_tok->f_string_quote_size; i++) { + int quote = tok_nextc(tok); + if (quote != current_tok->f_string_quote) { + tok_backup(tok, quote); + goto f_string_middle; + } + } + + if (current_tok->last_expr_buffer != NULL) { + PyMem_Free(current_tok->last_expr_buffer); + current_tok->last_expr_buffer = NULL; + current_tok->last_expr_size = 0; + current_tok->last_expr_end = -1; + } + + p_start = tok->start; + p_end = tok->cur; + tok->tok_mode_stack_index--; + return MAKE_TOKEN(FSTRING_END); + +f_string_middle: + + // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle + // this. + tok->multi_line_start = tok->line_start; + while (end_quote_size != current_tok->f_string_quote_size) { + int c = tok_nextc(tok); + if (tok->done == E_ERROR) { + return MAKE_TOKEN(ERRORTOKEN); + } + int in_format_spec = ( + current_tok->last_expr_end != -1 + && + INSIDE_FSTRING_EXPR(current_tok) + ); + + if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) { + if (tok->decoding_erred) { + return MAKE_TOKEN(ERRORTOKEN); + } + + // If we are in a format spec and we found a newline, + // it means that the format spec ends here and we should + // return to the regular mode. + if (in_format_spec && c == '\n') { + tok_backup(tok, c); + TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(FSTRING_MIDDLE); + } + + assert(tok->multi_line_start != NULL); + // shift the tok_state's location into + // the start of string, and report the error + // from the initial quote character + tok->cur = (char *)current_tok->f_string_start; + tok->cur++; + tok->line_start = current_tok->f_string_multi_line_start; + int start = tok->lineno; + + tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); + tok->lineno = the_current_tok->f_string_line_start; + + if (current_tok->f_string_quote_size == 3) { + syntaxerror(tok, + "unterminated triple-quoted f-string literal" + " (detected at line %d)", start); + if (c != '\n') { + tok->done = E_EOFS; + } + return MAKE_TOKEN(ERRORTOKEN); + } + else { + return MAKE_TOKEN(syntaxerror(tok, + "unterminated f-string literal (detected at" + " line %d)", start)); + } + } + + if (c == current_tok->f_string_quote) { + end_quote_size += 1; + continue; + } else { + end_quote_size = 0; + } + + if (c == '{') { + int peek = tok_nextc(tok); + if (peek != '{' || in_format_spec) { + tok_backup(tok, peek); + tok_backup(tok, c); + current_tok->curly_bracket_expr_start_depth++; + if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { + return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); + } + TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; + p_start = tok->start; + p_end = tok->cur; + } else { + p_start = tok->start; + p_end = tok->cur - 1; + } + return MAKE_TOKEN(FSTRING_MIDDLE); + } else if (c == '}') { + if (unicode_escape) { + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(FSTRING_MIDDLE); + } + int peek = tok_nextc(tok); + + // The tokenizer can only be in the format spec if we have already completed the expression + // scanning (indicated by the end of the expression being set) and we are not at the top level + // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double + // brackets, we can bypass it here. + if (peek == '}' && !in_format_spec) { + p_start = tok->start; + p_end = tok->cur - 1; + } else { + tok_backup(tok, peek); + tok_backup(tok, c); + TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; + p_start = tok->start; + p_end = tok->cur; + } + return MAKE_TOKEN(FSTRING_MIDDLE); + } else if (c == '\\') { + int peek = tok_nextc(tok); + if (peek == '\r') { + peek = tok_nextc(tok); + } + // Special case when the backslash is right before a curly + // brace. We have to restore and return the control back + // to the loop for the next iteration. + if (peek == '{' || peek == '}') { + if (!current_tok->f_string_raw) { + if (warn_invalid_escape_sequence(tok, peek)) { + return MAKE_TOKEN(ERRORTOKEN); + } + } + tok_backup(tok, peek); + continue; + } + + if (!current_tok->f_string_raw) { + if (peek == 'N') { + /* Handle named unicode escapes (\N{BULLET}) */ + peek = tok_nextc(tok); + if (peek == '{') { + unicode_escape = 1; + } else { + tok_backup(tok, peek); + } + } + } /* else { + skip the escaped character + }*/ + } + } + + // Backup the f-string quotes to emit a final FSTRING_MIDDLE and + // add the quotes to the FSTRING_END in the next tokenizer iteration. + for (int i = 0; i < current_tok->f_string_quote_size; i++) { + tok_backup(tok, current_tok->f_string_quote); + } + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(FSTRING_MIDDLE); +} + + +static int +tok_get(struct tok_state *tok, struct token *token) +{ + tokenizer_mode *current_tok = TOK_GET_MODE(tok); + if (current_tok->kind == TOK_REGULAR_MODE) { + return tok_get_normal_mode(tok, current_tok, token); + } else { + return tok_get_fstring_mode(tok, current_tok, token); + } } int -_PyTokenizer_Get(struct tok_state *tok, - const char **p_start, const char **p_end) +_PyTokenizer_Get(struct tok_state *tok, struct token *token) { - int result = tok_get(tok, p_start, p_end); + int result = tok_get(tok, token); if (tok->decoding_erred) { result = ERRORTOKEN; tok->done = E_DECODE; @@ -2198,8 +3033,6 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) { struct tok_state *tok; FILE *fp; - const char *p_start = NULL; - const char *p_end = NULL; char *encoding = NULL; fp = fdopen_borrow(fd); @@ -2212,8 +3045,7 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) return NULL; } if (filename != NULL) { - Py_INCREF(filename); - tok->filename = filename; + tok->filename = Py_NewRef(filename); } else { tok->filename = PyUnicode_FromString("<string>"); @@ -2223,11 +3055,14 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) return encoding; } } + struct token token; // We don't want to report warnings here because it could cause infinite recursion // if fetching the encoding shows a warning. tok->report_warnings = 0; while (tok->lineno < 2 && tok->done == E_OK) { - _PyTokenizer_Get(tok, &p_start, &p_end); + _PyToken_Init(&token); + _PyTokenizer_Get(tok, &token); + _PyToken_Free(&token); } fclose(fp); if (tok->encoding) { |