diff options
author | AlexSm <alex@ydb.tech> | 2024-03-05 10:40:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-05 12:40:59 +0300 |
commit | 1ac13c847b5358faba44dbb638a828e24369467b (patch) | |
tree | 07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Parser/pegen_errors.c | |
parent | ffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff) | |
download | ydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz |
Library import 16 (#2433)
Co-authored-by: robot-piglet <robot-piglet@yandex-team.com>
Co-authored-by: deshevoy <deshevoy@yandex-team.com>
Co-authored-by: robot-contrib <robot-contrib@yandex-team.com>
Co-authored-by: thegeorg <thegeorg@yandex-team.com>
Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com>
Co-authored-by: svidyuk <svidyuk@yandex-team.com>
Co-authored-by: shadchin <shadchin@yandex-team.com>
Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com>
Co-authored-by: innokentii <innokentii@yandex-team.com>
Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com>
Co-authored-by: snermolaev <snermolaev@yandex-team.com>
Co-authored-by: dimdim11 <dimdim11@yandex-team.com>
Co-authored-by: kickbutt <kickbutt@yandex-team.com>
Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com>
Co-authored-by: korsunandrei <korsunandrei@yandex-team.com>
Co-authored-by: petrk <petrk@yandex-team.com>
Co-authored-by: miroslav2 <miroslav2@yandex-team.com>
Co-authored-by: serjflint <serjflint@yandex-team.com>
Co-authored-by: akhropov <akhropov@yandex-team.com>
Co-authored-by: prettyboy <prettyboy@yandex-team.com>
Co-authored-by: ilikepugs <ilikepugs@yandex-team.com>
Co-authored-by: hiddenpath <hiddenpath@yandex-team.com>
Co-authored-by: mikhnenko <mikhnenko@yandex-team.com>
Co-authored-by: spreis <spreis@yandex-team.com>
Co-authored-by: andreyshspb <andreyshspb@yandex-team.com>
Co-authored-by: dimaandreev <dimaandreev@yandex-team.com>
Co-authored-by: rashid <rashid@yandex-team.com>
Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com>
Co-authored-by: r-vetrov <r-vetrov@yandex-team.com>
Co-authored-by: ypodlesov <ypodlesov@yandex-team.com>
Co-authored-by: zaverden <zaverden@yandex-team.com>
Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com>
Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com>
Co-authored-by: v-korovin <v-korovin@yandex-team.com>
Co-authored-by: arikon <arikon@yandex-team.com>
Co-authored-by: khoden <khoden@yandex-team.com>
Co-authored-by: psydmm <psydmm@yandex-team.com>
Co-authored-by: robot-javacom <robot-javacom@yandex-team.com>
Co-authored-by: dtorilov <dtorilov@yandex-team.com>
Co-authored-by: sennikovmv <sennikovmv@yandex-team.com>
Co-authored-by: hcpp <hcpp@ydb.tech>
Diffstat (limited to 'contrib/tools/python3/Parser/pegen_errors.c')
-rw-r--r-- | contrib/tools/python3/Parser/pegen_errors.c | 454 |
1 files changed, 454 insertions, 0 deletions
diff --git a/contrib/tools/python3/Parser/pegen_errors.c b/contrib/tools/python3/Parser/pegen_errors.c new file mode 100644 index 0000000000..cefec5d275 --- /dev/null +++ b/contrib/tools/python3/Parser/pegen_errors.c @@ -0,0 +1,454 @@ +#include <Python.h> +#include <errcode.h> + +#include "tokenizer.h" +#include "pegen.h" + +// TOKENIZER ERRORS + +void +_PyPegen_raise_tokenizer_init_error(PyObject *filename) +{ + if (!(PyErr_ExceptionMatches(PyExc_LookupError) + || PyErr_ExceptionMatches(PyExc_SyntaxError) + || PyErr_ExceptionMatches(PyExc_ValueError) + || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { + return; + } + PyObject *errstr = NULL; + PyObject *tuple = NULL; + PyObject *type; + PyObject *value; + PyObject *tback; + PyErr_Fetch(&type, &value, &tback); + errstr = PyObject_Str(value); + if (!errstr) { + goto error; + } + + PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); + if (!tmp) { + goto error; + } + + tuple = PyTuple_Pack(2, errstr, tmp); + Py_DECREF(tmp); + if (!value) { + goto error; + } + PyErr_SetObject(PyExc_SyntaxError, tuple); + +error: + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(tback); + Py_XDECREF(errstr); + Py_XDECREF(tuple); +} + +static inline void +raise_unclosed_parentheses_error(Parser *p) { + int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; + int error_col = p->tok->parencolstack[p->tok->level-1]; + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, + error_lineno, error_col, error_lineno, -1, + "'%c' was never closed", + p->tok->parenstack[p->tok->level-1]); +} + +int +_Pypegen_tokenizer_error(Parser *p) +{ + if (PyErr_Occurred()) { + return -1; + } + + const char *msg = NULL; + PyObject* errtype = PyExc_SyntaxError; + Py_ssize_t col_offset = -1; + p->error_indicator = 1; + switch (p->tok->done) { + case E_TOKEN: + msg = "invalid token"; + break; + case E_EOF: + if (p->tok->level) { + raise_unclosed_parentheses_error(p); + } else { + RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); + } + return -1; + case E_DEDENT: + RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); + return -1; + case E_INTR: + if (!PyErr_Occurred()) { + PyErr_SetNone(PyExc_KeyboardInterrupt); + } + return -1; + case E_NOMEM: + PyErr_NoMemory(); + return -1; + case E_TABSPACE: + errtype = PyExc_TabError; + msg = "inconsistent use of tabs and spaces in indentation"; + break; + case E_TOODEEP: + errtype = PyExc_IndentationError; + msg = "too many levels of indentation"; + break; + case E_LINECONT: { + col_offset = p->tok->cur - p->tok->buf - 1; + msg = "unexpected character after line continuation character"; + break; + } + case E_COLUMNOVERFLOW: + PyErr_SetString(PyExc_OverflowError, + "Parser column offset overflow - source line is too big"); + return -1; + default: + msg = "unknown parsing error"; + } + + RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, + col_offset >= 0 ? col_offset : 0, + p->tok->lineno, -1, msg); + return -1; +} + +int +_Pypegen_raise_decode_error(Parser *p) +{ + assert(PyErr_Occurred()); + const char *errtype = NULL; + if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { + errtype = "unicode error"; + } + else if (PyErr_ExceptionMatches(PyExc_ValueError)) { + errtype = "value error"; + } + if (errtype) { + PyObject *type; + PyObject *value; + PyObject *tback; + PyObject *errstr; + PyErr_Fetch(&type, &value, &tback); + errstr = PyObject_Str(value); + if (errstr) { + RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); + Py_DECREF(errstr); + } + else { + PyErr_Clear(); + RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); + } + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(tback); + } + + return -1; +} + +static int +_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { + // Tokenize the whole input to see if there are any tokenization + // errors such as mistmatching parentheses. These will get priority + // over generic syntax errors only if the line number of the error is + // before the one that we had for the generic error. + + // We don't want to tokenize to the end for interactive input + if (p->tok->prompt != NULL) { + return 0; + } + + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + + Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; + Py_ssize_t current_err_line = current_token->lineno; + + int ret = 0; + struct token new_token; + _PyToken_Init(&new_token); + + for (;;) { + switch (_PyTokenizer_Get(p->tok, &new_token)) { + case ERRORTOKEN: + if (PyErr_Occurred()) { + ret = -1; + goto exit; + } + if (p->tok->level != 0) { + int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; + if (current_err_line > error_lineno) { + raise_unclosed_parentheses_error(p); + ret = -1; + goto exit; + } + } + break; + case ENDMARKER: + break; + default: + continue; + } + break; + } + + +exit: + _PyToken_Free(&new_token); + // If we're in an f-string, we want the syntax error in the expression part + // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards + // do not swallow it. + if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) { + Py_XDECREF(value); + Py_XDECREF(type); + Py_XDECREF(traceback); + } else { + PyErr_Restore(type, value, traceback); + } + return ret; +} + +// PARSER ERRORS + +void * +_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...) +{ + // Bail out if we already have an error set. + if (p->error_indicator && PyErr_Occurred()) { + return NULL; + } + if (p->fill == 0) { + va_list va; + va_start(va, errmsg); + _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va); + va_end(va); + return NULL; + } + if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) { + p->error_indicator = 1; + return NULL; + } + Token *t = p->known_err_token != NULL + ? p->known_err_token + : p->tokens[use_mark ? p->mark : p->fill - 1]; + Py_ssize_t col_offset; + Py_ssize_t end_col_offset = -1; + if (t->col_offset == -1) { + if (p->tok->cur == p->tok->buf) { + col_offset = 0; + } else { + const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf; + col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); + } + } else { + col_offset = t->col_offset + 1; + } + + if (t->end_col_offset != -1) { + end_col_offset = t->end_col_offset + 1; + } + + va_list va; + va_start(va, errmsg); + _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va); + va_end(va); + + return NULL; +} + +static PyObject * +get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) +{ + /* If the file descriptor is interactive, the source lines of the current + * (multi-line) statement are stored in p->tok->interactive_src_start. + * If not, we're parsing from a string, which means that the whole source + * is stored in p->tok->str. */ + assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL); + + char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; + if (cur_line == NULL) { + assert(p->tok->fp_interactive); + // We can reach this point if the tokenizer buffers for interactive source have not been + // initialized because we failed to decode the original source with the given locale. + return PyUnicode_FromStringAndSize("", 0); + } + + Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno; + const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp; + + if (buf_end < cur_line) { + buf_end = cur_line + strlen(cur_line); + } + + for (int i = 0; i < relative_lineno - 1; i++) { + char *new_line = strchr(cur_line, '\n'); + // The assert is here for debug builds but the conditional that + // follows is there so in release builds we do not crash at the cost + // to report a potentially wrong line. + assert(new_line != NULL && new_line + 1 < buf_end); + if (new_line == NULL || new_line + 1 > buf_end) { + break; + } + cur_line = new_line + 1; + } + + char *next_newline; + if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line + next_newline = cur_line + strlen(cur_line); + } + return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); +} + +void * +_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, + Py_ssize_t lineno, Py_ssize_t col_offset, + Py_ssize_t end_lineno, Py_ssize_t end_col_offset, + const char *errmsg, va_list va) +{ + // Bail out if we already have an error set. + if (p->error_indicator && PyErr_Occurred()) { + return NULL; + } + PyObject *value = NULL; + PyObject *errstr = NULL; + PyObject *error_line = NULL; + PyObject *tmp = NULL; + p->error_indicator = 1; + + if (end_lineno == CURRENT_POS) { + end_lineno = p->tok->lineno; + } + if (end_col_offset == CURRENT_POS) { + end_col_offset = p->tok->cur - p->tok->line_start; + } + + errstr = PyUnicode_FromFormatV(errmsg, va); + if (!errstr) { + goto error; + } + + if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) { + error_line = get_error_line_from_tokenizer_buffers(p, lineno); + } + else if (p->start_rule == Py_file_input) { + error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, + (int) lineno, p->tok->encoding); + } + + if (!error_line) { + /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called, + then we need to find the error line from some other source, because + p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly + failed or we're parsing from a string or the REPL. There's a third edge case where + we're actually parsing from a file, which has an E_EOF SyntaxError and in that case + `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which + does not physically exist */ + assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); + + if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { + Py_ssize_t size = p->tok->inp - p->tok->buf; + error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); + } + else if (p->tok->fp == NULL || p->tok->fp == stdin) { + error_line = get_error_line_from_tokenizer_buffers(p, lineno); + } + else { + error_line = PyUnicode_FromStringAndSize("", 0); + } + if (!error_line) { + goto error; + } + } + + Py_ssize_t col_number = col_offset; + Py_ssize_t end_col_number = end_col_offset; + + if (p->tok->encoding != NULL) { + col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset); + if (col_number < 0) { + goto error; + } + if (end_col_number > 0) { + Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number); + if (end_col_offset < 0) { + goto error; + } else { + end_col_number = end_col_offset; + } + } + } + tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number); + if (!tmp) { + goto error; + } + value = PyTuple_Pack(2, errstr, tmp); + Py_DECREF(tmp); + if (!value) { + goto error; + } + PyErr_SetObject(errtype, value); + + Py_DECREF(errstr); + Py_DECREF(value); + return NULL; + +error: + Py_XDECREF(errstr); + Py_XDECREF(error_line); + return NULL; +} + +void +_Pypegen_set_syntax_error(Parser* p, Token* last_token) { + // Existing sintax error + if (PyErr_Occurred()) { + // Prioritize tokenizer errors to custom syntax errors raised + // on the second phase only if the errors come from the parser. + int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK); + if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) { + _PyPegen_tokenize_full_source_to_check_for_errors(p); + } + // Propagate the existing syntax error. + return; + } + // Initialization error + if (p->fill == 0) { + RAISE_SYNTAX_ERROR("error at start before reading any input"); + } + // Parser encountered EOF (End of File) unexpectedtly + if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) { + if (p->tok->level) { + raise_unclosed_parentheses_error(p); + } else { + RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); + } + return; + } + // Indentation error in the tokenizer + if (last_token->type == INDENT || last_token->type == DEDENT) { + RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent"); + return; + } + // Unknown error (generic case) + + // Use the last token we found on the first pass to avoid reporting + // incorrect locations for generic syntax errors just because we reached + // further away when trying to find specific syntax errors in the second + // pass. + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); + // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing + // generic SyntaxError we just raised if errors are found. + _PyPegen_tokenize_full_source_to_check_for_errors(p); +} + +void +_Pypegen_stack_overflow(Parser *p) +{ + p->error_indicator = 1; + PyErr_SetString(PyExc_MemoryError, + "Parser stack overflowed - Python source too complex to parse"); +} |