Library import 16 (#2433)

Co-authored-by: robot-piglet <robot-piglet@yandex-team.com> Co-authored-by: deshevoy <deshevoy@yandex-team.com> Co-authored-by: robot-contrib <robot-contrib@yandex-team.com> Co-authored-by: thegeorg <thegeorg@yandex-team.com> Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com> Co-authored-by: svidyuk <svidyuk@yandex-team.com> Co-authored-by: shadchin <shadchin@yandex-team.com> Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com> Co-authored-by: innokentii <innokentii@yandex-team.com> Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com> Co-authored-by: snermolaev <snermolaev@yandex-team.com> Co-authored-by: dimdim11 <dimdim11@yandex-team.com> Co-authored-by: kickbutt <kickbutt@yandex-team.com> Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com> Co-authored-by: korsunandrei <korsunandrei@yandex-team.com> Co-authored-by: petrk <petrk@yandex-team.com> Co-authored-by: miroslav2 <miroslav2@yandex-team.com> Co-authored-by: serjflint <serjflint@yandex-team.com> Co-authored-by: akhropov <akhropov@yandex-team.com> Co-authored-by: prettyboy <prettyboy@yandex-team.com> Co-authored-by: ilikepugs <ilikepugs@yandex-team.com> Co-authored-by: hiddenpath <hiddenpath@yandex-team.com> Co-authored-by: mikhnenko <mikhnenko@yandex-team.com> Co-authored-by: spreis <spreis@yandex-team.com> Co-authored-by: andreyshspb <andreyshspb@yandex-team.com> Co-authored-by: dimaandreev <dimaandreev@yandex-team.com> Co-authored-by: rashid <rashid@yandex-team.com> Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com> Co-authored-by: r-vetrov <r-vetrov@yandex-team.com> Co-authored-by: ypodlesov <ypodlesov@yandex-team.com> Co-authored-by: zaverden <zaverden@yandex-team.com> Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com> Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com> Co-authored-by: v-korovin <v-korovin@yandex-team.com> Co-authored-by: arikon <arikon@yandex-team.com> Co-authored-by: khoden <khoden@yandex-team.com> Co-authored-by: psydmm <psydmm@yandex-team.com> Co-authored-by: robot-javacom <robot-javacom@yandex-team.com> Co-authored-by: dtorilov <dtorilov@yandex-team.com> Co-authored-by: sennikovmv <sennikovmv@yandex-team.com> Co-authored-by: hcpp <hcpp@ydb.tech>
author: AlexSm <alex@ydb.tech> 2024-03-05 10:40:59 +0100
committer: GitHub <noreply@github.com> 2024-03-05 12:40:59 +0300
commit: 1ac13c847b5358faba44dbb638a828e24369467b (patch)
tree: 07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Parser/pegen_errors.c
parent: ffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff)
download: ydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz
1 files changed, 454 insertions, 0 deletions
diff --git a/contrib/tools/python3/Parser/pegen_errors.c b/contrib/tools/python3/Parser/pegen_errors.c
new file mode 100644
index 0000000000..cefec5d275
--- /dev/null
+++ b/contrib/tools/python3/Parser/pegen_errors.c
@@ -0,0 +1,454 @@
+#include <Python.h>
+#include <errcode.h>
+
+#include "tokenizer.h"
+#include "pegen.h"
+
+// TOKENIZER ERRORS
+
+void
+_PyPegen_raise_tokenizer_init_error(PyObject *filename)
+{
+    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
+          || PyErr_ExceptionMatches(PyExc_SyntaxError)
+          || PyErr_ExceptionMatches(PyExc_ValueError)
+          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
+        return;
+    }
+    PyObject *errstr = NULL;
+    PyObject *tuple = NULL;
+    PyObject *type;
+    PyObject *value;
+    PyObject *tback;
+    PyErr_Fetch(&type, &value, &tback);
+    errstr = PyObject_Str(value);
+    if (!errstr) {
+        goto error;
+    }
+
+    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
+    if (!tmp) {
+        goto error;
+    }
+
+    tuple = PyTuple_Pack(2, errstr, tmp);
+    Py_DECREF(tmp);
+    if (!value) {
+        goto error;
+    }
+    PyErr_SetObject(PyExc_SyntaxError, tuple);
+
+error:
+    Py_XDECREF(type);
+    Py_XDECREF(value);
+    Py_XDECREF(tback);
+    Py_XDECREF(errstr);
+    Py_XDECREF(tuple);
+}
+
+static inline void
+raise_unclosed_parentheses_error(Parser *p) {
+       int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
+       int error_col = p->tok->parencolstack[p->tok->level-1];
+       RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
+                                  error_lineno, error_col, error_lineno, -1,
+                                  "'%c' was never closed",
+                                  p->tok->parenstack[p->tok->level-1]);
+}
+
+int
+_Pypegen_tokenizer_error(Parser *p)
+{
+    if (PyErr_Occurred()) {
+        return -1;
+    }
+
+    const char *msg = NULL;
+    PyObject* errtype = PyExc_SyntaxError;
+    Py_ssize_t col_offset = -1;
+    p->error_indicator = 1;
+    switch (p->tok->done) {
+        case E_TOKEN:
+            msg = "invalid token";
+            break;
+        case E_EOF:
+            if (p->tok->level) {
+                raise_unclosed_parentheses_error(p);
+            } else {
+                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
+            }
+            return -1;
+        case E_DEDENT:
+            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
+            return -1;
+        case E_INTR:
+            if (!PyErr_Occurred()) {
+                PyErr_SetNone(PyExc_KeyboardInterrupt);
+            }
+            return -1;
+        case E_NOMEM:
+            PyErr_NoMemory();
+            return -1;
+        case E_TABSPACE:
+            errtype = PyExc_TabError;
+            msg = "inconsistent use of tabs and spaces in indentation";
+            break;
+        case E_TOODEEP:
+            errtype = PyExc_IndentationError;
+            msg = "too many levels of indentation";
+            break;
+        case E_LINECONT: {
+            col_offset = p->tok->cur - p->tok->buf - 1;
+            msg = "unexpected character after line continuation character";
+            break;
+        }
+        case E_COLUMNOVERFLOW:
+            PyErr_SetString(PyExc_OverflowError,
+                    "Parser column offset overflow - source line is too big");
+            return -1;
+        default:
+            msg = "unknown parsing error";
+    }
+
+    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
+                               col_offset >= 0 ? col_offset : 0,
+                               p->tok->lineno, -1, msg);
+    return -1;
+}
+
+int
+_Pypegen_raise_decode_error(Parser *p)
+{
+    assert(PyErr_Occurred());
+    const char *errtype = NULL;
+    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
+        errtype = "unicode error";
+    }
+    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
+        errtype = "value error";
+    }
+    if (errtype) {
+        PyObject *type;
+        PyObject *value;
+        PyObject *tback;
+        PyObject *errstr;
+        PyErr_Fetch(&type, &value, &tback);
+        errstr = PyObject_Str(value);
+        if (errstr) {
+            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
+            Py_DECREF(errstr);
+        }
+        else {
+            PyErr_Clear();
+            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
+        }
+        Py_XDECREF(type);
+        Py_XDECREF(value);
+        Py_XDECREF(tback);
+    }
+
+    return -1;
+}
+
+static int
+_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
+    // Tokenize the whole input to see if there are any tokenization
+    // errors such as mistmatching parentheses. These will get priority
+    // over generic syntax errors only if the line number of the error is
+    // before the one that we had for the generic error.
+
+    // We don't want to tokenize to the end for interactive input
+    if (p->tok->prompt != NULL) {
+        return 0;
+    }
+
+    PyObject *type, *value, *traceback;
+    PyErr_Fetch(&type, &value, &traceback);
+
+    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
+    Py_ssize_t current_err_line = current_token->lineno;
+
+    int ret = 0;
+    struct token new_token;
+    _PyToken_Init(&new_token);
+
+    for (;;) {
+        switch (_PyTokenizer_Get(p->tok, &new_token)) {
+            case ERRORTOKEN:
+                if (PyErr_Occurred()) {
+                    ret = -1;
+                    goto exit;
+                }
+                if (p->tok->level != 0) {
+                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
+                    if (current_err_line > error_lineno) {
+                        raise_unclosed_parentheses_error(p);
+                        ret = -1;
+                        goto exit;
+                    }
+                }
+                break;
+            case ENDMARKER:
+                break;
+            default:
+                continue;
+        }
+        break;
+    }
+
+
+exit:
+    _PyToken_Free(&new_token);
+    // If we're in an f-string, we want the syntax error in the expression part
+    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
+    // do not swallow it.
+    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
+        Py_XDECREF(value);
+        Py_XDECREF(type);
+        Py_XDECREF(traceback);
+    } else {
+        PyErr_Restore(type, value, traceback);
+    }
+    return ret;
+}
+
+// PARSER ERRORS
+
+void *
+_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
+{
+    // Bail out if we already have an error set.
+    if (p->error_indicator && PyErr_Occurred()) {
+        return NULL;
+    }
+    if (p->fill == 0) {
+        va_list va;
+        va_start(va, errmsg);
+        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
+        va_end(va);
+        return NULL;
+    }
+    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
+        p->error_indicator = 1;
+        return NULL;
+    }
+    Token *t = p->known_err_token != NULL
+                   ? p->known_err_token
+                   : p->tokens[use_mark ? p->mark : p->fill - 1];
+    Py_ssize_t col_offset;
+    Py_ssize_t end_col_offset = -1;
+    if (t->col_offset == -1) {
+        if (p->tok->cur == p->tok->buf) {
+            col_offset = 0;
+        } else {
+            const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
+            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
+        }
+    } else {
+        col_offset = t->col_offset + 1;
+    }
+
+    if (t->end_col_offset != -1) {
+        end_col_offset = t->end_col_offset + 1;
+    }
+
+    va_list va;
+    va_start(va, errmsg);
+    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
+    va_end(va);
+
+    return NULL;
+}
+
+static PyObject *
+get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
+{
+    /* If the file descriptor is interactive, the source lines of the current
+     * (multi-line) statement are stored in p->tok->interactive_src_start.
+     * If not, we're parsing from a string, which means that the whole source
+     * is stored in p->tok->str. */
+    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);
+
+    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
+    if (cur_line == NULL) {
+        assert(p->tok->fp_interactive);
+        // We can reach this point if the tokenizer buffers for interactive source have not been
+        // initialized because we failed to decode the original source with the given locale.
+        return PyUnicode_FromStringAndSize("", 0);
+    }
+
+    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
+    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
+
+    if (buf_end < cur_line) {
+        buf_end = cur_line + strlen(cur_line);
+    }
+
+    for (int i = 0; i < relative_lineno - 1; i++) {
+        char *new_line = strchr(cur_line, '\n');
+        // The assert is here for debug builds but the conditional that
+        // follows is there so in release builds we do not crash at the cost
+        // to report a potentially wrong line.
+        assert(new_line != NULL && new_line + 1 < buf_end);
+        if (new_line == NULL || new_line + 1 > buf_end) {
+            break;
+        }
+        cur_line = new_line + 1;
+    }
+
+    char *next_newline;
+    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
+        next_newline = cur_line + strlen(cur_line);
+    }
+    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
+}
+
+void *
+_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
+                                    Py_ssize_t lineno, Py_ssize_t col_offset,
+                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
+                                    const char *errmsg, va_list va)
+{
+    // Bail out if we already have an error set.
+    if (p->error_indicator && PyErr_Occurred()) {
+        return NULL;
+    }
+    PyObject *value = NULL;
+    PyObject *errstr = NULL;
+    PyObject *error_line = NULL;
+    PyObject *tmp = NULL;
+    p->error_indicator = 1;
+
+    if (end_lineno == CURRENT_POS) {
+        end_lineno = p->tok->lineno;
+    }
+    if (end_col_offset == CURRENT_POS) {
+        end_col_offset = p->tok->cur - p->tok->line_start;
+    }
+
+    errstr = PyUnicode_FromFormatV(errmsg, va);
+    if (!errstr) {
+        goto error;
+    }
+
+    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
+        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
+    }
+    else if (p->start_rule == Py_file_input) {
+        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
+                                                     (int) lineno, p->tok->encoding);
+    }
+
+    if (!error_line) {
+        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
+           then we need to find the error line from some other source, because
+           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
+           failed or we're parsing from a string or the REPL. There's a third edge case where
+           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
+           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
+           does not physically exist */
+        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
+
+        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
+            Py_ssize_t size = p->tok->inp - p->tok->buf;
+            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
+        }
+        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
+            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
+        }
+        else {
+            error_line = PyUnicode_FromStringAndSize("", 0);
+        }
+        if (!error_line) {
+            goto error;
+        }
+    }
+
+    Py_ssize_t col_number = col_offset;
+    Py_ssize_t end_col_number = end_col_offset;
+
+    if (p->tok->encoding != NULL) {
+        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
+        if (col_number < 0) {
+            goto error;
+        }
+        if (end_col_number > 0) {
+            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
+            if (end_col_offset < 0) {
+                goto error;
+            } else {
+                end_col_number = end_col_offset;
+            }
+        }
+    }
+    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
+    if (!tmp) {
+        goto error;
+    }
+    value = PyTuple_Pack(2, errstr, tmp);
+    Py_DECREF(tmp);
+    if (!value) {
+        goto error;
+    }
+    PyErr_SetObject(errtype, value);
+
+    Py_DECREF(errstr);
+    Py_DECREF(value);
+    return NULL;
+
+error:
+    Py_XDECREF(errstr);
+    Py_XDECREF(error_line);
+    return NULL;
+}
+
+void
+_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
+    // Existing sintax error
+    if (PyErr_Occurred()) {
+        // Prioritize tokenizer errors to custom syntax errors raised
+        // on the second phase only if the errors come from the parser.
+        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
+        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
+            _PyPegen_tokenize_full_source_to_check_for_errors(p);
+        }
+        // Propagate the existing syntax error.
+        return;
+    }
+    // Initialization error
+    if (p->fill == 0) {
+        RAISE_SYNTAX_ERROR("error at start before reading any input");
+    }
+    // Parser encountered EOF (End of File) unexpectedtly
+    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
+        if (p->tok->level) {
+            raise_unclosed_parentheses_error(p);
+        } else {
+            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
+        }
+        return;
+    }
+    // Indentation error in the tokenizer
+    if (last_token->type == INDENT || last_token->type == DEDENT) {
+        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
+        return;
+    }
+    // Unknown error (generic case)
+
+    // Use the last token we found on the first pass to avoid reporting
+    // incorrect locations for generic syntax errors just because we reached
+    // further away when trying to find specific syntax errors in the second
+    // pass.
+    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
+    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
+    // generic SyntaxError we just raised if errors are found.
+    _PyPegen_tokenize_full_source_to_check_for_errors(p);
+}
+
+void
+_Pypegen_stack_overflow(Parser *p)
+{
+    p->error_indicator = 1;
+    PyErr_SetString(PyExc_MemoryError,
+        "Parser stack overflowed - Python source too complex to parse");
+}
author	AlexSm <alex@ydb.tech>	2024-03-05 10:40:59 +0100
committer	GitHub <noreply@github.com>	2024-03-05 12:40:59 +0300
commit	1ac13c847b5358faba44dbb638a828e24369467b (patch)
tree	07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Parser/pegen_errors.c
parent	ffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff)
download	ydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz