Update Python from 3.11.8 to 3.12.2

author: shadchin <shadchin@yandex-team.com> 2024-02-12 07:53:52 +0300
committer: shadchin <shadchin@yandex-team.com> 2024-02-12 08:07:36 +0300
commit: ce1b7ca3171f9158180640c6a02a74b4afffedea (patch)
tree: e47c1e8391b1b0128262c1e9b1e6ed4c8fff2348 /contrib/tools/python3/src/Python/Python-tokenize.c
parent: 57350d96f030db90f220ce50ee591d5c5d403df7 (diff)
download: ydb-ce1b7ca3171f9158180640c6a02a74b4afffedea.tar.gz
1 file changed, 200 insertions, 27 deletions
diff --git a/contrib/tools/python3/src/Python/Python-tokenize.c b/contrib/tools/python3/src/Python/Python-tokenize.c
index 6acfc2a7cf..179f71aa1f 100644
--- a/contrib/tools/python3/src/Python/Python-tokenize.c
+++ b/contrib/tools/python3/src/Python/Python-tokenize.c
@@ -1,5 +1,8 @@
 #include "Python.h"
+#include "errcode.h"
 #include "../Parser/tokenizer.h"
+#include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset()
+#include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset()
 
 static struct PyModuleDef _tokenizemodule;
 
@@ -15,6 +18,7 @@ get_tokenize_state(PyObject *module) {
 #define _tokenize_get_state_by_type(type) \
     get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))
 
+#include "pycore_runtime.h"
 #include "clinic/Python-tokenize.c.h"
 
 /*[clinic input]
@@ -26,18 +30,24 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t
 typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
+    int done;
 } tokenizeriterobject;
 
 /*[clinic input]
 @classmethod
 _tokenizer.tokenizeriter.__new__ as tokenizeriter_new
 
-    source: str
+    readline: object
+    /
+    *
+    extra_tokens: bool
+    encoding: str(c_default="NULL") = 'utf-8'
 [clinic start generated code]*/
 
 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source)
-/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+                       int extra_tokens, const char *encoding)
+/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
 {
     tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
     if (self == NULL) {
@@ -47,58 +57,220 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source)
     if (filename == NULL) {
         return NULL;
     }
-    self->tok = _PyTokenizer_FromUTF8(source, 1);
+    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
     if (self->tok == NULL) {
         Py_DECREF(filename);
         return NULL;
     }
     self->tok->filename = filename;
+    if (extra_tokens) {
+        self->tok->tok_extra_tokens = 1;
+    }
+    self->done = 0;
     return (PyObject *)self;
 }
 
+static int
+_tokenizer_error(struct tok_state *tok)
+{
+    if (PyErr_Occurred()) {
+        return -1;
+    }
+
+    const char *msg = NULL;
+    PyObject* errtype = PyExc_SyntaxError;
+    switch (tok->done) {
+        case E_TOKEN:
+            msg = "invalid token";
+            break;
+        case E_EOF:
+            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
+                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
+            return -1;
+        case E_DEDENT:
+            msg = "unindent does not match any outer indentation level";
+            errtype = PyExc_IndentationError;
+            break;
+        case E_INTR:
+            if (!PyErr_Occurred()) {
+                PyErr_SetNone(PyExc_KeyboardInterrupt);
+            }
+            return -1;
+        case E_NOMEM:
+            PyErr_NoMemory();
+            return -1;
+        case E_TABSPACE:
+            errtype = PyExc_TabError;
+            msg = "inconsistent use of tabs and spaces in indentation";
+            break;
+        case E_TOODEEP:
+            errtype = PyExc_IndentationError;
+            msg = "too many levels of indentation";
+            break;
+        case E_LINECONT: {
+            msg = "unexpected character after line continuation character";
+            break;
+        }
+        default:
+            msg = "unknown tokenization error";
+    }
+
+    PyObject* errstr = NULL;
+    PyObject* error_line = NULL;
+    PyObject* tmp = NULL;
+    PyObject* value = NULL;
+    int result = 0;
+
+    Py_ssize_t size = tok->inp - tok->buf;
+    assert(tok->buf[size-1] == '\n');
+    size -= 1; // Remove the newline character from the end of the line
+    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
+    if (!error_line) {
+        result = -1;
+        goto exit;
+    }
+
+    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
+    if (offset == -1) {
+        result = -1;
+        goto exit;
+    }
+    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
+    if (!tmp) {
+        result = -1;
+        goto exit;
+    }
+
+    errstr = PyUnicode_FromString(msg);
+    if (!errstr) {
+        result = -1;
+        goto exit;
+    }
+
+    value = PyTuple_Pack(2, errstr, tmp);
+    if (!value) {
+        result = -1;
+        goto exit;
+    }
+
+    PyErr_SetObject(errtype, value);
+
+exit:
+    Py_XDECREF(errstr);
+    Py_XDECREF(error_line);
+    Py_XDECREF(tmp);
+    Py_XDECREF(value);
+    return result;
+}
+
 static PyObject *
 tokenizeriter_next(tokenizeriterobject *it)
 {
-    const char *start;
-    const char *end;
-    int type = _PyTokenizer_Get(it->tok, &start, &end);
-    if (type == ERRORTOKEN && PyErr_Occurred()) {
-        return NULL;
+    PyObject* result = NULL;
+    struct token token;
+    _PyToken_Init(&token);
+
+    int type = _PyTokenizer_Get(it->tok, &token);
+    if (type == ERRORTOKEN) {
+        if(!PyErr_Occurred()) {
+            _tokenizer_error(it->tok);
+            assert(PyErr_Occurred());
+        }
+        goto exit;
     }
-    if (type == ERRORTOKEN || type == ENDMARKER) {
+    if (it->done || type == ERRORTOKEN) {
         PyErr_SetString(PyExc_StopIteration, "EOF");
-        return NULL;
+        it->done = 1;
+        goto exit;
     }
     PyObject *str = NULL;
-    if (start == NULL || end == NULL) {
+    if (token.start == NULL || token.end == NULL) {
         str = PyUnicode_FromString("");
     }
     else {
-        str = PyUnicode_FromStringAndSize(start, end - start);
+        str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
     }
     if (str == NULL) {
-        return NULL;
+        goto exit;
     }
 
-    Py_ssize_t size = it->tok->inp - it->tok->buf;
-    PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+    int is_trailing_token = 0;
+    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+        is_trailing_token = 1;
+    }
+
+    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+    PyObject* line = NULL;
+    if (it->tok->tok_extra_tokens && is_trailing_token) {
+        line = PyUnicode_FromString("");
+    } else {
+        Py_ssize_t size = it->tok->inp - line_start;
+        if (size >= 1 && it->tok->implicit_newline) {
+            size -= 1;
+        }
+        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+    }
     if (line == NULL) {
         Py_DECREF(str);
-        return NULL;
+        goto exit;
+    }
+
+    Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
+    Py_ssize_t end_lineno = it->tok->lineno;
+    Py_ssize_t col_offset = -1;
+    Py_ssize_t end_col_offset = -1;
+    if (token.start != NULL && token.start >= line_start) {
+        col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
     }
-    const char *line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
-    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
-    int end_lineno = it->tok->lineno;
-    int col_offset = -1;
-    int end_col_offset = -1;
-    if (start != NULL && start >= line_start) {
-        col_offset = (int)(start - line_start);
+    if (token.end != NULL && token.end >= it->tok->line_start) {
+        end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
     }
-    if (end != NULL && end >= it->tok->line_start) {
-        end_col_offset = (int)(end - it->tok->line_start);
+
+    if (it->tok->tok_extra_tokens) {
+        if (is_trailing_token) {
+            lineno = end_lineno = lineno + 1;
+            col_offset = end_col_offset = 0;
+        }
+        // Necessary adjustments to match the original Python tokenize
+        // implementation
+        if (type > DEDENT && type < OP) {
+            type = OP;
+        }
+        else if (type == ASYNC || type == AWAIT) {
+            type = NAME;
+        }
+        else if (type == NEWLINE) {
+            Py_DECREF(str);
+            if (!it->tok->implicit_newline) {
+                if (it->tok->start[0] == '\r') {
+                    str = PyUnicode_FromString("\r\n");
+                } else {
+                    str = PyUnicode_FromString("\n");
+                }
+            }
+            end_col_offset++;
+        }
+        else if (type == NL) {
+            if (it->tok->implicit_newline) {
+                Py_DECREF(str);
+                str = PyUnicode_FromString("");
+            }
+        }
+
+        if (str == NULL) {
+            Py_DECREF(line);
+            goto exit;
+        }
     }
 
-    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+    result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
+exit:
+    _PyToken_Free(&token);
+    if (type == ENDMARKER) {
+        it->done = 1;
+    }
+    return result;
 }
 
 static void
@@ -151,6 +323,7 @@ static PyMethodDef tokenize_methods[] = {
 
 static PyModuleDef_Slot tokenizemodule_slots[] = {
     {Py_mod_exec, tokenizemodule_exec},
+    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
     {0, NULL}
 };
author	shadchin <shadchin@yandex-team.com>	2024-02-12 07:53:52 +0300
committer	shadchin <shadchin@yandex-team.com>	2024-02-12 08:07:36 +0300
commit	ce1b7ca3171f9158180640c6a02a74b4afffedea (patch)
tree	e47c1e8391b1b0128262c1e9b1e6ed4c8fff2348 /contrib/tools/python3/src/Python/Python-tokenize.c
parent	57350d96f030db90f220ce50ee591d5c5d403df7 (diff)
download	ydb-ce1b7ca3171f9158180640c6a02a74b4afffedea.tar.gz