diff options
author | shadchin <shadchin@yandex-team.com> | 2024-02-12 07:53:52 +0300 |
---|---|---|
committer | shadchin <shadchin@yandex-team.com> | 2024-02-12 08:07:36 +0300 |
commit | ce1b7ca3171f9158180640c6a02a74b4afffedea (patch) | |
tree | e47c1e8391b1b0128262c1e9b1e6ed4c8fff2348 /contrib/tools/python3/src/Python/Python-tokenize.c | |
parent | 57350d96f030db90f220ce50ee591d5c5d403df7 (diff) | |
download | ydb-ce1b7ca3171f9158180640c6a02a74b4afffedea.tar.gz |
Update Python from 3.11.8 to 3.12.2
Diffstat (limited to 'contrib/tools/python3/src/Python/Python-tokenize.c')
-rw-r--r-- | contrib/tools/python3/src/Python/Python-tokenize.c | 227 |
1 files changed, 200 insertions, 27 deletions
diff --git a/contrib/tools/python3/src/Python/Python-tokenize.c b/contrib/tools/python3/src/Python/Python-tokenize.c index 6acfc2a7cf..179f71aa1f 100644 --- a/contrib/tools/python3/src/Python/Python-tokenize.c +++ b/contrib/tools/python3/src/Python/Python-tokenize.c @@ -1,5 +1,8 @@ #include "Python.h" +#include "errcode.h" #include "../Parser/tokenizer.h" +#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() +#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() static struct PyModuleDef _tokenizemodule; @@ -15,6 +18,7 @@ get_tokenize_state(PyObject *module) { #define _tokenize_get_state_by_type(type) \ get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule)) +#include "pycore_runtime.h" #include "clinic/Python-tokenize.c.h" /*[clinic input] @@ -26,18 +30,24 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t typedef struct { PyObject_HEAD struct tok_state *tok; + int done; } tokenizeriterobject; /*[clinic input] @classmethod _tokenizer.tokenizeriter.__new__ as tokenizeriter_new - source: str + readline: object + / + * + extra_tokens: bool + encoding: str(c_default="NULL") = 'utf-8' [clinic start generated code]*/ static PyObject * -tokenizeriter_new_impl(PyTypeObject *type, const char *source) -/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/ +tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline, + int extra_tokens, const char *encoding) +/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/ { tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0); if (self == NULL) { @@ -47,58 +57,220 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source) if (filename == NULL) { return NULL; } - self->tok = _PyTokenizer_FromUTF8(source, 1); + self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1); if (self->tok == NULL) { Py_DECREF(filename); return NULL; } self->tok->filename = filename; + if (extra_tokens) { + self->tok->tok_extra_tokens = 1; + } + self->done = 0; return (PyObject *)self; } +static int +_tokenizer_error(struct tok_state *tok) +{ + if (PyErr_Occurred()) { + return -1; + } + + const char *msg = NULL; + PyObject* errtype = PyExc_SyntaxError; + switch (tok->done) { + case E_TOKEN: + msg = "invalid token"; + break; + case E_EOF: + PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement"); + PyErr_SyntaxLocationObject(tok->filename, tok->lineno, + tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf)); + return -1; + case E_DEDENT: + msg = "unindent does not match any outer indentation level"; + errtype = PyExc_IndentationError; + break; + case E_INTR: + if (!PyErr_Occurred()) { + PyErr_SetNone(PyExc_KeyboardInterrupt); + } + return -1; + case E_NOMEM: + PyErr_NoMemory(); + return -1; + case E_TABSPACE: + errtype = PyExc_TabError; + msg = "inconsistent use of tabs and spaces in indentation"; + break; + case E_TOODEEP: + errtype = PyExc_IndentationError; + msg = "too many levels of indentation"; + break; + case E_LINECONT: { + msg = "unexpected character after line continuation character"; + break; + } + default: + msg = "unknown tokenization error"; + } + + PyObject* errstr = NULL; + PyObject* error_line = NULL; + PyObject* tmp = NULL; + PyObject* value = NULL; + int result = 0; + + Py_ssize_t size = tok->inp - tok->buf; + assert(tok->buf[size-1] == '\n'); + size -= 1; // Remove the newline character from the end of the line + error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace"); + if (!error_line) { + result = -1; + goto exit; + } + + Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf); + if (offset == -1) { + result = -1; + goto exit; + } + tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None); + if (!tmp) { + result = -1; + goto exit; + } + + errstr = PyUnicode_FromString(msg); + if (!errstr) { + result = -1; + goto exit; + } + + value = PyTuple_Pack(2, errstr, tmp); + if (!value) { + result = -1; + goto exit; + } + + PyErr_SetObject(errtype, value); + +exit: + Py_XDECREF(errstr); + Py_XDECREF(error_line); + Py_XDECREF(tmp); + Py_XDECREF(value); + return result; +} + static PyObject * tokenizeriter_next(tokenizeriterobject *it) { - const char *start; - const char *end; - int type = _PyTokenizer_Get(it->tok, &start, &end); - if (type == ERRORTOKEN && PyErr_Occurred()) { - return NULL; + PyObject* result = NULL; + struct token token; + _PyToken_Init(&token); + + int type = _PyTokenizer_Get(it->tok, &token); + if (type == ERRORTOKEN) { + if(!PyErr_Occurred()) { + _tokenizer_error(it->tok); + assert(PyErr_Occurred()); + } + goto exit; } - if (type == ERRORTOKEN || type == ENDMARKER) { + if (it->done || type == ERRORTOKEN) { PyErr_SetString(PyExc_StopIteration, "EOF"); - return NULL; + it->done = 1; + goto exit; } PyObject *str = NULL; - if (start == NULL || end == NULL) { + if (token.start == NULL || token.end == NULL) { str = PyUnicode_FromString(""); } else { - str = PyUnicode_FromStringAndSize(start, end - start); + str = PyUnicode_FromStringAndSize(token.start, token.end - token.start); } if (str == NULL) { - return NULL; + goto exit; } - Py_ssize_t size = it->tok->inp - it->tok->buf; - PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace"); + int is_trailing_token = 0; + if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) { + is_trailing_token = 1; + } + + const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start; + PyObject* line = NULL; + if (it->tok->tok_extra_tokens && is_trailing_token) { + line = PyUnicode_FromString(""); + } else { + Py_ssize_t size = it->tok->inp - line_start; + if (size >= 1 && it->tok->implicit_newline) { + size -= 1; + } + line = PyUnicode_DecodeUTF8(line_start, size, "replace"); + } if (line == NULL) { Py_DECREF(str); - return NULL; + goto exit; + } + + Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno; + Py_ssize_t end_lineno = it->tok->lineno; + Py_ssize_t col_offset = -1; + Py_ssize_t end_col_offset = -1; + if (token.start != NULL && token.start >= line_start) { + col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start); } - const char *line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start; - int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno; - int end_lineno = it->tok->lineno; - int col_offset = -1; - int end_col_offset = -1; - if (start != NULL && start >= line_start) { - col_offset = (int)(start - line_start); + if (token.end != NULL && token.end >= it->tok->line_start) { + end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start); } - if (end != NULL && end >= it->tok->line_start) { - end_col_offset = (int)(end - it->tok->line_start); + + if (it->tok->tok_extra_tokens) { + if (is_trailing_token) { + lineno = end_lineno = lineno + 1; + col_offset = end_col_offset = 0; + } + // Necessary adjustments to match the original Python tokenize + // implementation + if (type > DEDENT && type < OP) { + type = OP; + } + else if (type == ASYNC || type == AWAIT) { + type = NAME; + } + else if (type == NEWLINE) { + Py_DECREF(str); + if (!it->tok->implicit_newline) { + if (it->tok->start[0] == '\r') { + str = PyUnicode_FromString("\r\n"); + } else { + str = PyUnicode_FromString("\n"); + } + } + end_col_offset++; + } + else if (type == NL) { + if (it->tok->implicit_newline) { + Py_DECREF(str); + str = PyUnicode_FromString(""); + } + } + + if (str == NULL) { + Py_DECREF(line); + goto exit; + } } - return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); + result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line); +exit: + _PyToken_Free(&token); + if (type == ENDMARKER) { + it->done = 1; + } + return result; } static void @@ -151,6 +323,7 @@ static PyMethodDef tokenize_methods[] = { static PyModuleDef_Slot tokenizemodule_slots[] = { {Py_mod_exec, tokenizemodule_exec}, + {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, {0, NULL} }; |