aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/tools/python3/src/Python/Python-tokenize.c
diff options
context:
space:
mode:
author shadchin <shadchin@yandex-team.com> 2024-02-12 07:53:52 +0300
committer shadchin <shadchin@yandex-team.com> 2024-02-12 08:07:36 +0300
commit ce1b7ca3171f9158180640c6a02a74b4afffedea (patch)
tree e47c1e8391b1b0128262c1e9b1e6ed4c8fff2348 /contrib/tools/python3/src/Python/Python-tokenize.c
parent 57350d96f030db90f220ce50ee591d5c5d403df7 (diff)
download ydb-ce1b7ca3171f9158180640c6a02a74b4afffedea.tar.gz
Update Python from 3.11.8 to 3.12.2
Diffstat (limited to 'contrib/tools/python3/src/Python/Python-tokenize.c')
-rw-r--r-- contrib/tools/python3/src/Python/Python-tokenize.c | 227
1 file changed, 200 insertions(+), 27 deletions(-)
diff --git a/contrib/tools/python3/src/Python/Python-tokenize.c b/contrib/tools/python3/src/Python/Python-tokenize.c
index 6acfc2a7cf..179f71aa1f 100644
--- a/contrib/tools/python3/src/Python/Python-tokenize.c
+++ b/contrib/tools/python3/src/Python/Python-tokenize.c
@@ -1,5 +1,8 @@
#include "Python.h"
+#include "errcode.h"
#include "../Parser/tokenizer.h"
+#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
+#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
static struct PyModuleDef _tokenizemodule;
@@ -15,6 +18,7 @@ get_tokenize_state(PyObject *module) {
#define _tokenize_get_state_by_type(type) \
get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))
+#include "pycore_runtime.h"
#include "clinic/Python-tokenize.c.h"
/*[clinic input]
@@ -26,18 +30,24 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t
typedef struct
{
PyObject_HEAD struct tok_state *tok;
+ int done;
} tokenizeriterobject;
/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
- source: str
+ readline: object
+ /
+ *
+ extra_tokens: bool
+ encoding: str(c_default="NULL") = 'utf-8'
[clinic start generated code]*/
static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source)
-/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+ int extra_tokens, const char *encoding)
+/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
{
tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
if (self == NULL) {
@@ -47,58 +57,220 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source)
if (filename == NULL) {
return NULL;
}
- self->tok = _PyTokenizer_FromUTF8(source, 1);
+ self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
if (self->tok == NULL) {
Py_DECREF(filename);
return NULL;
}
self->tok->filename = filename;
+ if (extra_tokens) {
+ self->tok->tok_extra_tokens = 1;
+ }
+ self->done = 0;
return (PyObject *)self;
}
+static int
+_tokenizer_error(struct tok_state *tok)
+{
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+
+ const char *msg = NULL;
+ PyObject* errtype = PyExc_SyntaxError;
+ switch (tok->done) {
+ case E_TOKEN:
+ msg = "invalid token";
+ break;
+ case E_EOF:
+ PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+ PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
+ tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
+ return -1;
+ case E_DEDENT:
+ msg = "unindent does not match any outer indentation level";
+ errtype = PyExc_IndentationError;
+ break;
+ case E_INTR:
+ if (!PyErr_Occurred()) {
+ PyErr_SetNone(PyExc_KeyboardInterrupt);
+ }
+ return -1;
+ case E_NOMEM:
+ PyErr_NoMemory();
+ return -1;
+ case E_TABSPACE:
+ errtype = PyExc_TabError;
+ msg = "inconsistent use of tabs and spaces in indentation";
+ break;
+ case E_TOODEEP:
+ errtype = PyExc_IndentationError;
+ msg = "too many levels of indentation";
+ break;
+ case E_LINECONT: {
+ msg = "unexpected character after line continuation character";
+ break;
+ }
+ default:
+ msg = "unknown tokenization error";
+ }
+
+ PyObject* errstr = NULL;
+ PyObject* error_line = NULL;
+ PyObject* tmp = NULL;
+ PyObject* value = NULL;
+ int result = 0;
+
+ Py_ssize_t size = tok->inp - tok->buf;
+ assert(tok->buf[size-1] == '\n');
+ size -= 1; // Remove the newline character from the end of the line
+ error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
+ if (!error_line) {
+ result = -1;
+ goto exit;
+ }
+
+ Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
+ if (offset == -1) {
+ result = -1;
+ goto exit;
+ }
+ tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
+ if (!tmp) {
+ result = -1;
+ goto exit;
+ }
+
+ errstr = PyUnicode_FromString(msg);
+ if (!errstr) {
+ result = -1;
+ goto exit;
+ }
+
+ value = PyTuple_Pack(2, errstr, tmp);
+ if (!value) {
+ result = -1;
+ goto exit;
+ }
+
+ PyErr_SetObject(errtype, value);
+
+exit:
+ Py_XDECREF(errstr);
+ Py_XDECREF(error_line);
+ Py_XDECREF(tmp);
+ Py_XDECREF(value);
+ return result;
+}
+
static PyObject *
tokenizeriter_next(tokenizeriterobject *it)
{
- const char *start;
- const char *end;
- int type = _PyTokenizer_Get(it->tok, &start, &end);
- if (type == ERRORTOKEN && PyErr_Occurred()) {
- return NULL;
+ PyObject* result = NULL;
+ struct token token;
+ _PyToken_Init(&token);
+
+ int type = _PyTokenizer_Get(it->tok, &token);
+ if (type == ERRORTOKEN) {
+ if(!PyErr_Occurred()) {
+ _tokenizer_error(it->tok);
+ assert(PyErr_Occurred());
+ }
+ goto exit;
}
- if (type == ERRORTOKEN || type == ENDMARKER) {
+ if (it->done || type == ERRORTOKEN) {
PyErr_SetString(PyExc_StopIteration, "EOF");
- return NULL;
+ it->done = 1;
+ goto exit;
}
PyObject *str = NULL;
- if (start == NULL || end == NULL) {
+ if (token.start == NULL || token.end == NULL) {
str = PyUnicode_FromString("");
}
else {
- str = PyUnicode_FromStringAndSize(start, end - start);
+ str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
}
if (str == NULL) {
- return NULL;
+ goto exit;
}
- Py_ssize_t size = it->tok->inp - it->tok->buf;
- PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+ int is_trailing_token = 0;
+ if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+ is_trailing_token = 1;
+ }
+
+ const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+ PyObject* line = NULL;
+ if (it->tok->tok_extra_tokens && is_trailing_token) {
+ line = PyUnicode_FromString("");
+ } else {
+ Py_ssize_t size = it->tok->inp - line_start;
+ if (size >= 1 && it->tok->implicit_newline) {
+ size -= 1;
+ }
+ line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+ }
if (line == NULL) {
Py_DECREF(str);
- return NULL;
+ goto exit;
+ }
+
+ Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
+ Py_ssize_t end_lineno = it->tok->lineno;
+ Py_ssize_t col_offset = -1;
+ Py_ssize_t end_col_offset = -1;
+ if (token.start != NULL && token.start >= line_start) {
+ col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
}
- const char *line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
- int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
- int end_lineno = it->tok->lineno;
- int col_offset = -1;
- int end_col_offset = -1;
- if (start != NULL && start >= line_start) {
- col_offset = (int)(start - line_start);
+ if (token.end != NULL && token.end >= it->tok->line_start) {
+ end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
}
- if (end != NULL && end >= it->tok->line_start) {
- end_col_offset = (int)(end - it->tok->line_start);
+
+ if (it->tok->tok_extra_tokens) {
+ if (is_trailing_token) {
+ lineno = end_lineno = lineno + 1;
+ col_offset = end_col_offset = 0;
+ }
+ // Necessary adjustments to match the original Python tokenize
+ // implementation
+ if (type > DEDENT && type < OP) {
+ type = OP;
+ }
+ else if (type == ASYNC || type == AWAIT) {
+ type = NAME;
+ }
+ else if (type == NEWLINE) {
+ Py_DECREF(str);
+ if (!it->tok->implicit_newline) {
+ if (it->tok->start[0] == '\r') {
+ str = PyUnicode_FromString("\r\n");
+ } else {
+ str = PyUnicode_FromString("\n");
+ }
+ }
+ end_col_offset++;
+ }
+ else if (type == NL) {
+ if (it->tok->implicit_newline) {
+ Py_DECREF(str);
+ str = PyUnicode_FromString("");
+ }
+ }
+
+ if (str == NULL) {
+ Py_DECREF(line);
+ goto exit;
+ }
}
- return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+ result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
+exit:
+ _PyToken_Free(&token);
+ if (type == ENDMARKER) {
+ it->done = 1;
+ }
+ return result;
}
static void
@@ -151,6 +323,7 @@ static PyMethodDef tokenize_methods[] = {
static PyModuleDef_Slot tokenizemodule_slots[] = {
{Py_mod_exec, tokenizemodule_exec},
+ {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
{0, NULL}
};