Diffstat (limited to 'contrib/tools/python3/Python/Python-tokenize.c')
-rw-r--r--  contrib/tools/python3/Python/Python-tokenize.c | 367
1 file changed, 367 insertions(+), 0 deletions(-)
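The new file implements the _tokenize extension module: a TokenizerIter iterator type that wraps the C tokenizer from Parser/tokenizer.h and yields the token tuples consumed by Lib/tokenize.py.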
diff --git a/contrib/tools/python3/Python/Python-tokenize.c b/contrib/tools/python3/Python/Python-tokenize.c
new file mode 100644
index 0000000000..179f71aa1f
--- /dev/null
+++ b/contrib/tools/python3/Python/Python-tokenize.c
@@ -0,0 +1,367 @@
+#include "Python.h"
+#include "errcode.h"
+#include "../Parser/tokenizer.h"
+#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
+
+static struct PyModuleDef _tokenizemodule;
+
+typedef struct {
+ PyTypeObject *TokenizerIter;
+} tokenize_state;
+
+static tokenize_state *
+get_tokenize_state(PyObject *module) {
+ return (tokenize_state *)PyModule_GetState(module);
+}
+
+#define _tokenize_get_state_by_type(type) \
+ get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))
+
+#include "pycore_runtime.h"
+#include "clinic/Python-tokenize.c.h"
+
+/*[clinic input]
+module _tokenizer
+class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
+
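+/* Per-instance state: the C tokenizer handle plus a flag recording that
+   ENDMARKER was already produced, so the next call raises StopIteration. */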
+typedef struct
+{
+    PyObject_HEAD
+    struct tok_state *tok;
+    int done;
+} tokenizeriterobject;
+
+/*[clinic input]
+@classmethod
+_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
+
+ readline: object
+ /
+ *
+ extra_tokens: bool
+ encoding: str(c_default="NULL") = 'utf-8'
+[clinic start generated code]*/
+
+static PyObject *
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+ int extra_tokens, const char *encoding)
+/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
+{
+ tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
+ if (self == NULL) {
+ return NULL;
+ }
+    PyObject *filename = PyUnicode_FromString("<string>");
+    if (filename == NULL) {
+        Py_DECREF(self);
+        return NULL;
+    }
+    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
+    if (self->tok == NULL) {
+        Py_DECREF(filename);
+        Py_DECREF(self);
+        return NULL;
+    }
+ self->tok->filename = filename;
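+    /* extra_tokens asks the C tokenizer to emit the extra compatibility
+       tokens that Lib/tokenize.py expects; see the adjustments in
+       tokenizeriter_next below. */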
+ if (extra_tokens) {
+ self->tok->tok_extra_tokens = 1;
+ }
+ self->done = 0;
+ return (PyObject *)self;
+}
+
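+/* Map the tokenizer's error code (tok->done) to a Python exception
+   (SyntaxError, IndentationError or TabError) with location information
+   built from the current line buffer. An exception is always set when
+   this returns. */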
+static int
+_tokenizer_error(struct tok_state *tok)
+{
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+
+ const char *msg = NULL;
+ PyObject* errtype = PyExc_SyntaxError;
+ switch (tok->done) {
+ case E_TOKEN:
+ msg = "invalid token";
+ break;
+ case E_EOF:
+ PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+ PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
+ tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
+ return -1;
+ case E_DEDENT:
+ msg = "unindent does not match any outer indentation level";
+ errtype = PyExc_IndentationError;
+ break;
+ case E_INTR:
+ if (!PyErr_Occurred()) {
+ PyErr_SetNone(PyExc_KeyboardInterrupt);
+ }
+ return -1;
+ case E_NOMEM:
+ PyErr_NoMemory();
+ return -1;
+ case E_TABSPACE:
+ errtype = PyExc_TabError;
+ msg = "inconsistent use of tabs and spaces in indentation";
+ break;
+ case E_TOODEEP:
+ errtype = PyExc_IndentationError;
+ msg = "too many levels of indentation";
+ break;
+ case E_LINECONT: {
+ msg = "unexpected character after line continuation character";
+ break;
+ }
+ default:
+ msg = "unknown tokenization error";
+ }
+
+ PyObject* errstr = NULL;
+ PyObject* error_line = NULL;
+ PyObject* tmp = NULL;
+ PyObject* value = NULL;
+ int result = 0;
+
+ Py_ssize_t size = tok->inp - tok->buf;
+ assert(tok->buf[size-1] == '\n');
+ size -= 1; // Remove the newline character from the end of the line
+ error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
+ if (!error_line) {
+ result = -1;
+ goto exit;
+ }
+
+ Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
+ if (offset == -1) {
+ result = -1;
+ goto exit;
+ }
+ tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
+ if (!tmp) {
+ result = -1;
+ goto exit;
+ }
+
+ errstr = PyUnicode_FromString(msg);
+ if (!errstr) {
+ result = -1;
+ goto exit;
+ }
+
+ value = PyTuple_Pack(2, errstr, tmp);
+ if (!value) {
+ result = -1;
+ goto exit;
+ }
+
+ PyErr_SetObject(errtype, value);
+
+exit:
+ Py_XDECREF(errstr);
+ Py_XDECREF(error_line);
+ Py_XDECREF(tmp);
+ Py_XDECREF(value);
+ return result;
+}
+
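+/* tp_iternext: pull one token from the C tokenizer and package it as the
+   (type, string, (start_line, start_col), (end_line, end_col), line)
+   tuple consumed by Lib/tokenize.py. */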
+static PyObject *
+tokenizeriter_next(tokenizeriterobject *it)
+{
+ PyObject* result = NULL;
+ struct token token;
+ _PyToken_Init(&token);
+
+ int type = _PyTokenizer_Get(it->tok, &token);
+ if (type == ERRORTOKEN) {
+        if (!PyErr_Occurred()) {
+ _tokenizer_error(it->tok);
+ assert(PyErr_Occurred());
+ }
+ goto exit;
+ }
+ if (it->done || type == ERRORTOKEN) {
+ PyErr_SetString(PyExc_StopIteration, "EOF");
+ it->done = 1;
+ goto exit;
+ }
+ PyObject *str = NULL;
+ if (token.start == NULL || token.end == NULL) {
+ str = PyUnicode_FromString("");
+ }
+ else {
+ str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
+ }
+ if (str == NULL) {
+ goto exit;
+ }
+
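+    /* ENDMARKER, and the DEDENTs emitted once input is exhausted, have no
+       backing source text; mark them so positions can be synthesized below. */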
+ int is_trailing_token = 0;
+ if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+ is_trailing_token = 1;
+ }
+
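+    /* For string literals, the reported line begins where the literal
+       started, so multi-line strings carry their complete source segment. */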
+ const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+ PyObject* line = NULL;
+ if (it->tok->tok_extra_tokens && is_trailing_token) {
+ line = PyUnicode_FromString("");
+ } else {
+ Py_ssize_t size = it->tok->inp - line_start;
+ if (size >= 1 && it->tok->implicit_newline) {
+ size -= 1;
+ }
+ line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+ }
+ if (line == NULL) {
+ Py_DECREF(str);
+ goto exit;
+ }
+
+ Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
+ Py_ssize_t end_lineno = it->tok->lineno;
+ Py_ssize_t col_offset = -1;
+ Py_ssize_t end_col_offset = -1;
+ if (token.start != NULL && token.start >= line_start) {
+ col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
+ }
+ if (token.end != NULL && token.end >= it->tok->line_start) {
+ end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
+ }
+
+ if (it->tok->tok_extra_tokens) {
+ if (is_trailing_token) {
+ lineno = end_lineno = lineno + 1;
+ col_offset = end_col_offset = 0;
+ }
+ // Necessary adjustments to match the original Python tokenize
+ // implementation
+ if (type > DEDENT && type < OP) {
+ type = OP;
+ }
+ else if (type == ASYNC || type == AWAIT) {
+ type = NAME;
+ }
+ else if (type == NEWLINE) {
+ Py_DECREF(str);
+ if (!it->tok->implicit_newline) {
+ if (it->tok->start[0] == '\r') {
+ str = PyUnicode_FromString("\r\n");
+ } else {
+ str = PyUnicode_FromString("\n");
+ }
+ }
+ end_col_offset++;
+ }
+ else if (type == NL) {
+ if (it->tok->implicit_newline) {
+ Py_DECREF(str);
+ str = PyUnicode_FromString("");
+ }
+ }
+
+ if (str == NULL) {
+ Py_DECREF(line);
+ goto exit;
+ }
+ }
+
+ result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
+exit:
+ _PyToken_Free(&token);
+ if (type == ENDMARKER) {
+ it->done = 1;
+ }
+ return result;
+}
+
+static void
+tokenizeriter_dealloc(tokenizeriterobject *it)
+{
+ PyTypeObject *tp = Py_TYPE(it);
+    /* tok may be NULL if __new__ failed part-way through. */
+    if (it->tok != NULL) {
+        _PyTokenizer_Free(it->tok);
+    }
+ tp->tp_free(it);
+ Py_DECREF(tp);
+}
+
+static PyType_Slot tokenizeriter_slots[] = {
+ {Py_tp_new, tokenizeriter_new},
+ {Py_tp_dealloc, tokenizeriter_dealloc},
+ {Py_tp_getattro, PyObject_GenericGetAttr},
+ {Py_tp_iter, PyObject_SelfIter},
+ {Py_tp_iternext, tokenizeriter_next},
+ {0, NULL},
+};
+
+static PyType_Spec tokenizeriter_spec = {
+ .name = "_tokenize.TokenizerIter",
+ .basicsize = sizeof(tokenizeriterobject),
+ .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
+ .slots = tokenizeriter_slots,
+};
+
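+/* Module exec slot: create the heap type from the spec and publish it as
+   _tokenize.TokenizerIter, keeping a reference in the module state. */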
+static int
+tokenizemodule_exec(PyObject *m)
+{
+ tokenize_state *state = get_tokenize_state(m);
+ if (state == NULL) {
+ return -1;
+ }
+
+ state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
+ if (state->TokenizerIter == NULL) {
+ return -1;
+ }
+ if (PyModule_AddType(m, state->TokenizerIter) < 0) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static PyMethodDef tokenize_methods[] = {
+ {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+static PyModuleDef_Slot tokenizemodule_slots[] = {
+ {Py_mod_exec, tokenizemodule_exec},
+ {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
+ {0, NULL}
+};
+
+static int
+tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
+{
+ tokenize_state *state = get_tokenize_state(m);
+ Py_VISIT(state->TokenizerIter);
+ return 0;
+}
+
+static int
+tokenizemodule_clear(PyObject *m)
+{
+ tokenize_state *state = get_tokenize_state(m);
+ Py_CLEAR(state->TokenizerIter);
+ return 0;
+}
+
+static void
+tokenizemodule_free(void *m)
+{
+ tokenizemodule_clear((PyObject *)m);
+}
+
+static struct PyModuleDef _tokenizemodule = {
+ PyModuleDef_HEAD_INIT,
+ .m_name = "_tokenize",
+ .m_size = sizeof(tokenize_state),
+ .m_slots = tokenizemodule_slots,
+ .m_methods = tokenize_methods,
+ .m_traverse = tokenizemodule_traverse,
+ .m_clear = tokenizemodule_clear,
+ .m_free = tokenizemodule_free,
+};
+
+PyMODINIT_FUNC
+PyInit__tokenize(void)
+{
+ return PyModuleDef_Init(&_tokenizemodule);
+}
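
For reference, here is a minimal sketch of driving the iterator directly from Python. It assumes an interpreter in which this _tokenize module is built (in CPython 3.12+ it is the C backend behind Lib/tokenize.py); the tuple layout follows the Py_BuildValue("(iN(nn)(nn)N)", ...) call in tokenizeriter_next above.

    import io
    from _tokenize import TokenizerIter

    # readline is any callable returning one line of source per call; with
    # no encoding argument it must return str rather than bytes.
    source = io.StringIO("x = 1\n")
    for tok_type, string, start, end, line in TokenizerIter(source.readline,
                                                            extra_tokens=True):
        print(tok_type, repr(string), start, end, repr(line))

Note that extra_tokens is a required keyword-only argument, matching the clinic declaration above.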