Sync contrib/tools/python3 layout with upstream

* Move src/ subdir contents to the top of the layout * Rename self-written lib -> lib2 to avoid CaseFolding warning from the VCS * Regenerate contrib/libs/python proxy-headers accordingly 4ccc62ac1511abcf0fed14ccade38e984e088f1e
author: thegeorg <thegeorg@yandex-team.com> 2024-02-19 02:38:52 +0300
committer: thegeorg <thegeorg@yandex-team.com> 2024-02-19 02:50:43 +0300
commit: d96fa07134c06472bfee6718b5cfd1679196fc99 (patch)
tree: 31ec344fa9d3ff8dc038692516b6438dfbdb8a2d /contrib/tools/python3/Parser/tokenizer.h
parent: 452cf9e068aef7110e35e654c5d47eb80111ef89 (diff)
download: ydb-d96fa07134c06472bfee6718b5cfd1679196fc99.tar.gz
1 files changed, 154 insertions, 0 deletions
diff --git a/contrib/tools/python3/Parser/tokenizer.h b/contrib/tools/python3/Parser/tokenizer.h
new file mode 100644
index 0000000000..1e1daa3648
--- /dev/null
+++ b/contrib/tools/python3/Parser/tokenizer.h
@@ -0,0 +1,154 @@
+#ifndef Py_TOKENIZER_H
+#define Py_TOKENIZER_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "object.h"
+
+/* Tokenizer interface */
+
+#include "pycore_token.h" /* For token types */
+
+#define MAXINDENT 100       /* Max indentation level */
+#define MAXLEVEL 200        /* Max parentheses level */
+#define MAXFSTRINGLEVEL 150 /* Max f-string nesting level */
+
+enum decoding_state {
+    STATE_INIT,
+    STATE_SEEK_CODING,
+    STATE_NORMAL
+};
+
+enum interactive_underflow_t {
+    /* Normal mode of operation: return a new token when asked in interactive mode */
+    IUNDERFLOW_NORMAL,
+    /* Forcefully return ENDMARKER when asked for a new token in interactive mode. This
+     * can be used to prevent the tokenizer to prompt the user for new tokens */
+    IUNDERFLOW_STOP,
+};
+
+struct token {
+    int level;
+    int lineno, col_offset, end_lineno, end_col_offset;
+    const char *start, *end;
+    PyObject *metadata;
+};
+
+enum tokenizer_mode_kind_t {
+    TOK_REGULAR_MODE,
+    TOK_FSTRING_MODE,
+};
+
+#define MAX_EXPR_NESTING 3
+
+typedef struct _tokenizer_mode {
+    enum tokenizer_mode_kind_t kind;
+
+    int curly_bracket_depth;
+    int curly_bracket_expr_start_depth;
+
+    char f_string_quote;
+    int f_string_quote_size;
+    int f_string_raw;
+    const char* f_string_start;
+    const char* f_string_multi_line_start;
+    int f_string_line_start;
+
+    Py_ssize_t f_string_start_offset;
+    Py_ssize_t f_string_multi_line_start_offset;
+
+    Py_ssize_t last_expr_size;
+    Py_ssize_t last_expr_end;
+    char* last_expr_buffer;
+    int f_string_debug;
+} tokenizer_mode;
+
+/* Tokenizer state */
+struct tok_state {
+    /* Input state; buf <= cur <= inp <= end */
+    /* NB an entire line is held in the buffer */
+    char *buf;          /* Input buffer, or NULL; malloc'ed if fp != NULL or readline != NULL */
+    char *cur;          /* Next character in buffer */
+    char *inp;          /* End of data in buffer */
+    int fp_interactive; /* If the file descriptor is interactive */
+    char *interactive_src_start; /* The start of the source parsed so far in interactive mode */
+    char *interactive_src_end; /* The end of the source parsed so far in interactive mode */
+    const char *end;    /* End of input buffer if buf != NULL */
+    const char *start;  /* Start of current token if not NULL */
+    int done;           /* E_OK normally, E_EOF at EOF, otherwise error code */
+    /* NB If done != E_OK, cur must be == inp!!! */
+    FILE *fp;           /* Rest of input; NULL if tokenizing a string */
+    int tabsize;        /* Tab spacing */
+    int indent;         /* Current indentation index */
+    int indstack[MAXINDENT];            /* Stack of indents */
+    int atbol;          /* Nonzero if at begin of new line */
+    int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
+    const char *prompt, *nextprompt;          /* For interactive prompting */
+    int lineno;         /* Current line number */
+    int first_lineno;   /* First line of a single line or multi line string
+                           expression (cf. issue 16806) */
+    int starting_col_offset; /* The column offset at the beginning of a token */
+    int col_offset;     /* Current col offset */
+    int level;          /* () [] {} Parentheses nesting level */
+            /* Used to allow free continuations inside them */
+    char parenstack[MAXLEVEL];
+    int parenlinenostack[MAXLEVEL];
+    int parencolstack[MAXLEVEL];
+    PyObject *filename;
+    /* Stuff for checking on different tab sizes */
+    int altindstack[MAXINDENT];         /* Stack of alternate indents */
+    /* Stuff for PEP 0263 */
+    enum decoding_state decoding_state;
+    int decoding_erred;         /* whether erred in decoding  */
+    char *encoding;         /* Source encoding. */
+    int cont_line;          /* whether we are in a continuation line. */
+    const char* line_start;     /* pointer to start of current line */
+    const char* multi_line_start; /* pointer to start of first line of
+                                     a single line or multi line string
+                                     expression (cf. issue 16806) */
+    PyObject *decoding_readline; /* open(...).readline */
+    PyObject *decoding_buffer;
+    PyObject *readline;     /* readline() function */
+    const char* enc;        /* Encoding for the current str. */
+    char* str;          /* Source string being tokenized (if tokenizing from a string)*/
+    char* input;       /* Tokenizer's newline translated copy of the string. */
+
+    int type_comments;      /* Whether to look for type comments */
+
+    /* async/await related fields (still needed depending on feature_version) */
+    int async_hacks;     /* =1 if async/await aren't always keywords */
+    int async_def;        /* =1 if tokens are inside an 'async def' body. */
+    int async_def_indent; /* Indentation level of the outermost 'async def'. */
+    int async_def_nl;     /* =1 if the outermost 'async def' had at least one
+                             NEWLINE token after it. */
+    /* How to proceed when asked for a new token in interactive mode */
+    enum interactive_underflow_t interactive_underflow;
+    int report_warnings;
+    // TODO: Factor this into its own thing
+    tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
+    int tok_mode_stack_index;
+    int tok_extra_tokens;
+    int comment_newline;
+    int implicit_newline;
+#ifdef Py_DEBUG
+    int debug;
+#endif
+};
+
+extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
+extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
+                                              const char *, const char *);
+extern void _PyTokenizer_Free(struct tok_state *);
+extern void _PyToken_Free(struct token *);
+extern void _PyToken_Init(struct token *);
+extern int _PyTokenizer_Get(struct tok_state *, struct token *);
+
+#define tok_dump _Py_tok_dump
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_TOKENIZER_H */
author	thegeorg <thegeorg@yandex-team.com>	2024-02-19 02:38:52 +0300
committer	thegeorg <thegeorg@yandex-team.com>	2024-02-19 02:50:43 +0300
commit	d96fa07134c06472bfee6718b5cfd1679196fc99 (patch)
tree	31ec344fa9d3ff8dc038692516b6438dfbdb8a2d /contrib/tools/python3/Parser/tokenizer.h
parent	452cf9e068aef7110e35e654c5d47eb80111ef89 (diff)
download	ydb-d96fa07134c06472bfee6718b5cfd1679196fc99.tar.gz