summary refs log tree commit diff stats
path: root/contrib/tools/python3/Parser/tokenizer/string_tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/tools/python3/Parser/tokenizer/string_tokenizer.c')
-rw-r--r-- contrib/tools/python3/Parser/tokenizer/string_tokenizer.c 142
1 files changed, 142 insertions, 0 deletions
diff --git a/contrib/tools/python3/Parser/tokenizer/string_tokenizer.c b/contrib/tools/python3/Parser/tokenizer/string_tokenizer.c
new file mode 100644
index 00000000000..560cb37e518
--- /dev/null
+++ b/contrib/tools/python3/Parser/tokenizer/string_tokenizer.c
@@ -0,0 +1,142 @@
+#include "Python.h"
+#include "errcode.h"
+
+#include "helpers.h"
+#include "../lexer/state.h"
+
+/* Underflow handler for the string-based tokenizer: make the next line of
+   the in-memory input available to TOK.
+
+   The line runs from tok->inp up to and including the next '\n', or up to
+   the terminating NUL when no newline is left.
+
+   Returns 1 after advancing tok->inp past that line.  Returns 0 and sets
+   tok->done to E_EOF when nothing remains to be read. */
+static int
+tok_underflow_string(struct tok_state *tok) {
+    char *line_end;
+    char *newline = strchr(tok->inp, '\n');
+
+    if (newline == NULL) {
+        /* No newline left: whatever precedes the NUL is the final line. */
+        line_end = strchr(tok->inp, '\0');
+        if (line_end == tok->inp) {
+            tok->done = E_EOF;
+            return 0;
+        }
+    }
+    else {
+        /* The newline itself belongs to the line being handed out. */
+        line_end = newline + 1;
+    }
+
+    if (!tok->start) {
+        /* No token start is recorded, so the buffer start can move
+           up to the current position. */
+        tok->buf = tok->cur;
+    }
+    tok->line_start = tok->cur;
+    ADVANCE_LINENO();
+    tok->inp = line_end;
+    return 1;
+}
+
+/* Read one byte from TOK's string buffer (tok->str) and advance the read
+   pointer past it.  The byte is returned masked with Py_CHARMASK. */
+static int
+buf_getc(struct tok_state *tok) {
+    int byte = Py_CHARMASK(*tok->str);
+    tok->str++;
+    return byte;
+}
+
+/* Undo one buf_getc() on TOK by stepping the read pointer (tok->str) back
+   by one byte.  C is the byte being returned; it is only compared against
+   the buffer contents, never written to them. */
+static void
+buf_ungetc(int c, struct tok_state *tok) {
+    --tok->str;
+    /* Check only, no store: tok->cur may point to read-only segment. */
+    assert(c == Py_CHARMASK(*tok->str));
+}
+
+/* "Set readline" hook used by the string-based tokenizer.  The only action
+   is to remember the encoding name ENC in tok->enc.  Always returns 1. */
+static int
+buf_setreadl(struct tok_state *tok, const char *enc) {
+    tok->enc = enc;
+    return 1;
+}
+
+/* Decode a byte string INPUT for use as the buffer of TOK.
+   Look for a BOM and for encoding declarations on the first two lines of
+   the string, and record them inside TOK.
+
+   INPUT, SINGLE and PRESERVE_CRLF are forwarded to
+   _PyTokenizer_translate_newlines(); the resulting copy is stored in
+   tok->input.  If the text has to be re-encoded to UTF-8, the newline
+   translation is redone on the re-encoded text and tok->input is replaced.
+
+   Returns the resulting string (also stored in tok->str), or NULL on
+   failure.  On every failure path the temporary UTF-8 bytes object is
+   released before returning. */
+static char *
+decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
+{
+    PyObject* utf8 = NULL;      /* owned reference to re-encoded text, if any */
+    char *str;
+    const char *s;
+    const char *newl[2] = {NULL, NULL};  /* positions of the first two '\n' */
+    int lineno = 0;
+    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
+    if (str == NULL)
+        return NULL;
+    tok->enc = NULL;
+    tok->str = str;
+    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
+        return _PyTokenizer_error_ret(tok);
+    str = tok->str;             /* string after BOM if any */
+    assert(str);
+    /* tok->enc was reset above, so a non-NULL value here was recorded
+       during the BOM check (presumably through buf_setreadl). */
+    if (tok->enc != NULL) {
+        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
+        if (utf8 == NULL)
+            return _PyTokenizer_error_ret(tok);
+        str = PyBytes_AsString(utf8);
+    }
+    /* Locate the ends of the first two lines; stop at the NUL terminator. */
+    for (s = str;; s++) {
+        if (*s == '\0') break;
+        else if (*s == '\n') {
+            assert(lineno < 2);
+            newl[lineno] = s;
+            lineno++;
+            if (lineno == 2) break;
+        }
+    }
+    tok->enc = NULL;
+    /* need to check line 1 and 2 separately since check_coding_spec
+       assumes a single line as input */
+    if (newl[0]) {
+        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
+            /* utf8 may hold a reference from the BOM branch above;
+               drop it so this early exit does not leak. */
+            Py_XDECREF(utf8);
+            return NULL;
+        }
+        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
+            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
+                                                tok, buf_setreadl)) {
+                Py_XDECREF(utf8);
+                return NULL;
+            }
+        }
+    }
+    /* A coding declaration was found: re-encode the text to UTF-8. */
+    if (tok->enc != NULL) {
+        assert(utf8 == NULL);
+        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
+        if (utf8 == NULL)
+            return _PyTokenizer_error_ret(tok);
+        str = PyBytes_AS_STRING(utf8);
+    }
+    if (utf8 != NULL) {
+        /* The text was re-encoded, so redo the newline translation on the
+           re-encoded bytes and replace the buffer owned by tok->input. */
+        char *translated = _PyTokenizer_translate_newlines(
+                str, single, preserve_crlf, tok);
+        if (translated == NULL) {
+            Py_DECREF(utf8);
+            return _PyTokenizer_error_ret(tok);
+        }
+        PyMem_Free(tok->input);
+        tok->input = translated;
+        str = translated;
+        /* str no longer points into the bytes object; release it. */
+        Py_CLEAR(utf8);
+    }
+    tok->str = str;
+    assert(tok->decoding_buffer == NULL);
+    /* utf8 is always NULL at this point (cleared above whenever it was
+       set), so this stores NULL. */
+    tok->decoding_buffer = utf8; /* CAUTION */
+    return str;
+}
+
+/* Create a tokenizer state that reads from the in-memory string STR.
+
+   STR, EXEC_INPUT and PRESERVE_CRLF are handed to decode_str(), which
+   produces the buffer the tokenizer will scan.  The buffer pointers (buf,
+   cur, inp, end) all start at the beginning of that buffer, and
+   tok_underflow_string is installed as the underflow handler.
+
+   Returns the new tokenizer state, or NULL if it could not be allocated or
+   the input could not be decoded (the state is freed in that case). */
+struct tok_state *
+_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
+{
+    struct tok_state *tok = _PyTokenizer_tok_new();
+    if (tok == NULL) {
+        return NULL;
+    }
+
+    char *decoded = decode_str(str, exec_input, tok, preserve_crlf);
+    if (decoded == NULL) {
+        _PyTokenizer_Free(tok);
+        return NULL;
+    }
+
+    tok->inp = decoded;
+    tok->cur = decoded;
+    tok->buf = decoded;
+    tok->end = decoded;
+    tok->underflow = tok_underflow_string;
+    return tok;
+}