diff options
author | AlexSm <alex@ydb.tech> | 2024-03-05 10:40:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-05 12:40:59 +0300 |
commit | 1ac13c847b5358faba44dbb638a828e24369467b (patch) | |
tree | 07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Parser/string_parser.c | |
parent | ffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff) | |
download | ydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz |
Library import 16 (#2433)
Co-authored-by: robot-piglet <robot-piglet@yandex-team.com>
Co-authored-by: deshevoy <deshevoy@yandex-team.com>
Co-authored-by: robot-contrib <robot-contrib@yandex-team.com>
Co-authored-by: thegeorg <thegeorg@yandex-team.com>
Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com>
Co-authored-by: svidyuk <svidyuk@yandex-team.com>
Co-authored-by: shadchin <shadchin@yandex-team.com>
Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com>
Co-authored-by: innokentii <innokentii@yandex-team.com>
Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com>
Co-authored-by: snermolaev <snermolaev@yandex-team.com>
Co-authored-by: dimdim11 <dimdim11@yandex-team.com>
Co-authored-by: kickbutt <kickbutt@yandex-team.com>
Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com>
Co-authored-by: korsunandrei <korsunandrei@yandex-team.com>
Co-authored-by: petrk <petrk@yandex-team.com>
Co-authored-by: miroslav2 <miroslav2@yandex-team.com>
Co-authored-by: serjflint <serjflint@yandex-team.com>
Co-authored-by: akhropov <akhropov@yandex-team.com>
Co-authored-by: prettyboy <prettyboy@yandex-team.com>
Co-authored-by: ilikepugs <ilikepugs@yandex-team.com>
Co-authored-by: hiddenpath <hiddenpath@yandex-team.com>
Co-authored-by: mikhnenko <mikhnenko@yandex-team.com>
Co-authored-by: spreis <spreis@yandex-team.com>
Co-authored-by: andreyshspb <andreyshspb@yandex-team.com>
Co-authored-by: dimaandreev <dimaandreev@yandex-team.com>
Co-authored-by: rashid <rashid@yandex-team.com>
Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com>
Co-authored-by: r-vetrov <r-vetrov@yandex-team.com>
Co-authored-by: ypodlesov <ypodlesov@yandex-team.com>
Co-authored-by: zaverden <zaverden@yandex-team.com>
Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com>
Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com>
Co-authored-by: v-korovin <v-korovin@yandex-team.com>
Co-authored-by: arikon <arikon@yandex-team.com>
Co-authored-by: khoden <khoden@yandex-team.com>
Co-authored-by: psydmm <psydmm@yandex-team.com>
Co-authored-by: robot-javacom <robot-javacom@yandex-team.com>
Co-authored-by: dtorilov <dtorilov@yandex-team.com>
Co-authored-by: sennikovmv <sennikovmv@yandex-team.com>
Co-authored-by: hcpp <hcpp@ydb.tech>
Diffstat (limited to 'contrib/tools/python3/Parser/string_parser.c')
-rw-r--r-- | contrib/tools/python3/Parser/string_parser.c | 274 |
1 files changed, 274 insertions, 0 deletions
diff --git a/contrib/tools/python3/Parser/string_parser.c b/contrib/tools/python3/Parser/string_parser.c new file mode 100644 index 0000000000..65c320c217 --- /dev/null +++ b/contrib/tools/python3/Parser/string_parser.c @@ -0,0 +1,274 @@ +#include <stdbool.h> + +#include <Python.h> + +#include "tokenizer.h" +#include "pegen.h" +#include "string_parser.h" + +//// STRING HANDLING FUNCTIONS //// + +static int +warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t) +{ + if (p->call_invalid_rules) { + // Do not report warnings if we are in the second pass of the parser + // to avoid showing the warning twice. + return 0; + } + unsigned char c = *first_invalid_escape; + if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) { // in this case the tokenizer has already emitted a warning, + // see tokenizer.c:warn_invalid_escape_sequence + return 0; + } + + int octal = ('4' <= c && c <= '7'); + PyObject *msg = + octal + ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'", + first_invalid_escape) + : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c); + if (msg == NULL) { + return -1; + } + PyObject *category; + if (p->feature_version >= 12) { + category = PyExc_SyntaxWarning; + } + else { + category = PyExc_DeprecationWarning; + } + if (PyErr_WarnExplicitObject(category, msg, p->tok->filename, + t->lineno, NULL, NULL) < 0) { + if (PyErr_ExceptionMatches(category)) { + /* Replace the Syntax/DeprecationWarning exception with a SyntaxError + to get a more accurate error report */ + PyErr_Clear(); + + /* This is needed, in order for the SyntaxError to point to the token t, + since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the + error location, if p->known_err_token is not set. */ + p->known_err_token = t; + if (octal) { + RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'", + first_invalid_escape); + } + else { + RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c); + } + } + Py_DECREF(msg); + return -1; + } + Py_DECREF(msg); + return 0; +} + +static PyObject * +decode_utf8(const char **sPtr, const char *end) +{ + const char *s; + const char *t; + t = s = *sPtr; + while (s < end && (*s & 0x80)) { + s++; + } + *sPtr = s; + return PyUnicode_DecodeUTF8(t, s - t, NULL); +} + +static PyObject * +decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) +{ + PyObject *v; + PyObject *u; + char *buf; + char *p; + const char *end; + + /* check for integer overflow */ + if (len > SIZE_MAX / 6) { + return NULL; + } + /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5 + "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */ + u = PyBytes_FromStringAndSize((char *)NULL, len * 6); + if (u == NULL) { + return NULL; + } + p = buf = PyBytes_AsString(u); + if (p == NULL) { + return NULL; + } + end = s + len; + while (s < end) { + if (*s == '\\') { + *p++ = *s++; + if (s >= end || *s & 0x80) { + strcpy(p, "u005c"); + p += 5; + if (s >= end) { + break; + } + } + } + if (*s & 0x80) { + PyObject *w; + int kind; + const void *data; + Py_ssize_t w_len; + Py_ssize_t i; + w = decode_utf8(&s, end); + if (w == NULL) { + Py_DECREF(u); + return NULL; + } + kind = PyUnicode_KIND(w); + data = PyUnicode_DATA(w); + w_len = PyUnicode_GET_LENGTH(w); + for (i = 0; i < w_len; i++) { + Py_UCS4 chr = PyUnicode_READ(kind, data, i); + sprintf(p, "\\U%08x", chr); + p += 10; + } + /* Should be impossible to overflow */ + assert(p - buf <= PyBytes_GET_SIZE(u)); + Py_DECREF(w); + } + else { + *p++ = *s++; + } + } + len = p - buf; + s = buf; + + const char *first_invalid_escape; + v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape); + + // HACK: later we can simply pass the line no, since we don't preserve the tokens + // when we are decoding the string but we preserve the line numbers. + if (v != NULL && first_invalid_escape != NULL && t != NULL) { + if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) { + /* We have not decref u before because first_invalid_escape points + inside u. */ + Py_XDECREF(u); + Py_DECREF(v); + return NULL; + } + } + Py_XDECREF(u); + return v; +} + +static PyObject * +decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) +{ + const char *first_invalid_escape; + PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape); + if (result == NULL) { + return NULL; + } + + if (first_invalid_escape != NULL) { + if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) { + Py_DECREF(result); + return NULL; + } + } + return result; +} + +PyObject * +_PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t) +{ + if (raw) { + return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); + } + return decode_unicode_with_escapes(p, s, len, t); +} + +/* s must include the bracketing quote characters, and r, b &/or f prefixes + (if any), and embedded escape sequences (if any). (f-strings are handled by the parser) + _PyPegen_parse_string parses it, and returns the decoded Python string object. */ +PyObject * +_PyPegen_parse_string(Parser *p, Token *t) +{ + const char *s = PyBytes_AsString(t->bytes); + if (s == NULL) { + return NULL; + } + + size_t len; + int quote = Py_CHARMASK(*s); + int bytesmode = 0; + int rawmode = 0; + + if (Py_ISALPHA(quote)) { + while (!bytesmode || !rawmode) { + if (quote == 'b' || quote == 'B') { + quote =(unsigned char)*++s; + bytesmode = 1; + } + else if (quote == 'u' || quote == 'U') { + quote = (unsigned char)*++s; + } + else if (quote == 'r' || quote == 'R') { + quote = (unsigned char)*++s; + rawmode = 1; + } + else { + break; + } + } + } + + if (quote != '\'' && quote != '\"') { + PyErr_BadInternalCall(); + return NULL; + } + /* Skip the leading quote char. */ + s++; + len = strlen(s); + if (len > INT_MAX) { + PyErr_SetString(PyExc_OverflowError, "string to parse is too long"); + return NULL; + } + if (s[--len] != quote) { + /* Last quote char must match the first. */ + PyErr_BadInternalCall(); + return NULL; + } + if (len >= 4 && s[0] == quote && s[1] == quote) { + /* A triple quoted string. We've already skipped one quote at + the start and one at the end of the string. Now skip the + two at the start. */ + s += 2; + len -= 2; + /* And check that the last two match. */ + if (s[--len] != quote || s[--len] != quote) { + PyErr_BadInternalCall(); + return NULL; + } + } + + /* Avoid invoking escape decoding routines if possible. */ + rawmode = rawmode || strchr(s, '\\') == NULL; + if (bytesmode) { + /* Disallow non-ASCII characters. */ + const char *ch; + for (ch = s; *ch; ch++) { + if (Py_CHARMASK(*ch) >= 0x80) { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + t, + "bytes can only contain ASCII " + "literal characters"); + return NULL; + } + } + if (rawmode) { + return PyBytes_FromStringAndSize(s, len); + } + return decode_bytes_with_escapes(p, s, len, t); + } + return _PyPegen_decode_string(p, rawmode, s, len, t); +} |