diff options
| author | AlexSm <[email protected]> | 2024-03-05 10:40:59 +0100 |
|---|---|---|
| committer | GitHub <[email protected]> | 2024-03-05 12:40:59 +0300 |
| commit | 1ac13c847b5358faba44dbb638a828e24369467b (patch) | |
| tree | 07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/src/Parser/string_parser.c | |
| parent | ffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff) | |
Library import 16 (#2433)
Co-authored-by: robot-piglet <[email protected]>
Co-authored-by: deshevoy <[email protected]>
Co-authored-by: robot-contrib <[email protected]>
Co-authored-by: thegeorg <[email protected]>
Co-authored-by: robot-ya-builder <[email protected]>
Co-authored-by: svidyuk <[email protected]>
Co-authored-by: shadchin <[email protected]>
Co-authored-by: robot-ratatosk <[email protected]>
Co-authored-by: innokentii <[email protected]>
Co-authored-by: arkady-e1ppa <[email protected]>
Co-authored-by: snermolaev <[email protected]>
Co-authored-by: dimdim11 <[email protected]>
Co-authored-by: kickbutt <[email protected]>
Co-authored-by: abdullinsaid <[email protected]>
Co-authored-by: korsunandrei <[email protected]>
Co-authored-by: petrk <[email protected]>
Co-authored-by: miroslav2 <[email protected]>
Co-authored-by: serjflint <[email protected]>
Co-authored-by: akhropov <[email protected]>
Co-authored-by: prettyboy <[email protected]>
Co-authored-by: ilikepugs <[email protected]>
Co-authored-by: hiddenpath <[email protected]>
Co-authored-by: mikhnenko <[email protected]>
Co-authored-by: spreis <[email protected]>
Co-authored-by: andreyshspb <[email protected]>
Co-authored-by: dimaandreev <[email protected]>
Co-authored-by: rashid <[email protected]>
Co-authored-by: robot-ydb-importer <[email protected]>
Co-authored-by: r-vetrov <[email protected]>
Co-authored-by: ypodlesov <[email protected]>
Co-authored-by: zaverden <[email protected]>
Co-authored-by: vpozdyayev <[email protected]>
Co-authored-by: robot-cozmo <[email protected]>
Co-authored-by: v-korovin <[email protected]>
Co-authored-by: arikon <[email protected]>
Co-authored-by: khoden <[email protected]>
Co-authored-by: psydmm <[email protected]>
Co-authored-by: robot-javacom <[email protected]>
Co-authored-by: dtorilov <[email protected]>
Co-authored-by: sennikovmv <[email protected]>
Co-authored-by: hcpp <[email protected]>
Diffstat (limited to 'contrib/tools/python3/src/Parser/string_parser.c')
| -rw-r--r-- | contrib/tools/python3/src/Parser/string_parser.c | 274 |
1 files changed, 0 insertions, 274 deletions
diff --git a/contrib/tools/python3/src/Parser/string_parser.c b/contrib/tools/python3/src/Parser/string_parser.c deleted file mode 100644 index 65c320c2173..00000000000 --- a/contrib/tools/python3/src/Parser/string_parser.c +++ /dev/null @@ -1,274 +0,0 @@ -#include <stdbool.h> - -#include <Python.h> - -#include "tokenizer.h" -#include "pegen.h" -#include "string_parser.h" - -//// STRING HANDLING FUNCTIONS //// - -static int -warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t) -{ - if (p->call_invalid_rules) { - // Do not report warnings if we are in the second pass of the parser - // to avoid showing the warning twice. - return 0; - } - unsigned char c = *first_invalid_escape; - if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) { // in this case the tokenizer has already emitted a warning, - // see tokenizer.c:warn_invalid_escape_sequence - return 0; - } - - int octal = ('4' <= c && c <= '7'); - PyObject *msg = - octal - ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'", - first_invalid_escape) - : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c); - if (msg == NULL) { - return -1; - } - PyObject *category; - if (p->feature_version >= 12) { - category = PyExc_SyntaxWarning; - } - else { - category = PyExc_DeprecationWarning; - } - if (PyErr_WarnExplicitObject(category, msg, p->tok->filename, - t->lineno, NULL, NULL) < 0) { - if (PyErr_ExceptionMatches(category)) { - /* Replace the Syntax/DeprecationWarning exception with a SyntaxError - to get a more accurate error report */ - PyErr_Clear(); - - /* This is needed, in order for the SyntaxError to point to the token t, - since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the - error location, if p->known_err_token is not set. */ - p->known_err_token = t; - if (octal) { - RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'", - first_invalid_escape); - } - else { - RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c); - } - } - Py_DECREF(msg); - return -1; - } - Py_DECREF(msg); - return 0; -} - -static PyObject * -decode_utf8(const char **sPtr, const char *end) -{ - const char *s; - const char *t; - t = s = *sPtr; - while (s < end && (*s & 0x80)) { - s++; - } - *sPtr = s; - return PyUnicode_DecodeUTF8(t, s - t, NULL); -} - -static PyObject * -decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) -{ - PyObject *v; - PyObject *u; - char *buf; - char *p; - const char *end; - - /* check for integer overflow */ - if (len > SIZE_MAX / 6) { - return NULL; - } - /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5 - "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */ - u = PyBytes_FromStringAndSize((char *)NULL, len * 6); - if (u == NULL) { - return NULL; - } - p = buf = PyBytes_AsString(u); - if (p == NULL) { - return NULL; - } - end = s + len; - while (s < end) { - if (*s == '\\') { - *p++ = *s++; - if (s >= end || *s & 0x80) { - strcpy(p, "u005c"); - p += 5; - if (s >= end) { - break; - } - } - } - if (*s & 0x80) { - PyObject *w; - int kind; - const void *data; - Py_ssize_t w_len; - Py_ssize_t i; - w = decode_utf8(&s, end); - if (w == NULL) { - Py_DECREF(u); - return NULL; - } - kind = PyUnicode_KIND(w); - data = PyUnicode_DATA(w); - w_len = PyUnicode_GET_LENGTH(w); - for (i = 0; i < w_len; i++) { - Py_UCS4 chr = PyUnicode_READ(kind, data, i); - sprintf(p, "\\U%08x", chr); - p += 10; - } - /* Should be impossible to overflow */ - assert(p - buf <= PyBytes_GET_SIZE(u)); - Py_DECREF(w); - } - else { - *p++ = *s++; - } - } - len = p - buf; - s = buf; - - const char *first_invalid_escape; - v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape); - - // HACK: later we can simply pass the line no, since we don't preserve the tokens - // when we are decoding the string but we preserve the line numbers. - if (v != NULL && first_invalid_escape != NULL && t != NULL) { - if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) { - /* We have not decref u before because first_invalid_escape points - inside u. */ - Py_XDECREF(u); - Py_DECREF(v); - return NULL; - } - } - Py_XDECREF(u); - return v; -} - -static PyObject * -decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) -{ - const char *first_invalid_escape; - PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape); - if (result == NULL) { - return NULL; - } - - if (first_invalid_escape != NULL) { - if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) { - Py_DECREF(result); - return NULL; - } - } - return result; -} - -PyObject * -_PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t) -{ - if (raw) { - return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); - } - return decode_unicode_with_escapes(p, s, len, t); -} - -/* s must include the bracketing quote characters, and r, b &/or f prefixes - (if any), and embedded escape sequences (if any). (f-strings are handled by the parser) - _PyPegen_parse_string parses it, and returns the decoded Python string object. */ -PyObject * -_PyPegen_parse_string(Parser *p, Token *t) -{ - const char *s = PyBytes_AsString(t->bytes); - if (s == NULL) { - return NULL; - } - - size_t len; - int quote = Py_CHARMASK(*s); - int bytesmode = 0; - int rawmode = 0; - - if (Py_ISALPHA(quote)) { - while (!bytesmode || !rawmode) { - if (quote == 'b' || quote == 'B') { - quote =(unsigned char)*++s; - bytesmode = 1; - } - else if (quote == 'u' || quote == 'U') { - quote = (unsigned char)*++s; - } - else if (quote == 'r' || quote == 'R') { - quote = (unsigned char)*++s; - rawmode = 1; - } - else { - break; - } - } - } - - if (quote != '\'' && quote != '\"') { - PyErr_BadInternalCall(); - return NULL; - } - /* Skip the leading quote char. */ - s++; - len = strlen(s); - if (len > INT_MAX) { - PyErr_SetString(PyExc_OverflowError, "string to parse is too long"); - return NULL; - } - if (s[--len] != quote) { - /* Last quote char must match the first. */ - PyErr_BadInternalCall(); - return NULL; - } - if (len >= 4 && s[0] == quote && s[1] == quote) { - /* A triple quoted string. We've already skipped one quote at - the start and one at the end of the string. Now skip the - two at the start. */ - s += 2; - len -= 2; - /* And check that the last two match. */ - if (s[--len] != quote || s[--len] != quote) { - PyErr_BadInternalCall(); - return NULL; - } - } - - /* Avoid invoking escape decoding routines if possible. */ - rawmode = rawmode || strchr(s, '\\') == NULL; - if (bytesmode) { - /* Disallow non-ASCII characters. */ - const char *ch; - for (ch = s; *ch; ch++) { - if (Py_CHARMASK(*ch) >= 0x80) { - RAISE_SYNTAX_ERROR_KNOWN_LOCATION( - t, - "bytes can only contain ASCII " - "literal characters"); - return NULL; - } - } - if (rawmode) { - return PyBytes_FromStringAndSize(s, len); - } - return decode_bytes_with_escapes(p, s, len, t); - } - return _PyPegen_decode_string(p, rawmode, s, len, t); -} |
