aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/tools/python3/Parser/string_parser.c
diff options
context:
space:
mode:
authorAlexSm <alex@ydb.tech>2024-03-05 10:40:59 +0100
committerGitHub <noreply@github.com>2024-03-05 12:40:59 +0300
commit1ac13c847b5358faba44dbb638a828e24369467b (patch)
tree07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Parser/string_parser.c
parentffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff)
downloadydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz
Library import 16 (#2433)
Co-authored-by: robot-piglet <robot-piglet@yandex-team.com> Co-authored-by: deshevoy <deshevoy@yandex-team.com> Co-authored-by: robot-contrib <robot-contrib@yandex-team.com> Co-authored-by: thegeorg <thegeorg@yandex-team.com> Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com> Co-authored-by: svidyuk <svidyuk@yandex-team.com> Co-authored-by: shadchin <shadchin@yandex-team.com> Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com> Co-authored-by: innokentii <innokentii@yandex-team.com> Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com> Co-authored-by: snermolaev <snermolaev@yandex-team.com> Co-authored-by: dimdim11 <dimdim11@yandex-team.com> Co-authored-by: kickbutt <kickbutt@yandex-team.com> Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com> Co-authored-by: korsunandrei <korsunandrei@yandex-team.com> Co-authored-by: petrk <petrk@yandex-team.com> Co-authored-by: miroslav2 <miroslav2@yandex-team.com> Co-authored-by: serjflint <serjflint@yandex-team.com> Co-authored-by: akhropov <akhropov@yandex-team.com> Co-authored-by: prettyboy <prettyboy@yandex-team.com> Co-authored-by: ilikepugs <ilikepugs@yandex-team.com> Co-authored-by: hiddenpath <hiddenpath@yandex-team.com> Co-authored-by: mikhnenko <mikhnenko@yandex-team.com> Co-authored-by: spreis <spreis@yandex-team.com> Co-authored-by: andreyshspb <andreyshspb@yandex-team.com> Co-authored-by: dimaandreev <dimaandreev@yandex-team.com> Co-authored-by: rashid <rashid@yandex-team.com> Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com> Co-authored-by: r-vetrov <r-vetrov@yandex-team.com> Co-authored-by: ypodlesov <ypodlesov@yandex-team.com> Co-authored-by: zaverden <zaverden@yandex-team.com> Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com> Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com> Co-authored-by: v-korovin <v-korovin@yandex-team.com> Co-authored-by: arikon <arikon@yandex-team.com> Co-authored-by: khoden <khoden@yandex-team.com> 
Co-authored-by: psydmm <psydmm@yandex-team.com> Co-authored-by: robot-javacom <robot-javacom@yandex-team.com> Co-authored-by: dtorilov <dtorilov@yandex-team.com> Co-authored-by: sennikovmv <sennikovmv@yandex-team.com> Co-authored-by: hcpp <hcpp@ydb.tech>
Diffstat (limited to 'contrib/tools/python3/Parser/string_parser.c')
-rw-r--r--contrib/tools/python3/Parser/string_parser.c274
1 files changed, 274 insertions, 0 deletions
diff --git a/contrib/tools/python3/Parser/string_parser.c b/contrib/tools/python3/Parser/string_parser.c
new file mode 100644
index 0000000000..65c320c217
--- /dev/null
+++ b/contrib/tools/python3/Parser/string_parser.c
@@ -0,0 +1,274 @@
+#include <stdbool.h>
+
+#include <Python.h>
+
+#include "tokenizer.h"
+#include "pegen.h"
+#include "string_parser.h"
+
+//// STRING HANDLING FUNCTIONS ////
+
+/* Issue a warning about the invalid escape sequence whose first character
+   (the one following the backslash) is at first_invalid_escape, attributed
+   to the line of token t.
+
+   Returns 0 when the warning was issued or deliberately skipped, and -1 on
+   failure.  When issuing the warning fails with an exception matching the
+   warning category, that exception is replaced by a SyntaxError located at
+   token t. */
+static int
+warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
+{
+    /* During the second pass of the parser the warning has already been
+       shown once; do not show it again. */
+    if (p->call_invalid_rules) {
+        return 0;
+    }
+
+    unsigned char esc = *first_invalid_escape;
+    int in_fstring = (t->type == FSTRING_MIDDLE || t->type == FSTRING_END);
+    if (in_fstring && (esc == '{' || esc == '}')) {
+        /* The tokenizer has already emitted a warning for this case,
+           see tokenizer.c:warn_invalid_escape_sequence. */
+        return 0;
+    }
+
+    int is_octal = (esc >= '4' && esc <= '7');
+    PyObject *text;
+    if (is_octal) {
+        text = PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
+                                    first_invalid_escape);
+    }
+    else {
+        text = PyUnicode_FromFormat("invalid escape sequence '\\%c'", esc);
+    }
+    if (text == NULL) {
+        return -1;
+    }
+
+    PyObject *category = (p->feature_version >= 12)
+                             ? PyExc_SyntaxWarning
+                             : PyExc_DeprecationWarning;
+
+    int status = 0;
+    if (PyErr_WarnExplicitObject(category, text, p->tok->filename,
+                                 t->lineno, NULL, NULL) < 0) {
+        status = -1;
+        if (PyErr_ExceptionMatches(category)) {
+            /* Swap the Syntax/DeprecationWarning exception for a SyntaxError
+               so that the error report is more accurate. */
+            PyErr_Clear();
+
+            /* _PyPegen_raise_error locates the error at
+               p->tokens[p->fill - 1] unless p->known_err_token is set, so
+               set it to make the SyntaxError point at token t. */
+            p->known_err_token = t;
+            if (is_octal) {
+                RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
+                                   first_invalid_escape);
+            }
+            else {
+                RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", esc);
+            }
+        }
+    }
+    Py_DECREF(text);
+    return status;
+}
+
+/* Decode, as UTF-8, the run of bytes that starts at *sPtr and continues
+   while the byte has its high bit set and `end` has not been reached.
+   On return *sPtr points just past that run.  The result is whatever
+   PyUnicode_DecodeUTF8 returns for the run. */
+static PyObject *
+decode_utf8(const char **sPtr, const char *end)
+{
+    const char *start = *sPtr;
+    const char *cursor = start;
+
+    for (; cursor < end && (*cursor & 0x80); cursor++) {
+        /* skip over every byte with the high bit set */
+    }
+    *sPtr = cursor;
+    return PyUnicode_DecodeUTF8(start, cursor - start, NULL);
+}
+
+/* Decode the body of a non-raw string literal, s[0..len), into a str object.
+
+   The input is first rewritten into a temporary bytes buffer in which every
+   byte with the high bit set has been replaced by a "\UXXXXXXXX" escape, and
+   a backslash that is followed by such a byte (or by the end of the input)
+   is spelled "\u005c".  That buffer is then handed to
+   _PyUnicode_DecodeUnicodeEscapeInternal.
+
+   If the decoder reports an invalid escape sequence and t is not NULL, a
+   warning is issued through warn_invalid_escape_sequence.
+
+   Returns a new reference, or NULL on failure.  The oversized-input and
+   buffer-access failure paths handled here leave an exception set and do not
+   leak the temporary buffer. */
+static PyObject *
+decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
+{
+    PyObject *v;
+    PyObject *u;
+    char *buf;
+    char *p;
+    const char *end;
+
+    /* The scratch buffer needs len * 6 bytes and that size is passed on as a
+       Py_ssize_t, so reject any length whose product would not fit.  Set an
+       exception so that the NULL return is not a bare failure. */
+    if (len > (size_t)PY_SSIZE_T_MAX / 6) {
+        PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
+        return NULL;
+    }
+    /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
+       "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
+    u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
+    if (u == NULL) {
+        return NULL;
+    }
+    p = buf = PyBytes_AsString(u);
+    if (p == NULL) {
+        /* Release the scratch buffer instead of leaking it. */
+        Py_DECREF(u);
+        return NULL;
+    }
+    end = s + len;
+    while (s < end) {
+        if (*s == '\\') {
+            *p++ = *s++;
+            if (s >= end || *s & 0x80) {
+                /* Together with the backslash just copied this spells
+                   "\u005c", i.e. an escaped backslash. */
+                strcpy(p, "u005c");
+                p += 5;
+                if (s >= end) {
+                    break;
+                }
+            }
+        }
+        if (*s & 0x80) {
+            PyObject *w;
+            int kind;
+            const void *data;
+            Py_ssize_t w_len;
+            Py_ssize_t i;
+            w = decode_utf8(&s, end);
+            if (w == NULL) {
+                Py_DECREF(u);
+                return NULL;
+            }
+            kind = PyUnicode_KIND(w);
+            data = PyUnicode_DATA(w);
+            w_len = PyUnicode_GET_LENGTH(w);
+            for (i = 0; i < w_len; i++) {
+                Py_UCS4 chr = PyUnicode_READ(kind, data, i);
+                /* Each code point is written as a 10 byte "\UXXXXXXXX". */
+                sprintf(p, "\\U%08x", chr);
+                p += 10;
+            }
+            /* Should be impossible to overflow */
+            assert(p - buf <= PyBytes_GET_SIZE(u));
+            Py_DECREF(w);
+        }
+        else {
+            *p++ = *s++;
+        }
+    }
+    len = p - buf;
+    s = buf;
+
+    const char *first_invalid_escape = NULL;
+    v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
+
+    // HACK: later we can simply pass the line no, since we don't preserve the tokens
+    // when we are decoding the string but we preserve the line numbers.
+    if (v != NULL && first_invalid_escape != NULL && t != NULL) {
+        if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
+            /* u is only released now because first_invalid_escape points
+               inside it. */
+            Py_DECREF(u);
+            Py_DECREF(v);
+            return NULL;
+        }
+    }
+    Py_DECREF(u);
+    return v;
+}
+
+/* Decode the body of a non-raw bytes literal, s[0..len), with
+   _PyBytes_DecodeEscape.  If the decoder reports an invalid escape sequence,
+   a warning is issued through warn_invalid_escape_sequence for token t.
+   Returns the decoded object, or NULL if decoding or the warning failed. */
+static PyObject *
+decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
+{
+    const char *bad_escape;
+    PyObject *decoded = _PyBytes_DecodeEscape(s, len, NULL, &bad_escape);
+
+    /* bad_escape is only inspected once decoding is known to have succeeded. */
+    if (decoded != NULL && bad_escape != NULL
+        && warn_invalid_escape_sequence(p, bad_escape, t) < 0) {
+        Py_DECREF(decoded);
+        return NULL;
+    }
+    return decoded;
+}
+
+/* Decode the string body s[0..len) into a str object.  When `raw` is
+   non-zero the bytes are decoded as plain UTF-8; otherwise they go through
+   decode_unicode_with_escapes, which also receives token t. */
+PyObject *
+_PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
+{
+    if (!raw) {
+        return decode_unicode_with_escapes(p, s, len, t);
+    }
+    return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
+}
+
+/* Parse the string literal held in t->bytes and return the decoded Python
+   object (bytes when a b/B prefix is present, str otherwise).
+
+   The token text must include the bracketing quote characters, and r, b &/or
+   u prefixes (if any), and embedded escape sequences (if any).  (f-strings
+   are handled by the parser.)
+
+   Returns NULL on failure.  Malformed quoting is reported with
+   PyErr_BadInternalCall, an over-long literal with OverflowError, and a
+   non-ASCII byte inside a bytes literal with a SyntaxError located at t. */
+PyObject *
+_PyPegen_parse_string(Parser *p, Token *t)
+{
+    const char *s = PyBytes_AsString(t->bytes);
+    if (s == NULL) {
+        return NULL;
+    }
+
+    size_t len;
+    int quote = Py_CHARMASK(*s);
+    int bytesmode = 0;
+    int rawmode = 0;
+
+    /* Consume the prefix letters; stop once both flags are set or a
+       character that is not a known prefix is reached. */
+    if (Py_ISALPHA(quote)) {
+        while (!bytesmode || !rawmode) {
+            if (quote == 'b' || quote == 'B') {
+                quote = (unsigned char)*++s;
+                bytesmode = 1;
+            }
+            else if (quote == 'u' || quote == 'U') {
+                quote = (unsigned char)*++s;
+            }
+            else if (quote == 'r' || quote == 'R') {
+                quote = (unsigned char)*++s;
+                rawmode = 1;
+            }
+            else {
+                break;
+            }
+        }
+    }
+
+    if (quote != '\'' && quote != '\"') {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+    /* Skip the leading quote char. */
+    s++;
+    len = strlen(s);
+    if (len > INT_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
+        return NULL;
+    }
+    /* len is unsigned: test for zero before decrementing so that a token
+       with nothing after its opening quote is rejected instead of indexing
+       s at a wrapped-around offset. */
+    if (len == 0 || s[--len] != quote) {
+        /* Last quote char must match the first. */
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+    if (len >= 4 && s[0] == quote && s[1] == quote) {
+        /* A triple quoted string. We've already skipped one quote at
+           the start and one at the end of the string. Now skip the
+           two at the start. */
+        s += 2;
+        len -= 2;
+        /* And check that the last two match. */
+        if (s[--len] != quote || s[--len] != quote) {
+            PyErr_BadInternalCall();
+            return NULL;
+        }
+    }
+
+    /* Avoid invoking escape decoding routines if possible. */
+    rawmode = rawmode || strchr(s, '\\') == NULL;
+    if (bytesmode) {
+        /* Disallow non-ASCII characters. */
+        const char *ch;
+        for (ch = s; *ch; ch++) {
+            if (Py_CHARMASK(*ch) >= 0x80) {
+                RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+                                   t,
+                                   "bytes can only contain ASCII "
+                                   "literal characters");
+                return NULL;
+            }
+        }
+        if (rawmode) {
+            return PyBytes_FromStringAndSize(s, len);
+        }
+        return decode_bytes_with_escapes(p, s, len, t);
+    }
+    return _PyPegen_decode_string(p, rawmode, s, len, t);
+}