1 files changed, 75 insertions, 27 deletions
diff --git a/contrib/tools/python3/Parser/string_parser.c b/contrib/tools/python3/Parser/string_parser.c
index 164f715e153..16d96cc5c00 100644
--- a/contrib/tools/python3/Parser/string_parser.c
+++ b/contrib/tools/python3/Parser/string_parser.c
@@ -1,24 +1,27 @@
-#include <stdbool.h>
-
 #include <Python.h>
+#include "pycore_bytesobject.h"   // _PyBytes_DecodeEscape()
+#include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal()
 
-#include "tokenizer.h"
+#include "lexer/state.h"
 #include "pegen.h"
 #include "string_parser.h"
 
+#include <stdbool.h>
+
 //// STRING HANDLING FUNCTIONS ////
 
 static int
-warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
+warn_invalid_escape_sequence(Parser *p, const char* buffer, const char *first_invalid_escape, Token *t)
 {
     if (p->call_invalid_rules) {
         // Do not report warnings if we are in the second pass of the parser
         // to avoid showing the warning twice.
         return 0;
     }
-    unsigned char c = *first_invalid_escape;
-    if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {  // in this case the tokenizer has already emitted a warning,
-                                                                                            // see tokenizer.c:warn_invalid_escape_sequence
+    unsigned char c = (unsigned char)*first_invalid_escape;
+    if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {
+        // in this case the tokenizer has already emitted a warning,
+        // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence
         return 0;
     }
 
@@ -38,8 +41,46 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
     else {
         category = PyExc_DeprecationWarning;
     }
+
+    // Calculate the lineno and the col_offset of the invalid escape sequence
+    const char *start = buffer;
+    const char *end = first_invalid_escape;
+    int lineno = t->lineno;
+    int col_offset = t->col_offset;
+    while (start < end) {
+        if (*start == '\n') {
+            lineno++;
+            col_offset = 0;
+        }
+        else {
+            col_offset++;
+        }
+        start++;
+    }
+
+    // Count the number of quotes in the token
+    char first_quote = 0;
+    if (lineno == t->lineno) {
+        int quote_count = 0;
+        char* tok = PyBytes_AsString(t->bytes);
+        for (int i = 0; i < PyBytes_Size(t->bytes); i++) {
+            if (tok[i] == '\'' || tok[i] == '\"') {
+                if (quote_count == 0) {
+                    first_quote = tok[i];
+                }
+                if (tok[i] == first_quote) {
+                    quote_count++;
+                }
+            } else {
+                break;
+            }
+        }
+
+        col_offset += quote_count;
+    }
+
     if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
-                                 t->lineno, NULL, NULL) < 0) {
+                                 lineno, NULL, NULL) < 0) {
         if (PyErr_ExceptionMatches(category)) {
             /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
                to get a more accurate error report */
@@ -50,11 +91,12 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
                error location, if p->known_err_token is not set. */
             p->known_err_token = t;
             if (octal) {
-                RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
-                                   first_invalid_escape);
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
+                "invalid octal escape sequence '\\%.3s'", first_invalid_escape);
             }
             else {
-                RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
+                "invalid escape sequence '\\%c'", c);
             }
         }
         Py_DECREF(msg);
@@ -87,12 +129,12 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
     const char *end;
 
     /* check for integer overflow */
-    if (len > SIZE_MAX / 6) {
+    if (len > (size_t)PY_SSIZE_T_MAX / 6) {
         return NULL;
     }
     /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
        "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
-    u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
+    u = PyBytes_FromStringAndSize((char *)NULL, (Py_ssize_t)len * 6);
     if (u == NULL) {
         return NULL;
     }
@@ -139,18 +181,21 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
             *p++ = *s++;
         }
     }
-    len = p - buf;
+    len = (size_t)(p - buf);
     s = buf;
 
-    const char *first_invalid_escape;
-    v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
+    int first_invalid_escape_char;
+    const char *first_invalid_escape_ptr;
+    v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
+                                                &first_invalid_escape_char,
+                                                &first_invalid_escape_ptr);
 
     // HACK: later we can simply pass the line no, since we don't preserve the tokens
     // when we are decoding the string but we preserve the line numbers.
-    if (v != NULL && first_invalid_escape != NULL && t != NULL) {
-        if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
-            /* We have not decref u before because first_invalid_escape points
-               inside u. */
+    if (v != NULL && first_invalid_escape_ptr != NULL && t != NULL) {
+        if (warn_invalid_escape_sequence(parser, s, first_invalid_escape_ptr, t) < 0) {
+            /* We have not decref u before because first_invalid_escape_ptr
+               points inside u. */
             Py_XDECREF(u);
             Py_DECREF(v);
             return NULL;
@@ -163,14 +208,17 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
 static PyObject *
 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
 {
-    const char *first_invalid_escape;
-    PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
+    int first_invalid_escape_char;
+    const char *first_invalid_escape_ptr;
+    PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
+                                              &first_invalid_escape_char,
+                                              &first_invalid_escape_ptr);
     if (result == NULL) {
         return NULL;
     }
 
-    if (first_invalid_escape != NULL) {
-        if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
+    if (first_invalid_escape_ptr != NULL) {
+        if (warn_invalid_escape_sequence(p, s, first_invalid_escape_ptr, t) < 0) {
             Py_DECREF(result);
             return NULL;
         }
@@ -182,7 +230,7 @@ PyObject *
 _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
 {
     if (raw) {
-        return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
+        return PyUnicode_DecodeUTF8Stateful(s, (Py_ssize_t)len, NULL, NULL);
     }
     return decode_unicode_with_escapes(p, s, len, t);
 }
@@ -271,9 +319,9 @@ _PyPegen_parse_string(Parser *p, Token *t)
             }
         }
         if (rawmode) {
-            return PyBytes_FromStringAndSize(s, len);
+            return PyBytes_FromStringAndSize(s, (Py_ssize_t)len);
         }
-        return decode_bytes_with_escapes(p, s, len, t);
+        return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);
     }
     return _PyPegen_decode_string(p, rawmode, s, len, t);
 }