author     shadchin <shadchin@yandex-team.com>  2024-02-12 07:53:52 +0300
committer  shadchin <shadchin@yandex-team.com>  2024-02-12 08:07:36 +0300
commit     ce1b7ca3171f9158180640c6a02a74b4afffedea (patch)
tree       e47c1e8391b1b0128262c1e9b1e6ed4c8fff2348 /contrib/tools/python3/src/Lib/tokenize.py
parent     57350d96f030db90f220ce50ee591d5c5d403df7 (diff)
download   ydb-ce1b7ca3171f9158180640c6a02a74b4afffedea.tar.gz
Update Python from 3.11.8 to 3.12.2
Diffstat (limited to 'contrib/tools/python3/src/Lib/tokenize.py')
-rw-r--r--  contrib/tools/python3/src/Lib/tokenize.py | 235
1 file changed, 44 insertions(+), 191 deletions(-)
diff --git a/contrib/tools/python3/src/Lib/tokenize.py b/contrib/tools/python3/src/Lib/tokenize.py
index 46d2224f5c..49e8144edd 100644
--- a/contrib/tools/python3/src/Lib/tokenize.py
+++ b/contrib/tools/python3/src/Lib/tokenize.py
@@ -34,6 +34,7 @@ import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES
+import _tokenize
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
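
Note: the added top-level "import _tokenize" binds CPython's private C tokenizer
extension, which the rewritten functions below drive instead of the pure-Python
scanner. A minimal sketch of the idea (the TokenizerIter call shape is taken
from this diff; _tokenize is a CPython implementation detail, not a public API):

    import _tokenize
    import io

    # TokenizerIter wraps a readline callable and yields tuples that
    # tokenize.TokenInfo._make() can consume.
    it = _tokenize.TokenizerIter(io.StringIO("x = 1\n").readline,
                                 extra_tokens=True)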
@@ -160,8 +161,8 @@ tabsize = 8
class TokenError(Exception): pass
-class StopTokenizing(Exception): pass
+class StopTokenizing(Exception): pass
class Untokenizer:
@@ -213,6 +214,14 @@ class Untokenizer:
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
+            elif tok_type == FSTRING_MIDDLE:
+                if '{' in token or '}' in token:
+                    end_line, end_col = end
+                    end = (end_line, end_col + token.count('{') + token.count('}'))
+                    token = re.sub('{', '{{', token)
+                    token = re.sub('}', '}}', token)
+
+
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
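
Note: FSTRING_MIDDLE tokens carry the *unescaped* literal text between
replacement fields, so untokenize() must re-double any braces and widen the
end position by the number of doubled characters. A standalone sketch of the
same transformation:

    import re

    def escape_fstring_middle(token):
        # A literal '{' or '}' inside an f-string must be written doubled.
        token = re.sub('{', '{{', token)
        return re.sub('}', '}}', token)

    assert escape_fstring_middle('a{b}c') == 'a{{b}}c'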
@@ -255,6 +264,11 @@ class Untokenizer:
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
+            elif toknum == FSTRING_MIDDLE:
+                if '{' in tokval or '}' in tokval:
+                    tokval = re.sub('{', '{{', tokval)
+                    tokval = re.sub('}', '}}', tokval)
+
            toks_append(tokval)
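
Note: the same escaping is mirrored in the 2-tuple compat() path above. An
illustrative round trip on Python 3.12+ (hedged; whitespace handling differs
between the 5-tuple and 2-tuple modes):

    import io, tokenize

    src = "x = f'{{literal}} {value}'\n"
    toks = list(tokenize.generate_tokens(io.StringIO(src).readline))
    print(tokenize.untokenize(toks))  # should reproduce source equivalent to src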
@@ -404,7 +418,6 @@ def open(filename):
        buffer.close()
        raise
-
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
@@ -425,193 +438,13 @@ def tokenize(readline):
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
-    empty = _itertools.repeat(b"")
-    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
-    return _tokenize(rl_gen.__next__, encoding)
-
-
-def _tokenize(readline, encoding):
-    lnum = parenlev = continued = 0
-    numchars = '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
+    rl_gen = _itertools.chain(consumed, iter(readline, b""))
    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    last_line = b''
-    line = b''
-    while True:                                # loop over lines in stream
-        try:
-            # We capture the value of the line variable here because
-            # readline uses the empty string '' to signal end of input,
-            # hence `line` itself will always be overwritten at the end
-            # of this loop.
-            last_line = line
-            line = readline()
-        except StopIteration:
-            line = b''
-
-        if encoding is not None:
-            line = line.decode(encoding)
-        lnum += 1
-        pos, max = 0, len(line)
-
-        if contstr:                            # continued string
-            if not line:
-                raise TokenError("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end],
-                                strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield TokenInfo(ERRORTOKEN, contstr + line,
-                                strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    yield TokenInfo(COMMENT, comment_token,
-                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    pos += len(comment_token)
-
-                yield TokenInfo(NL, line[pos:],
-                                (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-
-                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = _compile(PseudoToken).match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if (initial in numchars or  # ordinary number
-                        (initial == '.' and token != '.' and token != '...')):
-                    yield TokenInfo(NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    if parenlev > 0:
-                        yield TokenInfo(NL, token, spos, epos, line)
-                    else:
-                        yield TokenInfo(NEWLINE, token, spos, epos, line)
-
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield TokenInfo(COMMENT, token, spos, epos, line)
-
-                elif token in triple_quoted:
-                    endprog = _compile(endpats[token])
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-
-                # Check up to the first 3 chars of the token to see if
-                # they're in the single_quoted set. If so, they start
-                # a string.
-                # We're using the first 3, because we're looking for
-                # "rb'" (for example) at the start of the token. If
-                # we switch to longer prefixes, this needs to be
-                # adjusted.
-                # Note that initial == token[:1].
-                # Also note that single quote checking must come after
-                # triple quote checking (above).
-                elif (initial in single_quoted or
-                      token[:2] in single_quoted or
-                      token[:3] in single_quoted):
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        # Again, using the first 3 chars of the
-                        # token. This is looking for the matching end
-                        # regex for the correct type of quote
-                        # character. So it's really looking for
-                        # endpats["'"] or endpats['"'], by trying to
-                        # skip string prefix characters, if any.
-                        endprog = _compile(endpats.get(initial) or
-                                           endpats.get(token[1]) or
-                                           endpats.get(token[2]))
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield TokenInfo(STRING, token, spos, epos, line)
-
-                elif initial.isidentifier():               # ordinary name
-                    yield TokenInfo(NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield TokenInfo(OP, token, spos, epos, line)
-            else:
-                yield TokenInfo(ERRORTOKEN, line[pos],
-                                (lnum, pos), (lnum, pos+1), line)
-                pos += 1
-
-    # Add an implicit NEWLINE if the input doesn't end in one
-    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
-        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
-
+    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
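
Note: tokenize() keeps its public generator API over a bytes readline; only
the engine changed. The ENCODING token is still emitted first from Python
code, and everything else is delegated to the C tokenizer. For example:

    import io, tokenize

    for tok in tokenize.tokenize(io.BytesIO(b"answer = 42\n").readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))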
def generate_tokens(readline):
"""Tokenize a source reading Python code as unicode strings.
@@ -619,7 +452,7 @@ def generate_tokens(readline):
This has the same API as tokenize(), except that it expects the *readline*
callable to return str objects instead of bytes.
"""
- return _tokenize(readline, None)
+ return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
def main():
import argparse
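
Note: generate_tokens() takes a str-returning readline and, unlike tokenize(),
emits no ENCODING token; extra_tokens=True keeps COMMENT and NL tokens in the
stream for backwards compatibility. For example:

    import io, tokenize

    toks = list(tokenize.generate_tokens(io.StringIO("x = 1  # note\n").readline))
    print([tokenize.tok_name[t.type] for t in toks])
    # ['NAME', 'OP', 'NUMBER', 'COMMENT', 'NEWLINE', 'ENDMARKER']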
@@ -656,7 +489,9 @@ def main():
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
-            tokens = _tokenize(sys.stdin.readline, None)
+            tokens = _generate_tokens_from_c_tokenizer(
+                sys.stdin.readline, extra_tokens=True)
+
        # Output the tokenization
        for token in tokens:
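
Note: the stdin path of the CLI now calls the C-tokenizer helper directly;
it returns a generator rather than a list, so tokens are produced lazily.
Roughly equivalent to (private helper, as named in this diff):

    import sys, tokenize

    tokens = tokenize._generate_tokens_from_c_tokenizer(
        sys.stdin.readline, extra_tokens=True)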
@@ -682,12 +517,30 @@ def main():
        perror("unexpected error: %s" % err)
        raise
-def _generate_tokens_from_c_tokenizer(source):
+def _transform_msg(msg):
+    """Transform error messages from the C tokenizer into the Python tokenize
+
+    The C tokenizer is more picky than the Python one, so we need to massage
+    the error messages a bit for backwards compatibility.
+    """
+    if "unterminated triple-quoted string literal" in msg:
+        return "EOF in multi-line string"
+    return msg
+
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
-    import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source):
-        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
-        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
+    if encoding is None:
+        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+    else:
+        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+    try:
+        for info in it:
+            yield TokenInfo._make(info)
+    except SyntaxError as e:
+        if type(e) != SyntaxError:
+            raise e from None
+        msg = _transform_msg(e.msg)
+        raise TokenError(msg, (e.lineno, e.offset)) from None
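
Note: _transform_msg() papers over wording differences between the two
tokenizers, and the except clause re-raises a plain SyntaxError as the
historical TokenError (subclasses propagate unchanged). A hedged
demonstration of the translated error:

    import io, tokenize

    try:
        list(tokenize.generate_tokens(io.StringIO('"""unterminated').readline))
    except tokenize.TokenError as exc:
        print(exc)  # expected: EOF in multi-line string, with (line, col)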
if __name__ == "__main__":