path: root/contrib/python/black/blib2to3/pgen2/tokenize.py
author     robot-piglet <[email protected]>  2025-08-28 14:27:58 +0300
committer  robot-piglet <[email protected]>  2025-08-28 14:57:06 +0300
commit     81d828c32c8d5477cb2f0ce5da06a1a8d9392ca3 (patch)
tree       3081d566f0d5158d76e9093261344f6406fd09f7  /contrib/python/black/blib2to3/pgen2/tokenize.py
parent     77ea11423f959e51795cc3ef36a48d808b4ffb98 (diff)
Intermediate changes
commit_hash: d5b1af16dbe9030537a04c27eb410c88c2f496cd
Diffstat (limited to 'contrib/python/black/blib2to3/pgen2/tokenize.py')
 -rw-r--r--  contrib/python/black/blib2to3/pgen2/tokenize.py  1114
 1 file changed, 1114 insertions, 0 deletions
diff --git a/contrib/python/black/blib2to3/pgen2/tokenize.py b/contrib/python/black/blib2to3/pgen2/tokenize.py
new file mode 100644
index 00000000000..407c184dd74
--- /dev/null
+++ b/contrib/python/black/blib2to3/pgen2/tokenize.py
@@ -0,0 +1,1114 @@
+# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
+# All rights reserved.
+
+# mypy: allow-untyped-defs, allow-untyped-calls
+
+"""Tokenization help for Python programs.
+
+generate_tokens(readline) is a generator that breaks a stream of
+text into Python tokens. It accepts a readline-like method which is called
+repeatedly to get the next line of input (or "" for EOF). It generates
+5-tuples with these members:
+
+ the token type (see token.py)
+ the token (a string)
+ the starting (row, column) indices of the token (a 2-tuple of ints)
+ the ending (row, column) indices of the token (a 2-tuple of ints)
+ the original line (string)
+
+It is designed to match the working of the Python tokenizer exactly, except
+that it produces COMMENT tokens for comments and gives type OP for all
+operators.
+
+Older entry points
+ tokenize_loop(readline, tokeneater)
+ tokenize(readline, tokeneater=printtoken)
+are the same, except instead of generating tokens, tokeneater is a callback
+function to which the 5 fields described above are passed as 5 arguments,
+each time a new token is found."""
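+
+# Minimal usage sketch (illustrative; io.StringIO stands in for a file object):
+#
+#     import io
+#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
+#         print(tok_name[tok[0]], repr(tok[1]))
+#
+# which prints NAME 'x', OP '=', NUMBER '1', NEWLINE '\n', then ENDMARKER ''.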
+
+import builtins
+import sys
+from collections.abc import Callable, Iterable, Iterator
+from re import Pattern
+from typing import Final, Optional, Union
+
+from blib2to3.pgen2.grammar import Grammar
+from blib2to3.pgen2.token import (
+ ASYNC,
+ AWAIT,
+ COMMENT,
+ DEDENT,
+ ENDMARKER,
+ ERRORTOKEN,
+ FSTRING_END,
+ FSTRING_MIDDLE,
+ FSTRING_START,
+ INDENT,
+ LBRACE,
+ NAME,
+ NEWLINE,
+ NL,
+ NUMBER,
+ OP,
+ RBRACE,
+ STRING,
+ tok_name,
+)
+
+__author__ = "Ka-Ping Yee <[email protected]>"
+__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
+
+import re
+from codecs import BOM_UTF8, lookup
+
+from . import token
+
+__all__ = [x for x in dir(token) if x[0] != "_"] + [
+ "tokenize",
+ "generate_tokens",
+ "untokenize",
+]
+del token
+
+
+def group(*choices: str) -> str:
+ return "(" + "|".join(choices) + ")"
+
+
+def any(*choices: str) -> str:
+ return group(*choices) + "*"
+
+
+def maybe(*choices: str) -> str:
+ return group(*choices) + "?"
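+
+# For example (illustrative): group("a", "b") == "(a|b)",
+# maybe("a", "b") == "(a|b)?", and any(r"\d") == r"(\d)*".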
+
+
+def _combinations(*l: str) -> set[str]:
+ return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}
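+
+# e.g. (illustrative): _combinations("r", "b") == {"r", "b", "rb", "br"};
+# same-letter pairs such as "rr" are excluded by the casefold() comparison.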
+
+
+Whitespace = r"[ \f\t]*"
+Comment = r"#[^\r\n]*"
+Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
+Name = (  # too permissive (accepts invalid identifiers), but fine: Number is tried before Name in all groups
+ r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
+)
+
+Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
+Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
+Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
+Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
+Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
+Exponent = r"[eE][-+]?\d+(?:_\d+)*"
+Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
+ Exponent
+)
+Expfloat = r"\d+(?:_\d+)*" + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
+Number = group(Imagnumber, Floatnumber, Intnumber)
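+# Examples matched by Number (illustrative): 0b1010, 0o755, 0xFF, 1_000,
+# 3.14e-2, 2j.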
+
+# Tail end of ' string.
+Single = r"(?:\\.|[^'\\])*'"
+# Tail end of " string.
+Double = r'(?:\\.|[^"\\])*"'
+# Tail end of ''' string.
+Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
+# Tail end of """ string.
+Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
+_litprefix = r"(?:[uUrRbB]|[rR][bB]|[bBuU][rR])?"
+_fstringlitprefix = r"(?:rF|FR|Fr|fr|RF|F|rf|f|Rf|fR)"
+Triple = group(
+ _litprefix + "'''",
+ _litprefix + '"""',
+ _fstringlitprefix + '"""',
+ _fstringlitprefix + "'''",
+)
+
+# beginning of a single quoted f-string. must not end with `{{` or `\N{`
+SingleLbrace = r"(?:\\N{|{{|\\'|[^\n'{])*(?<!\\N)({)(?!{)"
+DoubleLbrace = r'(?:\\N{|{{|\\"|[^\n"{])*(?<!\\N)({)(?!{)'
+
+# beginning of a triple quoted f-string. must not end with `{{` or `\N{`
+Single3Lbrace = r"(?:\\N{|{{|\\'|'(?!'')|[^'{])*(?<!\\N){(?!{)"
+Double3Lbrace = r'(?:\\N{|{{|\\"|"(?!"")|[^"{])*(?<!\\N){(?!{)'
+
+# `!` conversion marker inside an f-string brace; ensure it's not part of a `!=` token
+Bang = Whitespace + group("!") + r"(?!=)"
+bang = re.compile(Bang)
+Colon = Whitespace + group(":")
+colon = re.compile(Colon)
+
+FstringMiddleAfterColon = group(Whitespace + r".*?") + group("{", "}")
+fstring_middle_after_colon = re.compile(FstringMiddleAfterColon)
+
+# Python's re tries alternatives left to right (first match wins, not
+# longest match), so be sure to put the longest operators first (e.g.,
+# if = came before ==, == would get recognized as two instances of =).
+Operator = group(
+ r"\*\*=?",
+ r">>=?",
+ r"<<=?",
+ r"<>",
+ r"!=",
+ r"//=?",
+ r"->",
+ r"[+\-*/%&@|^=<>:]=?",
+ r"~",
+)
+
+Bracket = "[][(){}]"
+Special = group(r"\r?\n", r"[:;.,`@]")
+Funny = group(Operator, Bracket, Special)
+
+_string_middle_single = r"(?:[^\n'\\]|\\.)*"
+_string_middle_double = r'(?:[^\n"\\]|\\.)*'
+
+# FSTRING_MIDDLE and LBRACE, must not end with a `{{` or `\N{`
+_fstring_middle_single = SingleLbrace
+_fstring_middle_double = DoubleLbrace
+
+# First (or only) line of ' or " string.
+ContStr = group(
+ _litprefix + "'" + _string_middle_single + group("'", r"\\\r?\n"),
+ _litprefix + '"' + _string_middle_double + group('"', r"\\\r?\n"),
+ group(_fstringlitprefix + "'") + _fstring_middle_single,
+ group(_fstringlitprefix + '"') + _fstring_middle_double,
+ group(_fstringlitprefix + "'") + _string_middle_single + group("'", r"\\\r?\n"),
+ group(_fstringlitprefix + '"') + _string_middle_double + group('"', r"\\\r?\n"),
+)
+PseudoExtras = group(r"\\\r?\n", Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
+
+pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
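+# Illustrative: pseudoprog.match("  x = 1\n").group(1) == "x"; the Whitespace
+# prefix is consumed outside group 1, so span(1) gives the column range that
+# generate_tokens() below turns into token coordinates.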
+
+singleprog = re.compile(Single)
+singleprog_plus_lbrace = re.compile(group(SingleLbrace, Single))
+doubleprog = re.compile(Double)
+doubleprog_plus_lbrace = re.compile(group(DoubleLbrace, Double))
+
+single3prog = re.compile(Single3)
+single3prog_plus_lbrace = re.compile(group(Single3Lbrace, Single3))
+double3prog = re.compile(Double3)
+double3prog_plus_lbrace = re.compile(group(Double3Lbrace, Double3))
+
+_strprefixes = _combinations("r", "R", "b", "B") | {"u", "U", "ur", "uR", "Ur", "UR"}
+_fstring_prefixes = _combinations("r", "R", "f", "F") - {"r", "R"}
+
+endprogs: Final = {
+ "'": singleprog,
+ '"': doubleprog,
+ "'''": single3prog,
+ '"""': double3prog,
+ **{f"{prefix}'": singleprog for prefix in _strprefixes},
+ **{f'{prefix}"': doubleprog for prefix in _strprefixes},
+ **{f"{prefix}'": singleprog_plus_lbrace for prefix in _fstring_prefixes},
+ **{f'{prefix}"': doubleprog_plus_lbrace for prefix in _fstring_prefixes},
+ **{f"{prefix}'''": single3prog for prefix in _strprefixes},
+ **{f'{prefix}"""': double3prog for prefix in _strprefixes},
+ **{f"{prefix}'''": single3prog_plus_lbrace for prefix in _fstring_prefixes},
+ **{f'{prefix}"""': double3prog_plus_lbrace for prefix in _fstring_prefixes},
+}
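+
+# Illustrative: endprogs["'"] matches the tail of a plain single-quoted string
+# (up to the closing quote), while endprogs["f'"] (singleprog_plus_lbrace)
+# also stops at the first "{" that opens an f-string expression.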
+
+triple_quoted: Final = (
+ {"'''", '"""'}
+ | {f"{prefix}'''" for prefix in _strprefixes | _fstring_prefixes}
+ | {f'{prefix}"""' for prefix in _strprefixes | _fstring_prefixes}
+)
+single_quoted: Final = (
+ {"'", '"'}
+ | {f"{prefix}'" for prefix in _strprefixes | _fstring_prefixes}
+ | {f'{prefix}"' for prefix in _strprefixes | _fstring_prefixes}
+)
+fstring_prefix: Final = tuple(
+ {f"{prefix}'" for prefix in _fstring_prefixes}
+ | {f'{prefix}"' for prefix in _fstring_prefixes}
+ | {f"{prefix}'''" for prefix in _fstring_prefixes}
+ | {f'{prefix}"""' for prefix in _fstring_prefixes}
+)
+
+tabsize = 8
+
+
+class TokenError(Exception):
+ pass
+
+
+class StopTokenizing(Exception):
+ pass
+
+
+Coord = tuple[int, int]
+
+
+def printtoken(
+ type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
+) -> None: # for testing
+ (srow, scol) = srow_col
+ (erow, ecol) = erow_col
+ print(
+ "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
+ )
+
+
+TokenEater = Callable[[int, str, Coord, Coord, str], None]
+
+
+def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
+ """
+ The tokenize() function accepts two parameters: one representing the
+ input stream, and one providing an output mechanism for tokenize().
+
+ The first parameter, readline, must be a callable object which provides
+ the same interface as the readline() method of built-in file objects.
+ Each call to the function should return one line of input as a string.
+
+ The second parameter, tokeneater, must also be a callable object. It is
+ called once for each token, with five arguments, corresponding to the
+ tuples generated by generate_tokens().
+ """
+ try:
+ tokenize_loop(readline, tokeneater)
+ except StopTokenizing:
+ pass
+
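+# e.g. (illustrative): tokenize(io.StringIO("pass\n").readline) would print
+# one line per token via printtoken, in "srow,scol-erow,ecol:<TAB>TYPE<TAB>'tok'"
+# form (requires `import io`).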
+
+# backwards compatible interface
+def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
+ for token_info in generate_tokens(readline):
+ tokeneater(*token_info)
+
+
+GoodTokenInfo = tuple[int, str, Coord, Coord, str]
+TokenInfo = Union[tuple[int, str], GoodTokenInfo]
+
+
+class Untokenizer:
+ tokens: list[str]
+ prev_row: int
+ prev_col: int
+
+ def __init__(self) -> None:
+ self.tokens = []
+ self.prev_row = 1
+ self.prev_col = 0
+
+ def add_whitespace(self, start: Coord) -> None:
+ row, col = start
+ assert row <= self.prev_row
+ col_offset = col - self.prev_col
+ if col_offset:
+ self.tokens.append(" " * col_offset)
+
+ def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
+ for t in iterable:
+ if len(t) == 2:
+ self.compat(t, iterable)
+ break
+ tok_type, token, start, end, line = t
+ self.add_whitespace(start)
+ self.tokens.append(token)
+ self.prev_row, self.prev_col = end
+ if tok_type in (NEWLINE, NL):
+ self.prev_row += 1
+ self.prev_col = 0
+ return "".join(self.tokens)
+
+ def compat(self, token: tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
+ startline = False
+ indents = []
+ toks_append = self.tokens.append
+ toknum, tokval = token
+ if toknum in (NAME, NUMBER):
+ tokval += " "
+ if toknum in (NEWLINE, NL):
+ startline = True
+ for tok in iterable:
+ toknum, tokval = tok[:2]
+
+ if toknum in (NAME, NUMBER, ASYNC, AWAIT):
+ tokval += " "
+
+ if toknum == INDENT:
+ indents.append(tokval)
+ continue
+ elif toknum == DEDENT:
+ indents.pop()
+ continue
+ elif toknum in (NEWLINE, NL):
+ startline = True
+ elif startline and indents:
+ toks_append(indents[-1])
+ startline = False
+ toks_append(tokval)
+
+
+cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
+blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
+
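+# Illustrative: cookie_re matches "# -*- coding: latin-1 -*-" and captures
+# "latin-1"; blank_re matches blank or comment-only byte lines, which may
+# legally precede an encoding cookie on the second line.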
+
+def _get_normal_name(orig_enc: str) -> str:
+ """Imitates get_normal_name in tokenizer.c."""
+ # Only care about the first 12 characters.
+ enc = orig_enc[:12].lower().replace("_", "-")
+ if enc == "utf-8" or enc.startswith("utf-8-"):
+ return "utf-8"
+ if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
+ ("latin-1-", "iso-8859-1-", "iso-latin-1-")
+ ):
+ return "iso-8859-1"
+ return orig_enc
+
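+# e.g. (illustrative): _get_normal_name("UTF_8") == "utf-8" and
+# _get_normal_name("Latin-1") == "iso-8859-1".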
+
+def detect_encoding(readline: Callable[[], bytes]) -> tuple[str, list[bytes]]:
+ """
+ The detect_encoding() function is used to detect the encoding that should
+ be used to decode a Python source file. It requires one argument, readline,
+ in the same way as the tokenize() generator.
+
+ It will call readline a maximum of twice, and return the encoding used
+ (as a string) and a list of any lines (left as bytes) it has read
+ in.
+
+    It detects the encoding from the presence of a UTF-8 BOM or an encoding
+    cookie as specified in PEP 263. If both a BOM and a cookie are present but
+    disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, a SyntaxError will also be raised. Note that if a UTF-8
+    BOM is found, 'utf-8-sig' is returned.
+
+ If no encoding is specified, then the default of 'utf-8' will be returned.
+ """
+ bom_found = False
+ encoding = None
+ default = "utf-8"
+
+ def read_or_stop() -> bytes:
+ try:
+ return readline()
+ except StopIteration:
+ return b""
+
+ def find_cookie(line: bytes) -> Optional[str]:
+ try:
+ line_string = line.decode("ascii")
+ except UnicodeDecodeError:
+ return None
+ match = cookie_re.match(line_string)
+ if not match:
+ return None
+ encoding = _get_normal_name(match.group(1))
+ try:
+ codec = lookup(encoding)
+ except LookupError:
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError("unknown encoding: " + encoding)
+
+ if bom_found:
+ if codec.name != "utf-8":
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError("encoding problem: utf-8")
+ encoding += "-sig"
+ return encoding
+
+ first = read_or_stop()
+ if first.startswith(BOM_UTF8):
+ bom_found = True
+ first = first[3:]
+ default = "utf-8-sig"
+ if not first:
+ return default, []
+
+ encoding = find_cookie(first)
+ if encoding:
+ return encoding, [first]
+ if not blank_re.match(first):
+ return default, [first]
+
+ second = read_or_stop()
+ if not second:
+ return default, [first]
+
+ encoding = find_cookie(second)
+ if encoding:
+ return encoding, [first, second]
+
+ return default, [first, second]
+
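+# Typical use (illustrative):
+#
+#     with open("example.py", "rb") as f:
+#         encoding, header_lines = detect_encoding(f.readline)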
+
+def untokenize(iterable: Iterable[TokenInfo]) -> str:
+ """Transform tokens back into Python source code.
+
+ Each element returned by the iterable must be a token sequence
+ with at least two elements, a token number and token value. If
+ only two tokens are passed, the resulting output is poor.
+
+ Round-trip invariant for full input:
+ Untokenized source will match input source exactly
+
+ Round-trip invariant for limited input:
+        # Output text will tokenize back to the input
+        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
+        newcode = untokenize(t1)
+        readline = iter(newcode.splitlines(keepends=True)).__next__
+        t2 = [tok[:2] for tok in generate_tokens(readline)]
+        assert t1 == t2
+ """
+ ut = Untokenizer()
+ return ut.untokenize(iterable)
+
+
+def is_fstring_start(token: str) -> bool:
+ return token.startswith(fstring_prefix)
+
+
+def _split_fstring_start_and_middle(token: str) -> tuple[str, str]:
+ for prefix in fstring_prefix:
+ _, prefix, rest = token.partition(prefix)
+ if prefix != "":
+ return prefix, rest
+
+ raise ValueError(f"Token {token!r} is not a valid f-string start")
+
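+# e.g. (illustrative): _split_fstring_start_and_middle("f'hello") returns
+# ("f'", "hello").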
+
+STATE_NOT_FSTRING: Final = 0 # not in an f-string
+STATE_MIDDLE: Final = 1 # in the string portion of an f-string (outside braces)
+STATE_IN_BRACES: Final = 2 # between braces in an f-string
+# in the format specifier (between the colon and the closing brace)
+STATE_IN_COLON: Final = 3
+
+
+class FStringState:
+ """Keeps track of state around f-strings.
+
+ The tokenizer should call the appropriate method on this class when
+ it transitions to a different part of an f-string. This is needed
+ because the tokenization depends on knowing where exactly we are in
+ the f-string.
+
+ For example, consider the following f-string:
+
+ f"a{1:b{2}c}d"
+
+ The following is the tokenization of this string and the states
+ tracked by this class:
+
+ 1,0-1,2: FSTRING_START 'f"' # [STATE_NOT_FSTRING, STATE_MIDDLE]
+ 1,2-1,3: FSTRING_MIDDLE 'a'
+ 1,3-1,4: LBRACE '{' # [STATE_NOT_FSTRING, STATE_IN_BRACES]
+ 1,4-1,5: NUMBER '1'
+ 1,5-1,6: OP ':' # [STATE_NOT_FSTRING, STATE_IN_COLON]
+ 1,6-1,7: FSTRING_MIDDLE 'b'
+ 1,7-1,8: LBRACE '{' # [STATE_NOT_FSTRING, STATE_IN_COLON, STATE_IN_BRACES]
+ 1,8-1,9: NUMBER '2'
+ 1,9-1,10: RBRACE '}' # [STATE_NOT_FSTRING, STATE_IN_COLON]
+ 1,10-1,11: FSTRING_MIDDLE 'c'
+ 1,11-1,12: RBRACE '}' # [STATE_NOT_FSTRING, STATE_MIDDLE]
+ 1,12-1,13: FSTRING_MIDDLE 'd'
+ 1,13-1,14: FSTRING_END '"' # [STATE_NOT_FSTRING]
+ 1,14-1,15: NEWLINE '\n'
+ 2,0-2,0: ENDMARKER ''
+
+ Notice that the nested braces in the format specifier are represented
+ by adding a STATE_IN_BRACES entry to the state stack. The stack is
+ also used if there are nested f-strings.
+
+ """
+
+ def __init__(self) -> None:
+ self.stack: list[int] = [STATE_NOT_FSTRING]
+
+ def is_in_fstring_expression(self) -> bool:
+ return self.stack[-1] not in (STATE_MIDDLE, STATE_NOT_FSTRING)
+
+ def current(self) -> int:
+ return self.stack[-1]
+
+ def enter_fstring(self) -> None:
+ self.stack.append(STATE_MIDDLE)
+
+ def leave_fstring(self) -> None:
+ state = self.stack.pop()
+ assert state == STATE_MIDDLE
+
+ def consume_lbrace(self) -> None:
+ current_state = self.stack[-1]
+ if current_state == STATE_MIDDLE:
+ self.stack[-1] = STATE_IN_BRACES
+ elif current_state == STATE_IN_COLON:
+ self.stack.append(STATE_IN_BRACES)
+ else:
+ assert False, current_state
+
+ def consume_rbrace(self) -> None:
+ current_state = self.stack[-1]
+ assert current_state in (STATE_IN_BRACES, STATE_IN_COLON)
+ if len(self.stack) > 1 and self.stack[-2] == STATE_IN_COLON:
+ self.stack.pop()
+ else:
+ self.stack[-1] = STATE_MIDDLE
+
+ def consume_colon(self) -> None:
+ assert self.stack[-1] == STATE_IN_BRACES, self.stack
+ self.stack[-1] = STATE_IN_COLON
+
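+# Illustrative walk-through for f"{x:{w}}" (stack shown after each call;
+# STATE_ prefixes omitted):
+#
+#     fs = FStringState()
+#     fs.enter_fstring()   # [NOT_FSTRING, MIDDLE]               FSTRING_START
+#     fs.consume_lbrace()  # [NOT_FSTRING, IN_BRACES]            outer {
+#     fs.consume_colon()   # [NOT_FSTRING, IN_COLON]             :
+#     fs.consume_lbrace()  # [NOT_FSTRING, IN_COLON, IN_BRACES]  nested {
+#     fs.consume_rbrace()  # [NOT_FSTRING, IN_COLON]             nested }
+#     fs.consume_rbrace()  # [NOT_FSTRING, MIDDLE]               outer }
+#     fs.leave_fstring()   # [NOT_FSTRING]                       FSTRING_END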
+
+def generate_tokens(
+ readline: Callable[[], str], grammar: Optional[Grammar] = None
+) -> Iterator[GoodTokenInfo]:
+ """
+ The generate_tokens() generator requires one argument, readline, which
+ must be a callable object which provides the same interface as the
+ readline() method of built-in file objects. Each call to the function
+ should return one line of input as a string. Alternately, readline
+ can be a callable function terminating with StopIteration:
+        readline = open(myfile).__next__  # Example of alternate readline
+
+ The generator produces 5-tuples with these members: the token type; the
+ token string; a 2-tuple (srow, scol) of ints specifying the row and
+ column where the token begins in the source; a 2-tuple (erow, ecol) of
+ ints specifying the row and column where the token ends in the source;
+ and the line on which the token was found. The line passed is the
+ logical line; continuation lines are included.
+ """
+ lnum = parenlev = continued = 0
+ parenlev_stack: list[int] = []
+ fstring_state = FStringState()
+ formatspec = ""
+ numchars: Final[str] = "0123456789"
+ contstr, needcont = "", 0
+ contline: Optional[str] = None
+ indents = [0]
+
+ # If we know we're parsing 3.7+, we can unconditionally parse `async` and
+ # `await` as keywords.
+ async_keywords = False if grammar is None else grammar.async_keywords
+ # 'stashed' and 'async_*' are used for async/await parsing
+ stashed: Optional[GoodTokenInfo] = None
+ async_def = False
+ async_def_indent = 0
+ async_def_nl = False
+
+ strstart: tuple[int, int]
+ endprog_stack: list[Pattern[str]] = []
+ formatspec_start: tuple[int, int]
+
+ while 1: # loop over lines in stream
+ try:
+ line = readline()
+ except StopIteration:
+ line = ""
+ lnum += 1
+
+        # skip lines that are just indent characters ending with a backslash
+        # to avoid storing that line's indent information.
+ if not contstr and line.rstrip("\n").strip(" \t\f") == "\\":
+ continue
+
+ pos, max = 0, len(line)
+
+ if contstr: # continued string
+ assert contline is not None
+ if not line:
+ raise TokenError("EOF in multi-line string", strstart)
+ endprog = endprog_stack[-1]
+ endmatch = endprog.match(line)
+ if endmatch:
+ end = endmatch.end(0)
+ token = contstr + line[:end]
+ spos = strstart
+ epos = (lnum, end)
+ tokenline = contline + line
+ if fstring_state.current() in (
+ STATE_NOT_FSTRING,
+ STATE_IN_BRACES,
+ ) and not is_fstring_start(token):
+ yield (STRING, token, spos, epos, tokenline)
+ endprog_stack.pop()
+ parenlev = parenlev_stack.pop()
+ else:
+ if is_fstring_start(token):
+ fstring_start, token = _split_fstring_start_and_middle(token)
+ fstring_start_epos = (spos[0], spos[1] + len(fstring_start))
+ yield (
+ FSTRING_START,
+ fstring_start,
+ spos,
+ fstring_start_epos,
+ tokenline,
+ )
+ fstring_state.enter_fstring()
+ # increase spos to the end of the fstring start
+ spos = fstring_start_epos
+
+ if token.endswith("{"):
+ fstring_middle, lbrace = token[:-1], token[-1]
+ fstring_middle_epos = lbrace_spos = (lnum, end - 1)
+ yield (
+ FSTRING_MIDDLE,
+ fstring_middle,
+ spos,
+ fstring_middle_epos,
+ line,
+ )
+ yield (LBRACE, lbrace, lbrace_spos, epos, line)
+ fstring_state.consume_lbrace()
+ else:
+ if token.endswith(('"""', "'''")):
+ fstring_middle, fstring_end = token[:-3], token[-3:]
+ fstring_middle_epos = end_spos = (lnum, end - 3)
+ else:
+ fstring_middle, fstring_end = token[:-1], token[-1]
+ fstring_middle_epos = end_spos = (lnum, end - 1)
+ yield (
+ FSTRING_MIDDLE,
+ fstring_middle,
+ spos,
+ fstring_middle_epos,
+ line,
+ )
+ yield (
+ FSTRING_END,
+ fstring_end,
+ end_spos,
+ epos,
+ line,
+ )
+ fstring_state.leave_fstring()
+ endprog_stack.pop()
+ parenlev = parenlev_stack.pop()
+ pos = end
+ contstr, needcont = "", 0
+ contline = None
+ elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
+ yield (
+ ERRORTOKEN,
+ contstr + line,
+ strstart,
+ (lnum, len(line)),
+ contline,
+ )
+ contstr = ""
+ contline = None
+ continue
+ else:
+ contstr = contstr + line
+ contline = contline + line
+ continue
+
+ # new statement
+ elif (
+ parenlev == 0
+ and not continued
+ and not fstring_state.is_in_fstring_expression()
+ ):
+ if not line:
+ break
+ column = 0
+ while pos < max: # measure leading whitespace
+ if line[pos] == " ":
+ column += 1
+ elif line[pos] == "\t":
+ column = (column // tabsize + 1) * tabsize
+ elif line[pos] == "\f":
+ column = 0
+ else:
+ break
+ pos += 1
+ if pos == max:
+ break
+
+ if stashed:
+ yield stashed
+ stashed = None
+
+ if line[pos] in "\r\n": # skip blank lines
+ yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
+ continue
+
+ if line[pos] == "#": # skip comments
+ comment_token = line[pos:].rstrip("\r\n")
+ nl_pos = pos + len(comment_token)
+ yield (
+ COMMENT,
+ comment_token,
+ (lnum, pos),
+ (lnum, nl_pos),
+ line,
+ )
+ yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
+ continue
+
+ if column > indents[-1]: # count indents
+ indents.append(column)
+ yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+
+ while column < indents[-1]: # count dedents
+ if column not in indents:
+ raise IndentationError(
+ "unindent does not match any outer indentation level",
+ ("<tokenize>", lnum, pos, line),
+ )
+ indents = indents[:-1]
+
+ if async_def and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
+ yield (DEDENT, "", (lnum, pos), (lnum, pos), line)
+
+ if async_def and async_def_nl and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
+ else: # continued statement
+ if not line:
+ raise TokenError("EOF in multi-line statement", (lnum, 0))
+ continued = 0
+
+ while pos < max:
+ if fstring_state.current() == STATE_MIDDLE:
+ endprog = endprog_stack[-1]
+ endmatch = endprog.match(line, pos)
+ if endmatch: # all on one line
+ start, end = endmatch.span(0)
+ token = line[start:end]
+ if token.endswith(('"""', "'''")):
+ middle_token, end_token = token[:-3], token[-3:]
+ middle_epos = end_spos = (lnum, end - 3)
+ else:
+ middle_token, end_token = token[:-1], token[-1]
+ middle_epos = end_spos = (lnum, end - 1)
+ # TODO: unsure if this can be safely removed
+ if stashed:
+ yield stashed
+ stashed = None
+ yield (
+ FSTRING_MIDDLE,
+ middle_token,
+ (lnum, pos),
+ middle_epos,
+ line,
+ )
+ if not token.endswith("{"):
+ yield (
+ FSTRING_END,
+ end_token,
+ end_spos,
+ (lnum, end),
+ line,
+ )
+ fstring_state.leave_fstring()
+ endprog_stack.pop()
+ parenlev = parenlev_stack.pop()
+ else:
+ yield (LBRACE, "{", (lnum, end - 1), (lnum, end), line)
+ fstring_state.consume_lbrace()
+ pos = end
+ continue
+ else: # multiple lines
+ strstart = (lnum, end)
+ contstr = line[end:]
+ contline = line
+ break
+
+ if fstring_state.current() == STATE_IN_COLON:
+ match = fstring_middle_after_colon.match(line, pos)
+ if match is None:
+ formatspec += line[pos:]
+ pos = max
+ continue
+
+ start, end = match.span(1)
+ token = line[start:end]
+ formatspec += token
+
+ brace_start, brace_end = match.span(2)
+ brace_or_nl = line[brace_start:brace_end]
+ if brace_or_nl == "\n":
+ pos = brace_end
+
+ yield (FSTRING_MIDDLE, formatspec, formatspec_start, (lnum, end), line)
+ formatspec = ""
+
+ if brace_or_nl == "{":
+ yield (LBRACE, "{", (lnum, brace_start), (lnum, brace_end), line)
+ fstring_state.consume_lbrace()
+ end = brace_end
+ elif brace_or_nl == "}":
+ yield (RBRACE, "}", (lnum, brace_start), (lnum, brace_end), line)
+ fstring_state.consume_rbrace()
+ end = brace_end
+ formatspec_start = (lnum, brace_end)
+
+ pos = end
+ continue
+
+ if fstring_state.current() == STATE_IN_BRACES and parenlev == 0:
+ match = bang.match(line, pos)
+ if match:
+ start, end = match.span(1)
+ yield (OP, "!", (lnum, start), (lnum, end), line)
+ pos = end
+ continue
+
+ match = colon.match(line, pos)
+ if match:
+ start, end = match.span(1)
+ yield (OP, ":", (lnum, start), (lnum, end), line)
+ fstring_state.consume_colon()
+ formatspec_start = (lnum, end)
+ pos = end
+ continue
+
+ pseudomatch = pseudoprog.match(line, pos)
+ if pseudomatch: # scan for tokens
+ start, end = pseudomatch.span(1)
+ spos, epos, pos = (lnum, start), (lnum, end), end
+ token, initial = line[start:end], line[start]
+
+ if initial in numchars or (
+ initial == "." and token != "."
+ ): # ordinary number
+ yield (NUMBER, token, spos, epos, line)
+ elif initial in "\r\n":
+ newline = NEWLINE
+ if parenlev > 0 or fstring_state.is_in_fstring_expression():
+ newline = NL
+ elif async_def:
+ async_def_nl = True
+ if stashed:
+ yield stashed
+ stashed = None
+ yield (newline, token, spos, epos, line)
+
+ elif initial == "#":
+ assert not token.endswith("\n")
+ if stashed:
+ yield stashed
+ stashed = None
+ yield (COMMENT, token, spos, epos, line)
+ elif token in triple_quoted:
+ endprog = endprogs[token]
+ endprog_stack.append(endprog)
+ parenlev_stack.append(parenlev)
+ parenlev = 0
+ if is_fstring_start(token):
+ yield (FSTRING_START, token, spos, epos, line)
+ fstring_state.enter_fstring()
+
+ endmatch = endprog.match(line, pos)
+ if endmatch: # all on one line
+ if stashed:
+ yield stashed
+ stashed = None
+ if not is_fstring_start(token):
+ pos = endmatch.end(0)
+ token = line[start:pos]
+ epos = (lnum, pos)
+ yield (STRING, token, spos, epos, line)
+ endprog_stack.pop()
+ parenlev = parenlev_stack.pop()
+ else:
+ end = endmatch.end(0)
+ token = line[pos:end]
+ spos, epos = (lnum, pos), (lnum, end)
+ if not token.endswith("{"):
+ fstring_middle, fstring_end = token[:-3], token[-3:]
+ fstring_middle_epos = fstring_end_spos = (lnum, end - 3)
+ yield (
+ FSTRING_MIDDLE,
+ fstring_middle,
+ spos,
+ fstring_middle_epos,
+ line,
+ )
+ yield (
+ FSTRING_END,
+ fstring_end,
+ fstring_end_spos,
+ epos,
+ line,
+ )
+ fstring_state.leave_fstring()
+ endprog_stack.pop()
+ parenlev = parenlev_stack.pop()
+ else:
+ fstring_middle, lbrace = token[:-1], token[-1]
+ fstring_middle_epos = lbrace_spos = (lnum, end - 1)
+ yield (
+ FSTRING_MIDDLE,
+ fstring_middle,
+ spos,
+ fstring_middle_epos,
+ line,
+ )
+ yield (LBRACE, lbrace, lbrace_spos, epos, line)
+ fstring_state.consume_lbrace()
+ pos = end
+ else:
+ # multiple lines
+ if is_fstring_start(token):
+ strstart = (lnum, pos)
+ contstr = line[pos:]
+ else:
+ strstart = (lnum, start)
+ contstr = line[start:]
+ contline = line
+ break
+ elif (
+ initial in single_quoted
+ or token[:2] in single_quoted
+ or token[:3] in single_quoted
+ ):
+ maybe_endprog = (
+ endprogs.get(initial)
+ or endprogs.get(token[:2])
+ or endprogs.get(token[:3])
+ )
+ assert maybe_endprog is not None, f"endprog not found for {token}"
+ endprog = maybe_endprog
+ if token[-1] == "\n": # continued string
+ endprog_stack.append(endprog)
+ parenlev_stack.append(parenlev)
+ parenlev = 0
+ strstart = (lnum, start)
+ contstr, needcont = line[start:], 1
+ contline = line
+ break
+ else: # ordinary string
+ if stashed:
+ yield stashed
+ stashed = None
+
+ if not is_fstring_start(token):
+ yield (STRING, token, spos, epos, line)
+ else:
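+                            # Groups 20/22/24/26 capture the f-string prefix
+                            # plus the opening quote (e.g. "f'") in the four
+                            # f-string alternatives of ContStr defined above.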
+ if pseudomatch[20] is not None:
+ fstring_start = pseudomatch[20]
+ offset = pseudomatch.end(20) - pseudomatch.start(1)
+ elif pseudomatch[22] is not None:
+ fstring_start = pseudomatch[22]
+ offset = pseudomatch.end(22) - pseudomatch.start(1)
+ elif pseudomatch[24] is not None:
+ fstring_start = pseudomatch[24]
+ offset = pseudomatch.end(24) - pseudomatch.start(1)
+ else:
+ fstring_start = pseudomatch[26]
+ offset = pseudomatch.end(26) - pseudomatch.start(1)
+
+ start_epos = (lnum, start + offset)
+ yield (FSTRING_START, fstring_start, spos, start_epos, line)
+ fstring_state.enter_fstring()
+ endprog = endprogs[fstring_start]
+ endprog_stack.append(endprog)
+ parenlev_stack.append(parenlev)
+ parenlev = 0
+
+ end_offset = pseudomatch.end(1) - 1
+ fstring_middle = line[start + offset : end_offset]
+ middle_spos = (lnum, start + offset)
+ middle_epos = (lnum, end_offset)
+ yield (
+ FSTRING_MIDDLE,
+ fstring_middle,
+ middle_spos,
+ middle_epos,
+ line,
+ )
+ if not token.endswith("{"):
+ end_spos = (lnum, end_offset)
+ end_epos = (lnum, end_offset + 1)
+ yield (FSTRING_END, token[-1], end_spos, end_epos, line)
+ fstring_state.leave_fstring()
+ endprog_stack.pop()
+ parenlev = parenlev_stack.pop()
+ else:
+ end_spos = (lnum, end_offset)
+ end_epos = (lnum, end_offset + 1)
+ yield (LBRACE, "{", end_spos, end_epos, line)
+ fstring_state.consume_lbrace()
+
+ elif initial.isidentifier(): # ordinary name
+ if token in ("async", "await"):
+ if async_keywords or async_def:
+ yield (
+ ASYNC if token == "async" else AWAIT,
+ token,
+ spos,
+ epos,
+ line,
+ )
+ continue
+
+ tok = (NAME, token, spos, epos, line)
+ if token == "async" and not stashed:
+ stashed = tok
+ continue
+
+ if token in ("def", "for"):
+ if stashed and stashed[0] == NAME and stashed[1] == "async":
+ if token == "def":
+ async_def = True
+ async_def_indent = indents[-1]
+
+ yield (
+ ASYNC,
+ stashed[1],
+ stashed[2],
+ stashed[3],
+ stashed[4],
+ )
+ stashed = None
+
+ if stashed:
+ yield stashed
+ stashed = None
+
+ yield tok
+ elif initial == "\\": # continued stmt
+ # This yield is new; needed for better idempotency:
+ if stashed:
+ yield stashed
+ stashed = None
+ yield (NL, token, spos, (lnum, pos), line)
+ continued = 1
+ elif (
+ initial == "}"
+ and parenlev == 0
+ and fstring_state.is_in_fstring_expression()
+ ):
+ yield (RBRACE, token, spos, epos, line)
+ fstring_state.consume_rbrace()
+ formatspec_start = epos
+ else:
+ if initial in "([{":
+ parenlev += 1
+ elif initial in ")]}":
+ parenlev -= 1
+ if stashed:
+ yield stashed
+ stashed = None
+ yield (OP, token, spos, epos, line)
+ else:
+ yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
+ pos += 1
+
+ if stashed:
+ yield stashed
+ stashed = None
+
+ for _indent in indents[1:]: # pop remaining indent levels
+ yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
+ yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
+ assert len(endprog_stack) == 0
+ assert len(parenlev_stack) == 0
+
+
+if __name__ == "__main__": # testing
+ if len(sys.argv) > 1:
+ tokenize(open(sys.argv[1]).readline)
+ else:
+ tokenize(sys.stdin.readline)