author    AlexSm <alex@ydb.tech>  2024-03-05 10:40:59 +0100
committer GitHub <noreply@github.com>  2024-03-05 12:40:59 +0300
commit    1ac13c847b5358faba44dbb638a828e24369467b (patch)
tree      07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Lib/tokenize.py
parent    ffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff)
download  ydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz
Library import 16 (#2433)
Co-authored-by: robot-piglet <robot-piglet@yandex-team.com>
Co-authored-by: deshevoy <deshevoy@yandex-team.com>
Co-authored-by: robot-contrib <robot-contrib@yandex-team.com>
Co-authored-by: thegeorg <thegeorg@yandex-team.com>
Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com>
Co-authored-by: svidyuk <svidyuk@yandex-team.com>
Co-authored-by: shadchin <shadchin@yandex-team.com>
Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com>
Co-authored-by: innokentii <innokentii@yandex-team.com>
Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com>
Co-authored-by: snermolaev <snermolaev@yandex-team.com>
Co-authored-by: dimdim11 <dimdim11@yandex-team.com>
Co-authored-by: kickbutt <kickbutt@yandex-team.com>
Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com>
Co-authored-by: korsunandrei <korsunandrei@yandex-team.com>
Co-authored-by: petrk <petrk@yandex-team.com>
Co-authored-by: miroslav2 <miroslav2@yandex-team.com>
Co-authored-by: serjflint <serjflint@yandex-team.com>
Co-authored-by: akhropov <akhropov@yandex-team.com>
Co-authored-by: prettyboy <prettyboy@yandex-team.com>
Co-authored-by: ilikepugs <ilikepugs@yandex-team.com>
Co-authored-by: hiddenpath <hiddenpath@yandex-team.com>
Co-authored-by: mikhnenko <mikhnenko@yandex-team.com>
Co-authored-by: spreis <spreis@yandex-team.com>
Co-authored-by: andreyshspb <andreyshspb@yandex-team.com>
Co-authored-by: dimaandreev <dimaandreev@yandex-team.com>
Co-authored-by: rashid <rashid@yandex-team.com>
Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com>
Co-authored-by: r-vetrov <r-vetrov@yandex-team.com>
Co-authored-by: ypodlesov <ypodlesov@yandex-team.com>
Co-authored-by: zaverden <zaverden@yandex-team.com>
Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com>
Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com>
Co-authored-by: v-korovin <v-korovin@yandex-team.com>
Co-authored-by: arikon <arikon@yandex-team.com>
Co-authored-by: khoden <khoden@yandex-team.com>
Co-authored-by: psydmm <psydmm@yandex-team.com>
Co-authored-by: robot-javacom <robot-javacom@yandex-team.com>
Co-authored-by: dtorilov <dtorilov@yandex-team.com>
Co-authored-by: sennikovmv <sennikovmv@yandex-team.com>
Co-authored-by: hcpp <hcpp@ydb.tech>
Diffstat (limited to 'contrib/tools/python3/Lib/tokenize.py')
-rw-r--r--  contrib/tools/python3/Lib/tokenize.py  547
1 file changed, 547 insertions(+), 0 deletions(-)
diff --git a/contrib/tools/python3/Lib/tokenize.py b/contrib/tools/python3/Lib/tokenize.py
new file mode 100644
index 0000000000..49e8144edd
--- /dev/null
+++ b/contrib/tools/python3/Lib/tokenize.py
@@ -0,0 +1,547 @@
+"""Tokenization help for Python programs.
+
+tokenize(readline) is a generator that breaks a stream of bytes into
+Python tokens. It decodes the bytes according to PEP-0263 for
+determining source file encoding.
+
+It accepts a readline-like method which is called repeatedly to get the
+next line of input (or b"" for EOF). It generates 5-tuples with these
+members:
+
+ the token type (see token.py)
+ the token (a string)
+ the starting (row, column) indices of the token (a 2-tuple of ints)
+ the ending (row, column) indices of the token (a 2-tuple of ints)
+ the original line (string)
+
+It is designed to match the working of the Python tokenizer exactly, except
+that it produces COMMENT tokens for comments and gives type OP for all
+operators. Additionally, all token lists start with an ENCODING token
+which tells you which encoding was used to decode the bytes stream.
+"""
+
+__author__ = 'Ka-Ping Yee <ping@lfw.org>'
+__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
+ 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
+ 'Michael Foord')
+from builtins import open as _builtin_open
+from codecs import lookup, BOM_UTF8
+import collections
+import functools
+from io import TextIOWrapper
+import itertools as _itertools
+import re
+import sys
+from token import *
+from token import EXACT_TOKEN_TYPES
+import _tokenize
+
+cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
+
+import token
+__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
+ "untokenize", "TokenInfo"]
+del token
+
+class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
+ def __repr__(self):
+ annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
+ return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
+ self._replace(type=annotated_type))
+
+ @property
+ def exact_type(self):
+ if self.type == OP and self.string in EXACT_TOKEN_TYPES:
+ return EXACT_TOKEN_TYPES[self.string]
+ else:
+ return self.type
+
+def group(*choices): return '(' + '|'.join(choices) + ')'
+def any(*choices): return group(*choices) + '*'
+def maybe(*choices): return group(*choices) + '?'
+
+# Note: we use unicode matching for names ("\w") but ascii matching for
+# number literals.
+Whitespace = r'[ \f\t]*'
+Comment = r'#[^\r\n]*'
+Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
+Name = r'\w+'
+
+Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+Binnumber = r'0[bB](?:_?[01])+'
+Octnumber = r'0[oO](?:_?[0-7])+'
+Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
+Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
+Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
+ r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
+Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)
+
+# Return the empty string, plus all of the valid string prefixes.
+def _all_string_prefixes():
+ # The valid string prefixes. Only contain the lower case versions,
+ # and don't contain any permutations (include 'fr', but not
+ # 'rf'). The various permutations will be generated.
+ _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
+ # if we add binary f-strings, add: ['fb', 'fbr']
+ result = {''}
+ for prefix in _valid_string_prefixes:
+ for t in _itertools.permutations(prefix):
+ # create a list with upper and lower versions of each
+ # character
+ for u in _itertools.product(*[(c, c.upper()) for c in t]):
+ result.add(''.join(u))
+ return result
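+
+# For example, the two-character prefix 'fr' contributes 'fr', 'fR', 'Fr',
+# 'FR', 'rf', 'rF', 'Rf' and 'RF', so every casing and ordering is matched.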
+
+@functools.lru_cache
+def _compile(expr):
+ return re.compile(expr, re.UNICODE)
+
+# Note that since _all_string_prefixes includes the empty string,
+# StringPrefix can be the empty string (making it optional).
+StringPrefix = group(*_all_string_prefixes())
+
+# Tail end of ' string.
+Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+# Tail end of " string.
+Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+# Tail end of ''' string.
+Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+# Tail end of """ string.
+Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
+Triple = group(StringPrefix + "'''", StringPrefix + '"""')
+# Single-line ' or " string.
+String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+ StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+
+# Sorting in reverse order puts the long operators before their prefixes.
+# Otherwise if = came before ==, == would get recognized as two instances
+# of =.
+Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
+Funny = group(r'\r?\n', Special)
+
+PlainToken = group(Number, Funny, String, Name)
+Token = Ignore + PlainToken
+
+# First (or only) line of ' or " string.
+ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+ group("'", r'\\\r?\n'),
+ StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+ group('"', r'\\\r?\n'))
+PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
+
+# For a given string prefix plus quotes, endpats maps it to a regex
+# to match the remainder of that string. _prefix can be empty, for
+# a normal single or triple quoted string (with no prefix).
+endpats = {}
+for _prefix in _all_string_prefixes():
+ endpats[_prefix + "'"] = Single
+ endpats[_prefix + '"'] = Double
+ endpats[_prefix + "'''"] = Single3
+ endpats[_prefix + '"""'] = Double3
+del _prefix
+
+# A set of all of the single and triple quoted string prefixes,
+# including the opening quotes.
+single_quoted = set()
+triple_quoted = set()
+for t in _all_string_prefixes():
+ for u in (t + '"', t + "'"):
+ single_quoted.add(u)
+ for u in (t + '"""', t + "'''"):
+ triple_quoted.add(u)
+del t, u
+
+tabsize = 8
+
+class TokenError(Exception): pass
+
+
+class StopTokenizing(Exception): pass
+
+class Untokenizer:
+
+ def __init__(self):
+ self.tokens = []
+ self.prev_row = 1
+ self.prev_col = 0
+ self.encoding = None
+
+ def add_whitespace(self, start):
+ row, col = start
+ if row < self.prev_row or row == self.prev_row and col < self.prev_col:
+ raise ValueError("start ({},{}) precedes previous end ({},{})"
+ .format(row, col, self.prev_row, self.prev_col))
+ row_offset = row - self.prev_row
+ if row_offset:
+ self.tokens.append("\\\n" * row_offset)
+ self.prev_col = 0
+ col_offset = col - self.prev_col
+ if col_offset:
+ self.tokens.append(" " * col_offset)
+
+ def untokenize(self, iterable):
+ it = iter(iterable)
+ indents = []
+ startline = False
+ for t in it:
+ if len(t) == 2:
+ self.compat(t, it)
+ break
+ tok_type, token, start, end, line = t
+ if tok_type == ENCODING:
+ self.encoding = token
+ continue
+ if tok_type == ENDMARKER:
+ break
+ if tok_type == INDENT:
+ indents.append(token)
+ continue
+ elif tok_type == DEDENT:
+ indents.pop()
+ self.prev_row, self.prev_col = end
+ continue
+ elif tok_type in (NEWLINE, NL):
+ startline = True
+ elif startline and indents:
+ indent = indents[-1]
+ if start[1] >= len(indent):
+ self.tokens.append(indent)
+ self.prev_col = len(indent)
+ startline = False
+ elif tok_type == FSTRING_MIDDLE:
+ if '{' in token or '}' in token:
+ end_line, end_col = end
+ end = (end_line, end_col + token.count('{') + token.count('}'))
+ token = re.sub('{', '{{', token)
+ token = re.sub('}', '}}', token)
+
+
+ self.add_whitespace(start)
+ self.tokens.append(token)
+ self.prev_row, self.prev_col = end
+ if tok_type in (NEWLINE, NL):
+ self.prev_row += 1
+ self.prev_col = 0
+ return "".join(self.tokens)
+
+ def compat(self, token, iterable):
+ indents = []
+ toks_append = self.tokens.append
+ startline = token[0] in (NEWLINE, NL)
+ prevstring = False
+
+ for tok in _itertools.chain([token], iterable):
+ toknum, tokval = tok[:2]
+ if toknum == ENCODING:
+ self.encoding = tokval
+ continue
+
+ if toknum in (NAME, NUMBER):
+ tokval += ' '
+
+ # Insert a space between two consecutive strings
+ if toknum == STRING:
+ if prevstring:
+ tokval = ' ' + tokval
+ prevstring = True
+ else:
+ prevstring = False
+
+ if toknum == INDENT:
+ indents.append(tokval)
+ continue
+ elif toknum == DEDENT:
+ indents.pop()
+ continue
+ elif toknum in (NEWLINE, NL):
+ startline = True
+ elif startline and indents:
+ toks_append(indents[-1])
+ startline = False
+ elif toknum == FSTRING_MIDDLE:
+ if '{' in tokval or '}' in tokval:
+ tokval = re.sub('{', '{{', tokval)
+ tokval = re.sub('}', '}}', tokval)
+
+ toks_append(tokval)
+
+
+def untokenize(iterable):
+ """Transform tokens back into Python source code.
+ It returns a bytes object, encoded using the ENCODING
+ token, which is the first token sequence output by tokenize.
+
+ Each element returned by the iterable must be a token sequence
+ with at least two elements, a token number and token value. If
+ only two tokens are passed, the resulting output is poor.
+
+ Round-trip invariant for full input:
+ Untokenized source will match input source exactly
+
+ Round-trip invariant for limited input:
+ # Output bytes will tokenize back to the input
+ t1 = [tok[:2] for tok in tokenize(f.readline)]
+ newcode = untokenize(t1)
+ readline = BytesIO(newcode).readline
+ t2 = [tok[:2] for tok in tokenize(readline)]
+ assert t1 == t2
+ """
+ ut = Untokenizer()
+ out = ut.untokenize(iterable)
+ if ut.encoding is not None:
+ out = out.encode(ut.encoding)
+ return out
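+
+# A minimal round-trip sketch for the full-input invariant above (the name
+# `source` stands in for a bytes object holding valid Python code):
+#
+#     from io import BytesIO
+#     toks = list(tokenize(BytesIO(source).readline))
+#     round_tripped = untokenize(toks)  # bytes, encoded per the ENCODING token
+#     # per the invariant above, round_tripped matches `source` exactly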
+
+
+def _get_normal_name(orig_enc):
+ """Imitates get_normal_name in tokenizer.c."""
+ # Only care about the first 12 characters.
+ enc = orig_enc[:12].lower().replace("_", "-")
+ if enc == "utf-8" or enc.startswith("utf-8-"):
+ return "utf-8"
+ if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+ enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+ return "iso-8859-1"
+ return orig_enc
+
+def detect_encoding(readline):
+ """
+ The detect_encoding() function is used to detect the encoding that should
+ be used to decode a Python source file. It requires one argument, readline,
+ in the same way as the tokenize() generator.
+
+ It will call readline a maximum of twice, and return the encoding used
+ (as a string) and a list of any lines (left as bytes) it has read in.
+
+ It detects the encoding from the presence of a utf-8 bom or an encoding
+ cookie as specified in pep-0263. If both a bom and a cookie are present,
+ but disagree, a SyntaxError will be raised. If the encoding cookie is an
+ invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+ 'utf-8-sig' is returned.
+
+ If no encoding is specified, then the default of 'utf-8' will be returned.
+ """
+ try:
+ filename = readline.__self__.name
+ except AttributeError:
+ filename = None
+ bom_found = False
+ encoding = None
+ default = 'utf-8'
+ def read_or_stop():
+ try:
+ return readline()
+ except StopIteration:
+ return b''
+
+ def find_cookie(line):
+ try:
+ # Decode as UTF-8. Either the line is an encoding declaration,
+ # in which case it should be pure ASCII, or it must be UTF-8
+ # per default encoding.
+ line_string = line.decode('utf-8')
+ except UnicodeDecodeError:
+ msg = "invalid or missing encoding declaration"
+ if filename is not None:
+ msg = '{} for {!r}'.format(msg, filename)
+ raise SyntaxError(msg)
+
+ match = cookie_re.match(line_string)
+ if not match:
+ return None
+ encoding = _get_normal_name(match.group(1))
+ try:
+ codec = lookup(encoding)
+ except LookupError:
+ # This behaviour mimics the Python interpreter
+ if filename is None:
+ msg = "unknown encoding: " + encoding
+ else:
+ msg = "unknown encoding for {!r}: {}".format(filename,
+ encoding)
+ raise SyntaxError(msg)
+
+ if bom_found:
+ if encoding != 'utf-8':
+ # This behaviour mimics the Python interpreter
+ if filename is None:
+ msg = 'encoding problem: utf-8'
+ else:
+ msg = 'encoding problem for {!r}: utf-8'.format(filename)
+ raise SyntaxError(msg)
+ encoding += '-sig'
+ return encoding
+
+ first = read_or_stop()
+ if first.startswith(BOM_UTF8):
+ bom_found = True
+ first = first[3:]
+ default = 'utf-8-sig'
+ if not first:
+ return default, []
+
+ encoding = find_cookie(first)
+ if encoding:
+ return encoding, [first]
+ if not blank_re.match(first):
+ return default, [first]
+
+ second = read_or_stop()
+ if not second:
+ return default, [first]
+
+ encoding = find_cookie(second)
+ if encoding:
+ return encoding, [first, second]
+
+ return default, [first, second]
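+
+# A minimal usage sketch (the path 'example.py' is only illustrative):
+#
+#     with _builtin_open('example.py', 'rb') as f:
+#         encoding, consumed = detect_encoding(f.readline)
+#     # `encoding` is e.g. 'utf-8' or 'utf-8-sig'; `consumed` holds the at
+#     # most two bytes lines read while looking for a coding cookie.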
+
+
+def open(filename):
+ """Open a file in read only mode using the encoding detected by
+ detect_encoding().
+ """
+ buffer = _builtin_open(filename, 'rb')
+ try:
+ encoding, lines = detect_encoding(buffer.readline)
+ buffer.seek(0)
+ text = TextIOWrapper(buffer, encoding, line_buffering=True)
+ text.mode = 'r'
+ return text
+ except:
+ buffer.close()
+ raise
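+
+# A minimal usage sketch (the path is illustrative); the returned
+# TextIOWrapper reads text decoded with the detected encoding:
+#
+#     with open('example.py') as f:   # tokenize.open, not builtins.open
+#         source_text = f.read()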
+
+def tokenize(readline):
+ """
+ The tokenize() generator requires one argument, readline, which
+ must be a callable object which provides the same interface as the
+ readline() method of built-in file objects. Each call to the function
+ should return one line of input as bytes. Alternatively, readline
+ can be a callable function terminating with StopIteration:
+ readline = open(myfile, 'rb').__next__ # Example of alternate readline
+
+ The generator produces 5-tuples with these members: the token type; the
+ token string; a 2-tuple (srow, scol) of ints specifying the row and
+ column where the token begins in the source; a 2-tuple (erow, ecol) of
+ ints specifying the row and column where the token ends in the source;
+ and the line on which the token was found. The line passed is the
+ physical line.
+
+ The first token sequence will always be an ENCODING token
+ which tells you which encoding was used to decode the bytes stream.
+ """
+ encoding, consumed = detect_encoding(readline)
+ rl_gen = _itertools.chain(consumed, iter(readline, b""))
+ if encoding is not None:
+ if encoding == "utf-8-sig":
+ # BOM will already have been stripped.
+ encoding = "utf-8"
+ yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
+ yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
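+
+# A minimal usage sketch (the filename is illustrative):
+#
+#     with _builtin_open('example.py', 'rb') as f:
+#         for tok in tokenize(f.readline):
+#             print(tok.start, tok.end, tok_name[tok.exact_type], tok.string)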
+
+def generate_tokens(readline):
+ """Tokenize a source reading Python code as unicode strings.
+
+ This has the same API as tokenize(), except that it expects the *readline*
+ callable to return str objects instead of bytes.
+ """
+ return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
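+
+# A minimal usage sketch: because the readline callable must return str
+# here, an io.StringIO line reader is enough:
+#
+#     import io
+#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
+#         print(tok)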
+
+def main():
+ import argparse
+
+ # Helper error handling routines
+ def perror(message):
+ sys.stderr.write(message)
+ sys.stderr.write('\n')
+
+ def error(message, filename=None, location=None):
+ if location:
+ args = (filename,) + location + (message,)
+ perror("%s:%d:%d: error: %s" % args)
+ elif filename:
+ perror("%s: error: %s" % (filename, message))
+ else:
+ perror("error: %s" % message)
+ sys.exit(1)
+
+ # Parse the arguments and options
+ parser = argparse.ArgumentParser(prog='python -m tokenize')
+ parser.add_argument(dest='filename', nargs='?',
+ metavar='filename.py',
+ help='the file to tokenize; defaults to stdin')
+ parser.add_argument('-e', '--exact', dest='exact', action='store_true',
+ help='display token names using the exact type')
+ args = parser.parse_args()
+
+ try:
+ # Tokenize the input
+ if args.filename:
+ filename = args.filename
+ with _builtin_open(filename, 'rb') as f:
+ tokens = list(tokenize(f.readline))
+ else:
+ filename = "<stdin>"
+ tokens = _generate_tokens_from_c_tokenizer(
+ sys.stdin.readline, extra_tokens=True)
+
+
+ # Output the tokenization
+ for token in tokens:
+ token_type = token.type
+ if args.exact:
+ token_type = token.exact_type
+ token_range = "%d,%d-%d,%d:" % (token.start + token.end)
+ print("%-20s%-15s%-15r" %
+ (token_range, tok_name[token_type], token.string))
+ except IndentationError as err:
+ line, column = err.args[1][1:3]
+ error(err.args[0], filename, (line, column))
+ except TokenError as err:
+ line, column = err.args[1]
+ error(err.args[0], filename, (line, column))
+ except SyntaxError as err:
+ error(err, filename)
+ except OSError as err:
+ error(err)
+ except KeyboardInterrupt:
+ print("interrupted\n")
+ except Exception as err:
+ perror("unexpected error: %s" % err)
+ raise
+
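+# Command-line sketch (the filename is illustrative):
+#
+#     python -m tokenize example.py        # tokenize a file
+#     python -m tokenize -e example.py     # report exact operator token names
+#     python -m tokenize < example.py      # read the program from stdin
+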
+def _transform_msg(msg):
+ """Transform error messages from the C tokenizer into the Python tokenize
+
+ The C tokenizer is more picky than the Python one, so we need to massage
+ the error messages a bit for backwards compatibility.
+ """
+ if "unterminated triple-quoted string literal" in msg:
+ return "EOF in multi-line string"
+ return msg
+
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
+ """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
+ if encoding is None:
+ it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+ else:
+ it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+ try:
+ for info in it:
+ yield TokenInfo._make(info)
+ except SyntaxError as e:
+ if type(e) != SyntaxError:
+ raise e from None
+ msg = _transform_msg(e.msg)
+ raise TokenError(msg, (e.lineno, e.offset)) from None
+
+
+if __name__ == "__main__":
+ main()