path: root/contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py
author      Mikhail Borisov <borisov.mikhail@gmail.com>    2022-02-10 16:45:39 +0300
committer   Daniil Cherednik <dcherednik@yandex-team.ru>    2022-02-10 16:45:39 +0300
commit      a6a92afe03e02795227d2641b49819b687f088f8 (patch)
tree        f6984a1d27d5a7ec88a6fdd6e20cd5b7693b6ece /contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py
parent      c6dc8b8bd530985bc4cce0137e9a5de32f1087cb (diff)
download    ydb-a6a92afe03e02795227d2641b49819b687f088f8.tar.gz
Restoring authorship annotation for Mikhail Borisov <borisov.mikhail@gmail.com>. Commit 1 of 2.
Diffstat (limited to 'contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py')
-rw-r--r--    contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py    1190
1 file changed, 595 insertions, 595 deletions
diff --git a/contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py b/contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py
index ee1fd9e639..ca85023c32 100644
--- a/contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py
+++ b/contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py
@@ -1,595 +1,595 @@
-"""Patched version of standard library tokenize, to deal with various bugs.
-
-Based on Python 3.2 code.
-
-Patches:
-
-- Gareth Rees' patch for Python issue #12691 (untokenizing)
- - Except we don't encode the output of untokenize
- - Python 2 compatible syntax, so that it can be byte-compiled at installation
-- Newlines in comments and blank lines should be either NL or NEWLINE, depending
- on whether they are in a multi-line statement. Filed as Python issue #17061.
-- Export generate_tokens & TokenError
-- u and rb literals are allowed under Python 3.3 and above.
-
-------------------------------------------------------------------------------
-Tokenization help for Python programs.
-
-tokenize(readline) is a generator that breaks a stream of bytes into
-Python tokens. It decodes the bytes according to PEP-0263 for
-determining source file encoding.
-
-It accepts a readline-like method which is called repeatedly to get the
-next line of input (or b"" for EOF). It generates 5-tuples with these
-members:
-
- the token type (see token.py)
- the token (a string)
- the starting (row, column) indices of the token (a 2-tuple of ints)
- the ending (row, column) indices of the token (a 2-tuple of ints)
- the original line (string)
-
-It is designed to match the working of the Python tokenizer exactly, except
-that it produces COMMENT tokens for comments and gives type OP for all
-operators. Additionally, all token lists start with an ENCODING token
-which tells you which encoding was used to decode the bytes stream.
-"""
-from __future__ import absolute_import
-
-__author__ = 'Ka-Ping Yee <ping@lfw.org>'
-__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
- 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
- 'Michael Foord')
-import builtins
-import re
-import sys
-from token import *
-from codecs import lookup, BOM_UTF8
-import collections
-from io import TextIOWrapper
-cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
-
-import token
-__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
- "NL", "untokenize", "ENCODING", "TokenInfo"]
-del token
-
-__all__ += ["generate_tokens", "TokenError"]
-
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-ENCODING = N_TOKENS + 2
-tok_name[ENCODING] = 'ENCODING'
-N_TOKENS += 3
-
-class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
- def __repr__(self):
- annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
- return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
- self._replace(type=annotated_type))
-
-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
-
-# Note: we use unicode matching for names ("\w") but ascii matching for
-# number literals.
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'\w+'
-
-Hexnumber = r'0[xX][0-9a-fA-F]+'
-Binnumber = r'0[bB][01]+'
-Octnumber = r'0[oO][0-7]+'
-Decnumber = r'(?:0+|[1-9][0-9]*)'
-Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9]+'
-Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
-Expfloat = r'[0-9]+' + Exponent
-Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)
-
-if sys.version_info.minor >= 3:
- StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
-else:
- StringPrefix = r'(?:[bB]?[rR]?)?'
-
-# Tail end of ' string.
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group(StringPrefix + "'''", StringPrefix + '"""')
-# Single-line ' or " string.
-String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
- StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
- r"//=?", r"->",
- r"[+\-*/%&|^=<>]=?",
- r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
-
-PlainToken = group(Number, Funny, String, Name)
-Token = Ignore + PlainToken
-
-# First (or only) line of ' or " string.
-ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
- group("'", r'\\\r?\n'),
- StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
- group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n', Comment, Triple)
-PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-
-def _compile(expr):
- return re.compile(expr, re.UNICODE)
-
-tokenprog, pseudoprog, single3prog, double3prog = map(
- _compile, (Token, PseudoToken, Single3, Double3))
-endprogs = {"'": _compile(Single), '"': _compile(Double),
- "'''": single3prog, '"""': double3prog,
- "r'''": single3prog, 'r"""': double3prog,
- "b'''": single3prog, 'b"""': double3prog,
- "R'''": single3prog, 'R"""': double3prog,
- "B'''": single3prog, 'B"""': double3prog,
- "br'''": single3prog, 'br"""': double3prog,
- "bR'''": single3prog, 'bR"""': double3prog,
- "Br'''": single3prog, 'Br"""': double3prog,
- "BR'''": single3prog, 'BR"""': double3prog,
- 'r': None, 'R': None, 'b': None, 'B': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
- "r'''", 'r"""', "R'''", 'R"""',
- "b'''", 'b"""', "B'''", 'B"""',
- "br'''", 'br"""', "Br'''", 'Br"""',
- "bR'''", 'bR"""', "BR'''", 'BR"""'):
- triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
- "r'", 'r"', "R'", 'R"',
- "b'", 'b"', "B'", 'B"',
- "br'", 'br"', "Br'", 'Br"',
- "bR'", 'bR"', "BR'", 'BR"' ):
- single_quoted[t] = t
-
-if sys.version_info.minor >= 3:
- # Python 3.3
- for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
- _t2 = _prefix+'"""'
- endprogs[_t2] = double3prog
- triple_quoted[_t2] = _t2
- _t1 = _prefix + "'''"
- endprogs[_t1] = single3prog
- triple_quoted[_t1] = _t1
- single_quoted[_prefix+'"'] = _prefix+'"'
- single_quoted[_prefix+"'"] = _prefix+"'"
- del _prefix, _t2, _t1
- endprogs['u'] = None
- endprogs['U'] = None
-
-del _compile
-
-tabsize = 8
-
-class TokenError(Exception): pass
-
-class StopTokenizing(Exception): pass
-
-
-class Untokenizer:
-
- def __init__(self):
- self.tokens = []
- self.prev_row = 1
- self.prev_col = 0
- self.encoding = 'utf-8'
-
- def add_whitespace(self, tok_type, start):
- row, col = start
- assert row >= self.prev_row
- col_offset = col - self.prev_col
- if col_offset > 0:
- self.tokens.append(" " * col_offset)
- elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
- # Line was backslash-continued.
- self.tokens.append(" ")
-
- def untokenize(self, tokens):
- iterable = iter(tokens)
- for t in iterable:
- if len(t) == 2:
- self.compat(t, iterable)
- break
- tok_type, token, start, end = t[:4]
- if tok_type == ENCODING:
- self.encoding = token
- continue
- self.add_whitespace(tok_type, start)
- self.tokens.append(token)
- self.prev_row, self.prev_col = end
- if tok_type in (NEWLINE, NL):
- self.prev_row += 1
- self.prev_col = 0
- return "".join(self.tokens)
-
- def compat(self, token, iterable):
- # This import is here to avoid problems when the itertools
- # module is not built yet and tokenize is imported.
- from itertools import chain
- startline = False
- prevstring = False
- indents = []
- toks_append = self.tokens.append
-
- for tok in chain([token], iterable):
- toknum, tokval = tok[:2]
- if toknum == ENCODING:
- self.encoding = tokval
- continue
-
- if toknum in (NAME, NUMBER):
- tokval += ' '
-
- # Insert a space between two consecutive strings
- if toknum == STRING:
- if prevstring:
- tokval = ' ' + tokval
- prevstring = True
- else:
- prevstring = False
-
- if toknum == INDENT:
- indents.append(tokval)
- continue
- elif toknum == DEDENT:
- indents.pop()
- continue
- elif toknum in (NEWLINE, NL):
- startline = True
- elif startline and indents:
- toks_append(indents[-1])
- startline = False
- toks_append(tokval)
-
-
-def untokenize(tokens):
- """
- Convert ``tokens`` (an iterable) back into Python source code. Return
- a string; unlike the standard library version, the output is not
- encoded back to bytes (see the patch notes at the top of this module).
-
- The result is guaranteed to tokenize back to match the input so that
- the conversion is lossless and round-trips are assured. The
- guarantee applies only to the token type and token string as the
- spacing between tokens (column positions) may change.
-
- :func:`untokenize` has two modes. If the input tokens are sequences
- of length 2 (``type``, ``string``) then spaces are added as necessary to
- preserve the round-trip property.
-
- If the input tokens are sequences of length 4 or more (``type``,
- ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
- spaces are added so that each token appears in the result at the
- position indicated by ``start`` and ``end``, if possible.
- """
- return Untokenizer().untokenize(tokens)
-
-
-def _get_normal_name(orig_enc):
- """Imitates get_normal_name in tokenizer.c."""
- # Only care about the first 12 characters.
- enc = orig_enc[:12].lower().replace("_", "-")
- if enc == "utf-8" or enc.startswith("utf-8-"):
- return "utf-8"
- if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
- enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
- return "iso-8859-1"
- return orig_enc
-
-def detect_encoding(readline):
- """
- The detect_encoding() function is used to detect the encoding that should
- be used to decode a Python source file. It requires one argument, readline,
- in the same way as the tokenize() generator.
-
- It will call readline a maximum of twice, and return the encoding used
- (as a string) and a list of any lines (left as bytes) it has read in.
-
- It detects the encoding from the presence of a utf-8 bom or an encoding
- cookie as specified in pep-0263. If both a bom and a cookie are present,
- but disagree, a SyntaxError will be raised. If the encoding cookie is an
- invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
- 'utf-8-sig' is returned.
-
- If no encoding is specified, then the default of 'utf-8' will be returned.
- """
- bom_found = False
- encoding = None
- default = 'utf-8'
- def read_or_stop():
- try:
- return readline()
- except StopIteration:
- return b''
-
- def find_cookie(line):
- try:
- # Decode as UTF-8. Either the line is an encoding declaration,
- # in which case it should be pure ASCII, or it must be UTF-8
- # per default encoding.
- line_string = line.decode('utf-8')
- except UnicodeDecodeError:
- raise SyntaxError("invalid or missing encoding declaration")
-
- matches = cookie_re.findall(line_string)
- if not matches:
- return None
- encoding = _get_normal_name(matches[0])
- try:
- codec = lookup(encoding)
- except LookupError:
- # This behaviour mimics the Python interpreter
- raise SyntaxError("unknown encoding: " + encoding)
-
- if bom_found:
- if encoding != 'utf-8':
- # This behaviour mimics the Python interpreter
- raise SyntaxError('encoding problem: utf-8')
- encoding += '-sig'
- return encoding
-
- first = read_or_stop()
- if first.startswith(BOM_UTF8):
- bom_found = True
- first = first[3:]
- default = 'utf-8-sig'
- if not first:
- return default, []
-
- encoding = find_cookie(first)
- if encoding:
- return encoding, [first]
-
- second = read_or_stop()
- if not second:
- return default, [first]
-
- encoding = find_cookie(second)
- if encoding:
- return encoding, [first, second]
-
- return default, [first, second]
-
-
-def open(filename):
- """Open a file in read only mode using the encoding detected by
- detect_encoding().
- """
- buffer = builtins.open(filename, 'rb')
- encoding, lines = detect_encoding(buffer.readline)
- buffer.seek(0)
- text = TextIOWrapper(buffer, encoding, line_buffering=True)
- text.mode = 'r'
- return text
-
-
-def tokenize(readline):
- """
- The tokenize() generator requires one argument, readline, which
- must be a callable object which provides the same interface as the
- readline() method of built-in file objects. Each call to the function
- should return one line of input as bytes. Alternately, readline
- can be a callable function terminating with StopIteration:
- readline = open(myfile, 'rb').__next__ # Example of alternate readline
-
- The generator produces 5-tuples with these members: the token type; the
- token string; a 2-tuple (srow, scol) of ints specifying the row and
- column where the token begins in the source; a 2-tuple (erow, ecol) of
- ints specifying the row and column where the token ends in the source;
- and the line on which the token was found. The line passed is the
- logical line; continuation lines are included.
-
- The first token sequence will always be an ENCODING token
- which tells you which encoding was used to decode the bytes stream.
- """
- # This import is here to avoid problems when the itertools module is not
- # built yet and tokenize is imported.
- from itertools import chain, repeat
- encoding, consumed = detect_encoding(readline)
- rl_gen = iter(readline, b"")
- empty = repeat(b"")
- return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
-
-
-def _tokenize(readline, encoding):
- lnum = parenlev = continued = 0
- numchars = '0123456789'
- contstr, needcont = '', 0
- contline = None
- indents = [0]
-
- if encoding is not None:
- if encoding == "utf-8-sig":
- # BOM will already have been stripped.
- encoding = "utf-8"
- yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
- while True: # loop over lines in stream
- try:
- line = readline()
- except StopIteration:
- line = b''
-
- if encoding is not None:
- line = line.decode(encoding)
- lnum += 1
- pos, max = 0, len(line)
-
- if contstr: # continued string
- if not line:
- raise TokenError("EOF in multi-line string", strstart)
- endmatch = endprog.match(line)
- if endmatch:
- pos = end = endmatch.end(0)
- yield TokenInfo(STRING, contstr + line[:end],
- strstart, (lnum, end), contline + line)
- contstr, needcont = '', 0
- contline = None
- elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
- yield TokenInfo(ERRORTOKEN, contstr + line,
- strstart, (lnum, len(line)), contline)
- contstr = ''
- contline = None
- continue
- else:
- contstr = contstr + line
- contline = contline + line
- continue
-
- elif parenlev == 0 and not continued: # new statement
- if not line: break
- column = 0
- while pos < max: # measure leading whitespace
- if line[pos] == ' ':
- column += 1
- elif line[pos] == '\t':
- column = (column//tabsize + 1)*tabsize
- elif line[pos] == '\f':
- column = 0
- else:
- break
- pos += 1
- if pos == max:
- break
-
- if line[pos] in '#\r\n': # skip comments or blank lines
- if line[pos] == '#':
- comment_token = line[pos:].rstrip('\r\n')
- nl_pos = pos + len(comment_token)
- yield TokenInfo(COMMENT, comment_token,
- (lnum, pos), (lnum, pos + len(comment_token)), line)
- yield TokenInfo(NEWLINE, line[nl_pos:],
- (lnum, nl_pos), (lnum, len(line)), line)
- else:
- yield TokenInfo(NEWLINE, line[pos:],
- (lnum, pos), (lnum, len(line)), line)
- continue
-
- if column > indents[-1]: # count indents or dedents
- indents.append(column)
- yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
- while column < indents[-1]:
- if column not in indents:
- raise IndentationError(
- "unindent does not match any outer indentation level",
- ("<tokenize>", lnum, pos, line))
- indents = indents[:-1]
- yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
- else: # continued statement
- if not line:
- raise TokenError("EOF in multi-line statement", (lnum, 0))
- continued = 0
-
- while pos < max:
- pseudomatch = pseudoprog.match(line, pos)
- if pseudomatch: # scan for tokens
- start, end = pseudomatch.span(1)
- spos, epos, pos = (lnum, start), (lnum, end), end
- token, initial = line[start:end], line[start]
-
- if (initial in numchars or # ordinary number
- (initial == '.' and token != '.' and token != '...')):
- yield TokenInfo(NUMBER, token, spos, epos, line)
- elif initial in '\r\n':
- yield TokenInfo(NL if parenlev > 0 else NEWLINE,
- token, spos, epos, line)
- elif initial == '#':
- assert not token.endswith("\n")
- yield TokenInfo(COMMENT, token, spos, epos, line)
- elif token in triple_quoted:
- endprog = endprogs[token]
- endmatch = endprog.match(line, pos)
- if endmatch: # all on one line
- pos = endmatch.end(0)
- token = line[start:pos]
- yield TokenInfo(STRING, token, spos, (lnum, pos), line)
- else:
- strstart = (lnum, start) # multiple lines
- contstr = line[start:]
- contline = line
- break
- elif initial in single_quoted or \
- token[:2] in single_quoted or \
- token[:3] in single_quoted:
- if token[-1] == '\n': # continued string
- strstart = (lnum, start)
- endprog = (endprogs[initial] or endprogs[token[1]] or
- endprogs[token[2]])
- contstr, needcont = line[start:], 1
- contline = line
- break
- else: # ordinary string
- yield TokenInfo(STRING, token, spos, epos, line)
- elif initial.isidentifier(): # ordinary name
- yield TokenInfo(NAME, token, spos, epos, line)
- elif initial == '\\': # continued stmt
- continued = 1
- else:
- if initial in '([{':
- parenlev += 1
- elif initial in ')]}':
- parenlev -= 1
- yield TokenInfo(OP, token, spos, epos, line)
- else:
- yield TokenInfo(ERRORTOKEN, line[pos],
- (lnum, pos), (lnum, pos+1), line)
- pos += 1
-
- for indent in indents[1:]: # pop remaining indent levels
- yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
- yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
-
-
-# An undocumented, backwards compatible, API for all the places in the standard
-# library that expect to be able to use tokenize with strings
-def generate_tokens(readline):
- return _tokenize(readline, None)
-
-if __name__ == "__main__":
- # Quick sanity check
- s = b'''def parseline(self, line):
- """Parse the line into a command name and a string containing
- the arguments. Returns a tuple containing (command, args, line).
- 'command' and 'args' may be None if the line couldn't be parsed.
- """
- line = line.strip()
- if not line:
- return None, None, line
- elif line[0] == '?':
- line = 'help ' + line[1:]
- elif line[0] == '!':
- if hasattr(self, 'do_shell'):
- line = 'shell ' + line[1:]
- else:
- return None, None, line
- i, n = 0, len(line)
- while i < n and line[i] in self.identchars: i = i+1
- cmd, arg = line[:i], line[i:].strip()
- return cmd, arg, line
- '''
- for tok in tokenize(iter(s.splitlines()).__next__):
- print(tok)
+"""Patched version of standard library tokenize, to deal with various bugs.
+
+Based on Python 3.2 code.
+
+Patches:
+
+- Gareth Rees' patch for Python issue #12691 (untokenizing)
+ - Except we don't encode the output of untokenize
+ - Python 2 compatible syntax, so that it can be byte-compiled at installation
+- Newlines in comments and blank lines should be either NL or NEWLINE, depending
+ on whether they are in a multi-line statement. Filed as Python issue #17061.
+- Export generate_tokens & TokenError
+- u and rb literals are allowed under Python 3.3 and above.
+
+------------------------------------------------------------------------------
+Tokenization help for Python programs.
+
+tokenize(readline) is a generator that breaks a stream of bytes into
+Python tokens. It decodes the bytes according to PEP-0263 for
+determining source file encoding.
+
+It accepts a readline-like method which is called repeatedly to get the
+next line of input (or b"" for EOF). It generates 5-tuples with these
+members:
+
+ the token type (see token.py)
+ the token (a string)
+ the starting (row, column) indices of the token (a 2-tuple of ints)
+ the ending (row, column) indices of the token (a 2-tuple of ints)
+ the original line (string)
+
+It is designed to match the working of the Python tokenizer exactly, except
+that it produces COMMENT tokens for comments and gives type OP for all
+operators. Additionally, all token lists start with an ENCODING token
+which tells you which encoding was used to decode the bytes stream.
+"""
+from __future__ import absolute_import
+
+__author__ = 'Ka-Ping Yee <ping@lfw.org>'
+__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
+ 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
+ 'Michael Foord')
+import builtins
+import re
+import sys
+from token import *
+from codecs import lookup, BOM_UTF8
+import collections
+from io import TextIOWrapper
+cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
+
+import token
+__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
+ "NL", "untokenize", "ENCODING", "TokenInfo"]
+del token
+
+__all__ += ["generate_tokens", "TokenError"]
+
+COMMENT = N_TOKENS
+tok_name[COMMENT] = 'COMMENT'
+NL = N_TOKENS + 1
+tok_name[NL] = 'NL'
+ENCODING = N_TOKENS + 2
+tok_name[ENCODING] = 'ENCODING'
+N_TOKENS += 3
+
+class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
+ def __repr__(self):
+ annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
+ return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
+ self._replace(type=annotated_type))
+
+def group(*choices): return '(' + '|'.join(choices) + ')'
+def any(*choices): return group(*choices) + '*'
+def maybe(*choices): return group(*choices) + '?'
+
+# Note: we use unicode matching for names ("\w") but ascii matching for
+# number literals.
+Whitespace = r'[ \f\t]*'
+Comment = r'#[^\r\n]*'
+Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
+Name = r'\w+'
+
+Hexnumber = r'0[xX][0-9a-fA-F]+'
+Binnumber = r'0[bB][01]+'
+Octnumber = r'0[oO][0-7]+'
+Decnumber = r'(?:0+|[1-9][0-9]*)'
+Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+Exponent = r'[eE][-+]?[0-9]+'
+Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
+Expfloat = r'[0-9]+' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)
+
+if sys.version_info.minor >= 3:
+ StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
+else:
+ StringPrefix = r'(?:[bB]?[rR]?)?'
+
+# Tail end of ' string.
+Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+# Tail end of " string.
+Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+# Tail end of ''' string.
+Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+# Tail end of """ string.
+Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
+Triple = group(StringPrefix + "'''", StringPrefix + '"""')
+# Single-line ' or " string.
+String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+ StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+
+# Because of leftmost-then-longest match semantics, be sure to put the
+# longest operators first (e.g., if = came before ==, == would get
+# recognized as two instances of =).
+Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
+ r"//=?", r"->",
+ r"[+\-*/%&|^=<>]=?",
+ r"~")
+
+Bracket = '[][(){}]'
+Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
+Funny = group(Operator, Bracket, Special)
+
+PlainToken = group(Number, Funny, String, Name)
+Token = Ignore + PlainToken
+
+# First (or only) line of ' or " string.
+ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+ group("'", r'\\\r?\n'),
+ StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+ group('"', r'\\\r?\n'))
+PseudoExtras = group(r'\\\r?\n', Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
+
+def _compile(expr):
+ return re.compile(expr, re.UNICODE)
+
+tokenprog, pseudoprog, single3prog, double3prog = map(
+ _compile, (Token, PseudoToken, Single3, Double3))
+endprogs = {"'": _compile(Single), '"': _compile(Double),
+ "'''": single3prog, '"""': double3prog,
+ "r'''": single3prog, 'r"""': double3prog,
+ "b'''": single3prog, 'b"""': double3prog,
+ "R'''": single3prog, 'R"""': double3prog,
+ "B'''": single3prog, 'B"""': double3prog,
+ "br'''": single3prog, 'br"""': double3prog,
+ "bR'''": single3prog, 'bR"""': double3prog,
+ "Br'''": single3prog, 'Br"""': double3prog,
+ "BR'''": single3prog, 'BR"""': double3prog,
+ 'r': None, 'R': None, 'b': None, 'B': None}
+
+triple_quoted = {}
+for t in ("'''", '"""',
+ "r'''", 'r"""', "R'''", 'R"""',
+ "b'''", 'b"""', "B'''", 'B"""',
+ "br'''", 'br"""', "Br'''", 'Br"""',
+ "bR'''", 'bR"""', "BR'''", 'BR"""'):
+ triple_quoted[t] = t
+single_quoted = {}
+for t in ("'", '"',
+ "r'", 'r"', "R'", 'R"',
+ "b'", 'b"', "B'", 'B"',
+ "br'", 'br"', "Br'", 'Br"',
+ "bR'", 'bR"', "BR'", 'BR"' ):
+ single_quoted[t] = t
+
+if sys.version_info.minor >= 3:
+ # Python 3.3
+ for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
+ _t2 = _prefix+'"""'
+ endprogs[_t2] = double3prog
+ triple_quoted[_t2] = _t2
+ _t1 = _prefix + "'''"
+ endprogs[_t1] = single3prog
+ triple_quoted[_t1] = _t1
+ single_quoted[_prefix+'"'] = _prefix+'"'
+ single_quoted[_prefix+"'"] = _prefix+"'"
+ del _prefix, _t2, _t1
+ endprogs['u'] = None
+ endprogs['U'] = None
+
+del _compile
+
+tabsize = 8
+
+class TokenError(Exception): pass
+
+class StopTokenizing(Exception): pass
+
+
+class Untokenizer:
+
+ def __init__(self):
+ self.tokens = []
+ self.prev_row = 1
+ self.prev_col = 0
+ self.encoding = 'utf-8'
+
+ def add_whitespace(self, tok_type, start):
+ row, col = start
+ assert row >= self.prev_row
+ col_offset = col - self.prev_col
+ if col_offset > 0:
+ self.tokens.append(" " * col_offset)
+ elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
+ # Line was backslash-continued.
+ self.tokens.append(" ")
+
+ def untokenize(self, tokens):
+ iterable = iter(tokens)
+ for t in iterable:
+ if len(t) == 2:
+ self.compat(t, iterable)
+ break
+ tok_type, token, start, end = t[:4]
+ if tok_type == ENCODING:
+ self.encoding = token
+ continue
+ self.add_whitespace(tok_type, start)
+ self.tokens.append(token)
+ self.prev_row, self.prev_col = end
+ if tok_type in (NEWLINE, NL):
+ self.prev_row += 1
+ self.prev_col = 0
+ return "".join(self.tokens)
+
+ def compat(self, token, iterable):
+ # This import is here to avoid problems when the itertools
+ # module is not built yet and tokenize is imported.
+ from itertools import chain
+ startline = False
+ prevstring = False
+ indents = []
+ toks_append = self.tokens.append
+
+ for tok in chain([token], iterable):
+ toknum, tokval = tok[:2]
+ if toknum == ENCODING:
+ self.encoding = tokval
+ continue
+
+ if toknum in (NAME, NUMBER):
+ tokval += ' '
+
+ # Insert a space between two consecutive strings
+ if toknum == STRING:
+ if prevstring:
+ tokval = ' ' + tokval
+ prevstring = True
+ else:
+ prevstring = False
+
+ if toknum == INDENT:
+ indents.append(tokval)
+ continue
+ elif toknum == DEDENT:
+ indents.pop()
+ continue
+ elif toknum in (NEWLINE, NL):
+ startline = True
+ elif startline and indents:
+ toks_append(indents[-1])
+ startline = False
+ toks_append(tokval)
+
+
+def untokenize(tokens):
+ """
+ Convert ``tokens`` (an iterable) back into Python source code. Return
+ a string; unlike the standard library version, the output is not
+ encoded back to bytes (see the patch notes at the top of this module).
+
+ The result is guaranteed to tokenize back to match the input so that
+ the conversion is lossless and round-trips are assured. The
+ guarantee applies only to the token type and token string as the
+ spacing between tokens (column positions) may change.
+
+ :func:`untokenize` has two modes. If the input tokens are sequences
+ of length 2 (``type``, ``string``) then spaces are added as necessary to
+ preserve the round-trip property.
+
+ If the input tokens are sequences of length 4 or more (``type``,
+ ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
+ spaces are added so that each token appears in the result at the
+ position indicated by ``start`` and ``end``, if possible.
+ """
+ return Untokenizer().untokenize(tokens)
+
+
+def _get_normal_name(orig_enc):
+ """Imitates get_normal_name in tokenizer.c."""
+ # Only care about the first 12 characters.
+ enc = orig_enc[:12].lower().replace("_", "-")
+ if enc == "utf-8" or enc.startswith("utf-8-"):
+ return "utf-8"
+ if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+ enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+ return "iso-8859-1"
+ return orig_enc
+
+def detect_encoding(readline):
+ """
+ The detect_encoding() function is used to detect the encoding that should
+ be used to decode a Python source file. It requires one argument, readline,
+ in the same way as the tokenize() generator.
+
+ It will call readline a maximum of twice, and return the encoding used
+ (as a string) and a list of any lines (left as bytes) it has read in.
+
+ It detects the encoding from the presence of a utf-8 bom or an encoding
+ cookie as specified in pep-0263. If both a bom and a cookie are present,
+ but disagree, a SyntaxError will be raised. If the encoding cookie is an
+ invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+ 'utf-8-sig' is returned.
+
+ If no encoding is specified, then the default of 'utf-8' will be returned.
+ """
+ bom_found = False
+ encoding = None
+ default = 'utf-8'
+ def read_or_stop():
+ try:
+ return readline()
+ except StopIteration:
+ return b''
+
+ def find_cookie(line):
+ try:
+ # Decode as UTF-8. Either the line is an encoding declaration,
+ # in which case it should be pure ASCII, or it must be UTF-8
+ # per default encoding.
+ line_string = line.decode('utf-8')
+ except UnicodeDecodeError:
+ raise SyntaxError("invalid or missing encoding declaration")
+
+ matches = cookie_re.findall(line_string)
+ if not matches:
+ return None
+ encoding = _get_normal_name(matches[0])
+ try:
+ codec = lookup(encoding)
+ except LookupError:
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError("unknown encoding: " + encoding)
+
+ if bom_found:
+ if encoding != 'utf-8':
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError('encoding problem: utf-8')
+ encoding += '-sig'
+ return encoding
+
+ first = read_or_stop()
+ if first.startswith(BOM_UTF8):
+ bom_found = True
+ first = first[3:]
+ default = 'utf-8-sig'
+ if not first:
+ return default, []
+
+ encoding = find_cookie(first)
+ if encoding:
+ return encoding, [first]
+
+ second = read_or_stop()
+ if not second:
+ return default, [first]
+
+ encoding = find_cookie(second)
+ if encoding:
+ return encoding, [first, second]
+
+ return default, [first, second]
+
+
+def open(filename):
+ """Open a file in read only mode using the encoding detected by
+ detect_encoding().
+ """
+ buffer = builtins.open(filename, 'rb')
+ encoding, lines = detect_encoding(buffer.readline)
+ buffer.seek(0)
+ text = TextIOWrapper(buffer, encoding, line_buffering=True)
+ text.mode = 'r'
+ return text
+
+
+def tokenize(readline):
+ """
+ The tokenize() generator requires one argument, readline, which
+ must be a callable object which provides the same interface as the
+ readline() method of built-in file objects. Each call to the function
+ should return one line of input as bytes. Alternately, readline
+ can be a callable function terminating with StopIteration:
+ readline = open(myfile, 'rb').__next__ # Example of alternate readline
+
+ The generator produces 5-tuples with these members: the token type; the
+ token string; a 2-tuple (srow, scol) of ints specifying the row and
+ column where the token begins in the source; a 2-tuple (erow, ecol) of
+ ints specifying the row and column where the token ends in the source;
+ and the line on which the token was found. The line passed is the
+ logical line; continuation lines are included.
+
+ The first token sequence will always be an ENCODING token
+ which tells you which encoding was used to decode the bytes stream.
+ """
+ # This import is here to avoid problems when the itertools module is not
+ # built yet and tokenize is imported.
+ from itertools import chain, repeat
+ encoding, consumed = detect_encoding(readline)
+ rl_gen = iter(readline, b"")
+ empty = repeat(b"")
+ return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
+
+
+def _tokenize(readline, encoding):
+ lnum = parenlev = continued = 0
+ numchars = '0123456789'
+ contstr, needcont = '', 0
+ contline = None
+ indents = [0]
+
+ if encoding is not None:
+ if encoding == "utf-8-sig":
+ # BOM will already have been stripped.
+ encoding = "utf-8"
+ yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
+ while True: # loop over lines in stream
+ try:
+ line = readline()
+ except StopIteration:
+ line = b''
+
+ if encoding is not None:
+ line = line.decode(encoding)
+ lnum += 1
+ pos, max = 0, len(line)
+
+ if contstr: # continued string
+ if not line:
+ raise TokenError("EOF in multi-line string", strstart)
+ endmatch = endprog.match(line)
+ if endmatch:
+ pos = end = endmatch.end(0)
+ yield TokenInfo(STRING, contstr + line[:end],
+ strstart, (lnum, end), contline + line)
+ contstr, needcont = '', 0
+ contline = None
+ elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
+ yield TokenInfo(ERRORTOKEN, contstr + line,
+ strstart, (lnum, len(line)), contline)
+ contstr = ''
+ contline = None
+ continue
+ else:
+ contstr = contstr + line
+ contline = contline + line
+ continue
+
+ elif parenlev == 0 and not continued: # new statement
+ if not line: break
+ column = 0
+ while pos < max: # measure leading whitespace
+ if line[pos] == ' ':
+ column += 1
+ elif line[pos] == '\t':
+ column = (column//tabsize + 1)*tabsize
+ elif line[pos] == '\f':
+ column = 0
+ else:
+ break
+ pos += 1
+ if pos == max:
+ break
+
+ if line[pos] in '#\r\n': # skip comments or blank lines
+ if line[pos] == '#':
+ comment_token = line[pos:].rstrip('\r\n')
+ nl_pos = pos + len(comment_token)
+ yield TokenInfo(COMMENT, comment_token,
+ (lnum, pos), (lnum, pos + len(comment_token)), line)
+ yield TokenInfo(NEWLINE, line[nl_pos:],
+ (lnum, nl_pos), (lnum, len(line)), line)
+ else:
+ yield TokenInfo(NEWLINE, line[pos:],
+ (lnum, pos), (lnum, len(line)), line)
+ continue
+
+ if column > indents[-1]: # count indents or dedents
+ indents.append(column)
+ yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+ while column < indents[-1]:
+ if column not in indents:
+ raise IndentationError(
+ "unindent does not match any outer indentation level",
+ ("<tokenize>", lnum, pos, line))
+ indents = indents[:-1]
+ yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
+
+ else: # continued statement
+ if not line:
+ raise TokenError("EOF in multi-line statement", (lnum, 0))
+ continued = 0
+
+ while pos < max:
+ pseudomatch = pseudoprog.match(line, pos)
+ if pseudomatch: # scan for tokens
+ start, end = pseudomatch.span(1)
+ spos, epos, pos = (lnum, start), (lnum, end), end
+ token, initial = line[start:end], line[start]
+
+ if (initial in numchars or # ordinary number
+ (initial == '.' and token != '.' and token != '...')):
+ yield TokenInfo(NUMBER, token, spos, epos, line)
+ elif initial in '\r\n':
+ yield TokenInfo(NL if parenlev > 0 else NEWLINE,
+ token, spos, epos, line)
+ elif initial == '#':
+ assert not token.endswith("\n")
+ yield TokenInfo(COMMENT, token, spos, epos, line)
+ elif token in triple_quoted:
+ endprog = endprogs[token]
+ endmatch = endprog.match(line, pos)
+ if endmatch: # all on one line
+ pos = endmatch.end(0)
+ token = line[start:pos]
+ yield TokenInfo(STRING, token, spos, (lnum, pos), line)
+ else:
+ strstart = (lnum, start) # multiple lines
+ contstr = line[start:]
+ contline = line
+ break
+ elif initial in single_quoted or \
+ token[:2] in single_quoted or \
+ token[:3] in single_quoted:
+ if token[-1] == '\n': # continued string
+ strstart = (lnum, start)
+ endprog = (endprogs[initial] or endprogs[token[1]] or
+ endprogs[token[2]])
+ contstr, needcont = line[start:], 1
+ contline = line
+ break
+ else: # ordinary string
+ yield TokenInfo(STRING, token, spos, epos, line)
+ elif initial.isidentifier(): # ordinary name
+ yield TokenInfo(NAME, token, spos, epos, line)
+ elif initial == '\\': # continued stmt
+ continued = 1
+ else:
+ if initial in '([{':
+ parenlev += 1
+ elif initial in ')]}':
+ parenlev -= 1
+ yield TokenInfo(OP, token, spos, epos, line)
+ else:
+ yield TokenInfo(ERRORTOKEN, line[pos],
+ (lnum, pos), (lnum, pos+1), line)
+ pos += 1
+
+ for indent in indents[1:]: # pop remaining indent levels
+ yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
+ yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+
+
+# An undocumented, backwards compatible, API for all the places in the standard
+# library that expect to be able to use tokenize with strings
+def generate_tokens(readline):
+ return _tokenize(readline, None)
+
+if __name__ == "__main__":
+ # Quick sanity check
+ s = b'''def parseline(self, line):
+ """Parse the line into a command name and a string containing
+ the arguments. Returns a tuple containing (command, args, line).
+ 'command' and 'args' may be None if the line couldn't be parsed.
+ """
+ line = line.strip()
+ if not line:
+ return None, None, line
+ elif line[0] == '?':
+ line = 'help ' + line[1:]
+ elif line[0] == '!':
+ if hasattr(self, 'do_shell'):
+ line = 'shell ' + line[1:]
+ else:
+ return None, None, line
+ i, n = 0, len(line)
+ while i < n and line[i] in self.identchars: i = i+1
+ cmd, arg = line[:i], line[i:].strip()
+ return cmd, arg, line
+ '''
+ for tok in tokenize(iter(s.splitlines()).__next__):
+ print(tok)
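
For reference, a minimal usage sketch of the patched module shown in this diff, based only on its own docstrings. The import path IPython.utils._tokenize_py3 is an assumption (it matches the file location above); the same calls should work if the file is saved locally and imported as a standalone module under Python 3.

# Hedged sketch: exercises tokenize(), untokenize(), generate_tokens() and
# detect_encoding() from the patched module. Import path is an assumption.
import io

from IPython.utils import _tokenize_py3 as patched  # assumed import path

source = b"x = 1  # a comment\nif x:\n    y = 'text'\n"

# tokenize() takes a bytes-returning readline callable and yields TokenInfo
# 5-tuples; per the module docstring, the first token is always ENCODING.
tokens = list(patched.tokenize(io.BytesIO(source).readline))
assert tokens[0].type == patched.ENCODING
for tok in tokens:
    print(tok)

# Unlike the standard library, this patched untokenize() returns a str
# (the output is deliberately left unencoded, per the patch notes).
text = patched.untokenize(tokens)
assert isinstance(text, str)

# The documented guarantee: re-tokenizing the result reproduces the same
# (type, string) pairs. generate_tokens() is the str-based entry point.
retok = [(t.type, t.string)
         for t in patched.generate_tokens(io.StringIO(text).readline)]
orig = [(t.type, t.string) for t in tokens if t.type != patched.ENCODING]
assert retok == orig

# detect_encoding() reads at most two lines and returns the encoding plus
# the raw byte lines it consumed.
encoding, consumed = patched.detect_encoding(io.BytesIO(source).readline)
print(encoding, consumed)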