add ydb deps

author: nkozlovskiy <nmk@ydb.tech> 2023-09-29 12:24:06 +0300
committer: nkozlovskiy <nmk@ydb.tech> 2023-09-29 12:41:34 +0300
commit: e0e3e1717e3d33762ce61950504f9637a6e669ed (patch)
tree: bca3ff6939b10ed60c3d5c12439963a1146b9711 /contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py
parent: 38f2c5852db84c7b4d83adfcb009eb61541d1ccd (diff)
download: ydb-e0e3e1717e3d33762ce61950504f9637a6e669ed.tar.gz
1 files changed, 595 insertions, 0 deletions
diff --git a/contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py b/contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py
new file mode 100644
index 0000000000..ee1fd9e639
--- /dev/null
+++ b/contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py
@@ -0,0 +1,595 @@
+"""Patched version of standard library tokenize, to deal with various bugs.
+
+Based on Python 3.2 code.
+
+Patches:
+
+- Gareth Rees' patch for Python issue #12691 (untokenizing)
+  - Except we don't encode the output of untokenize
+  - Python 2 compatible syntax, so that it can be byte-compiled at installation
+- Newlines in comments and blank lines should be either NL or NEWLINE, depending
+  on whether they are in a multi-line statement. Filed as Python issue #17061.
+- Export generate_tokens & TokenError
+- u and rb literals are allowed under Python 3.3 and above.
+
+------------------------------------------------------------------------------
+Tokenization help for Python programs.
+
+tokenize(readline) is a generator that breaks a stream of bytes into
+Python tokens.  It decodes the bytes according to PEP-0263 for
+determining source file encoding.
+
+It accepts a readline-like method which is called repeatedly to get the
+next line of input (or b"" for EOF).  It generates 5-tuples with these
+members:
+
+    the token type (see token.py)
+    the token (a string)
+    the starting (row, column) indices of the token (a 2-tuple of ints)
+    the ending (row, column) indices of the token (a 2-tuple of ints)
+    the original line (string)
+
+It is designed to match the working of the Python tokenizer exactly, except
+that it produces COMMENT tokens for comments and gives type OP for all
+operators.  Additionally, all token lists start with an ENCODING token
+which tells you which encoding was used to decode the bytes stream.
+"""
+from __future__ import absolute_import
+
+__author__ = 'Ka-Ping Yee <ping@lfw.org>'
+__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
+               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
+               'Michael Foord')
+import builtins
+import re
+import sys
+from token import *
+from codecs import lookup, BOM_UTF8
+import collections
+from io import TextIOWrapper
+cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
+
+import token
+__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
+                           "NL", "untokenize", "ENCODING", "TokenInfo"]
+del token
+
+__all__ += ["generate_tokens", "TokenError"]
+
+COMMENT = N_TOKENS
+tok_name[COMMENT] = 'COMMENT'
+NL = N_TOKENS + 1
+tok_name[NL] = 'NL'
+ENCODING = N_TOKENS + 2
+tok_name[ENCODING] = 'ENCODING'
+N_TOKENS += 3
+
+class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
+    def __repr__(self):
+        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
+        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
+                self._replace(type=annotated_type))
+
+def group(*choices): return '(' + '|'.join(choices) + ')'
+def any(*choices): return group(*choices) + '*'
+def maybe(*choices): return group(*choices) + '?'
+
+# Note: we use unicode matching for names ("\w") but ascii matching for
+# number literals.
+Whitespace = r'[ \f\t]*'
+Comment = r'#[^\r\n]*'
+Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
+Name = r'\w+'
+
+Hexnumber = r'0[xX][0-9a-fA-F]+'
+Binnumber = r'0[bB][01]+'
+Octnumber = r'0[oO][0-7]+'
+Decnumber = r'(?:0+|[1-9][0-9]*)'
+Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
+Exponent = r'[eE][-+]?[0-9]+'
+Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
+Expfloat = r'[0-9]+' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)
+
+if sys.version_info.minor >= 3:
+    StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
+else:
+    StringPrefix = r'(?:[bB]?[rR]?)?'
+
+# Tail end of ' string.
+Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+# Tail end of " string.
+Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+# Tail end of ''' string.
+Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+# Tail end of """ string.
+Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
+Triple = group(StringPrefix + "'''", StringPrefix + '"""')
+# Single-line ' or " string.
+String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+
+# Because of leftmost-then-longest match semantics, be sure to put the
+# longest operators first (e.g., if = came before ==, == would get
+# recognized as two instances of =).
+Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
+                 r"//=?", r"->",
+                 r"[+\-*/%&|^=<>]=?",
+                 r"~")
+
+Bracket = '[][(){}]'
+Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
+Funny = group(Operator, Bracket, Special)
+
+PlainToken = group(Number, Funny, String, Name)
+Token = Ignore + PlainToken
+
+# First (or only) line of ' or " string.
+ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+                group("'", r'\\\r?\n'),
+                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+                group('"', r'\\\r?\n'))
+PseudoExtras = group(r'\\\r?\n', Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
+
+def _compile(expr):
+    return re.compile(expr, re.UNICODE)
+
+tokenprog, pseudoprog, single3prog, double3prog = map(
+    _compile, (Token, PseudoToken, Single3, Double3))
+endprogs = {"'": _compile(Single), '"': _compile(Double),
+            "'''": single3prog, '"""': double3prog,
+            "r'''": single3prog, 'r"""': double3prog,
+            "b'''": single3prog, 'b"""': double3prog,
+            "R'''": single3prog, 'R"""': double3prog,
+            "B'''": single3prog, 'B"""': double3prog,
+            "br'''": single3prog, 'br"""': double3prog,
+            "bR'''": single3prog, 'bR"""': double3prog,
+            "Br'''": single3prog, 'Br"""': double3prog,
+            "BR'''": single3prog, 'BR"""': double3prog,
+            'r': None, 'R': None, 'b': None, 'B': None}
+
+triple_quoted = {}
+for t in ("'''", '"""',
+          "r'''", 'r"""', "R'''", 'R"""',
+          "b'''", 'b"""', "B'''", 'B"""',
+          "br'''", 'br"""', "Br'''", 'Br"""',
+          "bR'''", 'bR"""', "BR'''", 'BR"""'):
+    triple_quoted[t] = t
+single_quoted = {}
+for t in ("'", '"',
+          "r'", 'r"', "R'", 'R"',
+          "b'", 'b"', "B'", 'B"',
+          "br'", 'br"', "Br'", 'Br"',
+          "bR'", 'bR"', "BR'", 'BR"' ):
+    single_quoted[t] = t
+
+if sys.version_info.minor >= 3:
+    # Python 3.3
+    for _prefix in ['rb', 'rB', 'Rb', 'RB', 'u', 'U']:
+        _t2 = _prefix+'"""'
+        endprogs[_t2] = double3prog
+        triple_quoted[_t2] = _t2
+        _t1 = _prefix + "'''"
+        endprogs[_t1] = single3prog
+        triple_quoted[_t1] = _t1
+        single_quoted[_prefix+'"'] = _prefix+'"'
+        single_quoted[_prefix+"'"] = _prefix+"'"
+    del _prefix, _t2, _t1
+    endprogs['u'] = None
+    endprogs['U'] = None
+
+del _compile
+
+tabsize = 8
+
+class TokenError(Exception): pass
+
+class StopTokenizing(Exception): pass
+
+
+class Untokenizer:
+
+    def __init__(self):
+        self.tokens = []
+        self.prev_row = 1
+        self.prev_col = 0
+        self.encoding = 'utf-8'
+
+    def add_whitespace(self, tok_type, start):
+        row, col = start
+        assert row >= self.prev_row
+        col_offset = col - self.prev_col
+        if col_offset > 0:
+            self.tokens.append(" " * col_offset)
+        elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
+            # Line was backslash-continued.
+            self.tokens.append(" ")
+
+    def untokenize(self, tokens):
+        iterable = iter(tokens)
+        for t in iterable:
+            if len(t) == 2:
+                self.compat(t, iterable)
+                break
+            tok_type, token, start, end = t[:4]
+            if tok_type == ENCODING:
+                self.encoding = token
+                continue
+            self.add_whitespace(tok_type, start)
+            self.tokens.append(token)
+            self.prev_row, self.prev_col = end
+            if tok_type in (NEWLINE, NL):
+                self.prev_row += 1
+                self.prev_col = 0
+        return "".join(self.tokens)
+
+    def compat(self, token, iterable):
+        # This import is here to avoid problems when the itertools
+        # module is not built yet and tokenize is imported.
+        from itertools import chain
+        startline = False
+        prevstring = False
+        indents = []
+        toks_append = self.tokens.append
+
+        for tok in chain([token], iterable):
+            toknum, tokval = tok[:2]
+            if toknum == ENCODING:
+                self.encoding = tokval
+                continue
+
+            if toknum in (NAME, NUMBER):
+                tokval += ' '
+
+            # Insert a space between two consecutive strings
+            if toknum == STRING:
+                if prevstring:
+                    tokval = ' ' + tokval
+                prevstring = True
+            else:
+                prevstring = False
+
+            if toknum == INDENT:
+                indents.append(tokval)
+                continue
+            elif toknum == DEDENT:
+                indents.pop()
+                continue
+            elif toknum in (NEWLINE, NL):
+                startline = True
+            elif startline and indents:
+                toks_append(indents[-1])
+                startline = False
+            toks_append(tokval)
+
+
+def untokenize(tokens):
+    """
+    Convert ``tokens`` (an iterable) back into Python source code. Return
+    a bytes object, encoded using the encoding specified by the last
+    ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
+
+    The result is guaranteed to tokenize back to match the input so that
+    the conversion is lossless and round-trips are assured.  The
+    guarantee applies only to the token type and token string as the
+    spacing between tokens (column positions) may change.
+
+    :func:`untokenize` has two modes. If the input tokens are sequences
+    of length 2 (``type``, ``string``) then spaces are added as necessary to
+    preserve the round-trip property.
+
+    If the input tokens are sequences of length 4 or more (``type``,
+    ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
+    spaces are added so that each token appears in the result at the
+    position indicated by ``start`` and ``end``, if possible.
+    """
+    return Untokenizer().untokenize(tokens)
+
+
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
+def detect_encoding(readline):
+    """
+    The detect_encoding() function is used to detect the encoding that should
+    be used to decode a Python source file.  It requires one argment, readline,
+    in the same way as the tokenize() generator.
+
+    It will call readline a maximum of twice, and return the encoding used
+    (as a string) and a list of any lines (left as bytes) it has read in.
+
+    It detects the encoding from the presence of a utf-8 bom or an encoding
+    cookie as specified in pep-0263.  If both a bom and a cookie are present,
+    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
+    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
+    'utf-8-sig' is returned.
+
+    If no encoding is specified, then the default of 'utf-8' will be returned.
+    """
+    bom_found = False
+    encoding = None
+    default = 'utf-8'
+    def read_or_stop():
+        try:
+            return readline()
+        except StopIteration:
+            return b''
+
+    def find_cookie(line):
+        try:
+            # Decode as UTF-8. Either the line is an encoding declaration,
+            # in which case it should be pure ASCII, or it must be UTF-8
+            # per default encoding.
+            line_string = line.decode('utf-8')
+        except UnicodeDecodeError:
+            raise SyntaxError("invalid or missing encoding declaration")
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = _get_normal_name(matches[0])
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found:
+            if encoding != 'utf-8':
+                # This behaviour mimics the Python interpreter
+                raise SyntaxError('encoding problem: utf-8')
+            encoding += '-sig'
+        return encoding
+
+    first = read_or_stop()
+    if first.startswith(BOM_UTF8):
+        bom_found = True
+        first = first[3:]
+        default = 'utf-8-sig'
+    if not first:
+        return default, []
+
+    encoding = find_cookie(first)
+    if encoding:
+        return encoding, [first]
+
+    second = read_or_stop()
+    if not second:
+        return default, [first]
+
+    encoding = find_cookie(second)
+    if encoding:
+        return encoding, [first, second]
+
+    return default, [first, second]
+
+
+def open(filename):
+    """Open a file in read only mode using the encoding detected by
+    detect_encoding().
+    """
+    buffer = builtins.open(filename, 'rb')
+    encoding, lines = detect_encoding(buffer.readline)
+    buffer.seek(0)
+    text = TextIOWrapper(buffer, encoding, line_buffering=True)
+    text.mode = 'r'
+    return text
+
+
+def tokenize(readline):
+    """
+    The tokenize() generator requires one argment, readline, which
+    must be a callable object which provides the same interface as the
+    readline() method of built-in file objects.  Each call to the function
+    should return one line of input as bytes.  Alternately, readline
+    can be a callable function terminating with StopIteration:
+        readline = open(myfile, 'rb').__next__  # Example of alternate readline
+
+    The generator produces 5-tuples with these members: the token type; the
+    token string; a 2-tuple (srow, scol) of ints specifying the row and
+    column where the token begins in the source; a 2-tuple (erow, ecol) of
+    ints specifying the row and column where the token ends in the source;
+    and the line on which the token was found.  The line passed is the
+    logical line; continuation lines are included.
+
+    The first token sequence will always be an ENCODING token
+    which tells you which encoding was used to decode the bytes stream.
+    """
+    # This import is here to avoid problems when the itertools module is not
+    # built yet and tokenize is imported.
+    from itertools import chain, repeat
+    encoding, consumed = detect_encoding(readline)
+    rl_gen = iter(readline, b"")
+    empty = repeat(b"")
+    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
+
+
+def _tokenize(readline, encoding):
+    lnum = parenlev = continued = 0
+    numchars = '0123456789'
+    contstr, needcont = '', 0
+    contline = None
+    indents = [0]
+
+    if encoding is not None:
+        if encoding == "utf-8-sig":
+            # BOM will already have been stripped.
+            encoding = "utf-8"
+        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
+    while True:             # loop over lines in stream
+        try:
+            line = readline()
+        except StopIteration:
+            line = b''
+
+        if encoding is not None:
+            line = line.decode(encoding)
+        lnum += 1
+        pos, max = 0, len(line)
+
+        if contstr:                            # continued string
+            if not line:
+                raise TokenError("EOF in multi-line string", strstart)
+            endmatch = endprog.match(line)
+            if endmatch:
+                pos = end = endmatch.end(0)
+                yield TokenInfo(STRING, contstr + line[:end],
+                       strstart, (lnum, end), contline + line)
+                contstr, needcont = '', 0
+                contline = None
+            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
+                yield TokenInfo(ERRORTOKEN, contstr + line,
+                           strstart, (lnum, len(line)), contline)
+                contstr = ''
+                contline = None
+                continue
+            else:
+                contstr = contstr + line
+                contline = contline + line
+                continue
+
+        elif parenlev == 0 and not continued:  # new statement
+            if not line: break
+            column = 0
+            while pos < max:                   # measure leading whitespace
+                if line[pos] == ' ':
+                    column += 1
+                elif line[pos] == '\t':
+                    column = (column//tabsize + 1)*tabsize
+                elif line[pos] == '\f':
+                    column = 0
+                else:
+                    break
+                pos += 1
+            if pos == max:
+                break
+
+            if line[pos] in '#\r\n':           # skip comments or blank lines
+                if line[pos] == '#':
+                    comment_token = line[pos:].rstrip('\r\n')
+                    nl_pos = pos + len(comment_token)
+                    yield TokenInfo(COMMENT, comment_token,
+                           (lnum, pos), (lnum, pos + len(comment_token)), line)
+                    yield TokenInfo(NEWLINE, line[nl_pos:],
+                           (lnum, nl_pos), (lnum, len(line)), line)
+                else:
+                    yield TokenInfo(NEWLINE, line[pos:],
+                           (lnum, pos), (lnum, len(line)), line)
+                continue
+
+            if column > indents[-1]:           # count indents or dedents
+                indents.append(column)
+                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+            while column < indents[-1]:
+                if column not in indents:
+                    raise IndentationError(
+                        "unindent does not match any outer indentation level",
+                        ("<tokenize>", lnum, pos, line))
+                indents = indents[:-1]
+                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
+
+        else:                                  # continued statement
+            if not line:
+                raise TokenError("EOF in multi-line statement", (lnum, 0))
+            continued = 0
+
+        while pos < max:
+            pseudomatch = pseudoprog.match(line, pos)
+            if pseudomatch:                                # scan for tokens
+                start, end = pseudomatch.span(1)
+                spos, epos, pos = (lnum, start), (lnum, end), end
+                token, initial = line[start:end], line[start]
+
+                if (initial in numchars or                  # ordinary number
+                    (initial == '.' and token != '.' and token != '...')):
+                    yield TokenInfo(NUMBER, token, spos, epos, line)
+                elif initial in '\r\n':
+                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
+                           token, spos, epos, line)
+                elif initial == '#':
+                    assert not token.endswith("\n")
+                    yield TokenInfo(COMMENT, token, spos, epos, line)
+                elif token in triple_quoted:
+                    endprog = endprogs[token]
+                    endmatch = endprog.match(line, pos)
+                    if endmatch:                           # all on one line
+                        pos = endmatch.end(0)
+                        token = line[start:pos]
+                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
+                    else:
+                        strstart = (lnum, start)           # multiple lines
+                        contstr = line[start:]
+                        contline = line
+                        break
+                elif initial in single_quoted or \
+                    token[:2] in single_quoted or \
+                    token[:3] in single_quoted:
+                    if token[-1] == '\n':                  # continued string
+                        strstart = (lnum, start)
+                        endprog = (endprogs[initial] or endprogs[token[1]] or
+                                   endprogs[token[2]])
+                        contstr, needcont = line[start:], 1
+                        contline = line
+                        break
+                    else:                                  # ordinary string
+                        yield TokenInfo(STRING, token, spos, epos, line)
+                elif initial.isidentifier():               # ordinary name
+                    yield TokenInfo(NAME, token, spos, epos, line)
+                elif initial == '\\':                      # continued stmt
+                    continued = 1
+                else:
+                    if initial in '([{':
+                        parenlev += 1
+                    elif initial in ')]}':
+                        parenlev -= 1
+                    yield TokenInfo(OP, token, spos, epos, line)
+            else:
+                yield TokenInfo(ERRORTOKEN, line[pos],
+                           (lnum, pos), (lnum, pos+1), line)
+                pos += 1
+
+    for indent in indents[1:]:                 # pop remaining indent levels
+        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
+    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+
+
+# An undocumented, backwards compatible, API for all the places in the standard
+# library that expect to be able to use tokenize with strings
+def generate_tokens(readline):
+    return _tokenize(readline, None)
+
+if __name__ == "__main__":
+    # Quick sanity check
+    s = b'''def parseline(self, line):
+            """Parse the line into a command name and a string containing
+            the arguments.  Returns a tuple containing (command, args, line).
+            'command' and 'args' may be None if the line couldn't be parsed.
+            """
+            line = line.strip()
+            if not line:
+                return None, None, line
+            elif line[0] == '?':
+                line = 'help ' + line[1:]
+            elif line[0] == '!':
+                if hasattr(self, 'do_shell'):
+                    line = 'shell ' + line[1:]
+                else:
+                    return None, None, line
+            i, n = 0, len(line)
+            while i < n and line[i] in self.identchars: i = i+1
+            cmd, arg = line[:i], line[i:].strip()
+            return cmd, arg, line
+    '''
+    for tok in tokenize(iter(s.splitlines()).__next__):
+        print(tok)
author	nkozlovskiy <nmk@ydb.tech>	2023-09-29 12:24:06 +0300
committer	nkozlovskiy <nmk@ydb.tech>	2023-09-29 12:41:34 +0300
commit	e0e3e1717e3d33762ce61950504f9637a6e669ed (patch)
tree	bca3ff6939b10ed60c3d5c12439963a1146b9711 /contrib/python/ipython/py2/IPython/utils/_tokenize_py3.py
parent	38f2c5852db84c7b4d83adfcb009eb61541d1ccd (diff)
download	ydb-e0e3e1717e3d33762ce61950504f9637a6e669ed.tar.gz