author | nkozlovskiy <nmk@ydb.tech> | 2023-09-29 12:24:06 +0300
---|---|---
committer | nkozlovskiy <nmk@ydb.tech> | 2023-09-29 12:41:34 +0300
commit | e0e3e1717e3d33762ce61950504f9637a6e669ed (patch)
tree | bca3ff6939b10ed60c3d5c12439963a1146b9711 /contrib/python/ipython/py2/IPython/lib/lexers.py
parent | 38f2c5852db84c7b4d83adfcb009eb61541d1ccd (diff)
download | ydb-e0e3e1717e3d33762ce61950504f9637a6e669ed.tar.gz
add ydb deps
Diffstat (limited to 'contrib/python/ipython/py2/IPython/lib/lexers.py')
-rw-r--r-- | contrib/python/ipython/py2/IPython/lib/lexers.py | 517
1 file changed, 517 insertions, 0 deletions
diff --git a/contrib/python/ipython/py2/IPython/lib/lexers.py b/contrib/python/ipython/py2/IPython/lib/lexers.py
new file mode 100644
index 0000000000..9160ae1245
--- /dev/null
+++ b/contrib/python/ipython/py2/IPython/lib/lexers.py
@@ -0,0 +1,517 @@
+# -*- coding: utf-8 -*-
+"""
+Defines a variety of Pygments lexers for highlighting IPython code.
+
+This includes:
+
+    IPythonLexer, IPython3Lexer
+        Lexers for pure IPython (python + magic/shell commands)
+
+    IPythonPartialTracebackLexer, IPythonTracebackLexer
+        Supports 2.x and 3.x via keyword `python3`.  The partial traceback
+        lexer reads everything but the Python code appearing in a traceback.
+        The full lexer combines the partial lexer with an IPython lexer.
+
+    IPythonConsoleLexer
+        A lexer for IPython console sessions, with support for tracebacks.
+
+    IPyLexer
+        A friendly lexer which examines the first line of text and from it,
+        decides whether to use an IPython lexer or an IPython console lexer.
+        This is probably the only lexer that needs to be explicitly added
+        to Pygments.
+
+"""
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013, the IPython Development Team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+# Standard library
+import re
+
+# Third party
+from pygments.lexers import BashLexer, Python3Lexer
+try:
+    # PythonLexer was renamed to Python2Lexer in pygments 2.5
+    from pygments.lexers import Python2Lexer
+except ImportError:
+    from pygments.lexers import PythonLexer as Python2Lexer
+from pygments.lexer import (
+    Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using,
+)
+from pygments.token import (
+    Generic, Keyword, Literal, Name, Operator, Other, Text, Error,
+)
+from pygments.util import get_bool_opt
+
+# Local
+
+line_re = re.compile('.*?\n')
+
+__all__ = ['build_ipy_lexer', 'IPython3Lexer', 'IPythonLexer',
+           'IPythonPartialTracebackLexer', 'IPythonTracebackLexer',
+           'IPythonConsoleLexer', 'IPyLexer']
+
+ipython_tokens = [
+    (r"(?s)(\s*)(%%)(\w+)(.*)", bygroups(Text, Operator, Keyword, Text)),
+    (r'(?s)(^\s*)(%%!)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(BashLexer))),
+    (r"(%%?)(\w+)(\?\??)$", bygroups(Operator, Keyword, Operator)),
+    (r"\b(\?\??)(\s*)$", bygroups(Operator, Text)),
+    (r'(%)(sx|sc|system)(.*)(\n)', bygroups(Operator, Keyword,
+                                            using(BashLexer), Text)),
+    (r'(%)(\w+)(.*\n)', bygroups(Operator, Keyword, Text)),
+    (r'^(!!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
+    (r'(!)(?!=)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)),
+    (r'^(\s*)(\?\??)(\s*%{0,2}[\w\.\*]*)', bygroups(Text, Operator, Text)),
+    (r'(\s*%{0,2}[\w\.\*]*)(\?\??)(\s*)$', bygroups(Text, Operator, Text)),
+]
+
+def build_ipy_lexer(python3):
+    """Builds IPython lexers depending on the value of `python3`.
+
+    The lexer inherits from an appropriate Python lexer and then adds
+    information about IPython specific keywords (i.e. magic commands,
+    shell commands, etc.)
+
+    Parameters
+    ----------
+    python3 : bool
+        If `True`, then build an IPython lexer from a Python 3 lexer.
+
+    """
+    # It would be nice to have a single IPython lexer class which takes
+    # a boolean `python3`.  But since there are two Python lexer classes,
+    # we will also have two IPython lexer classes.
+    if python3:
+        PyLexer = Python3Lexer
+        name = 'IPython3'
+        aliases = ['ipython3']
+        doc = """IPython3 Lexer"""
+    else:
+        PyLexer = Python2Lexer
+        name = 'IPython'
+        aliases = ['ipython2', 'ipython']
+        doc = """IPython Lexer"""
+
+    tokens = PyLexer.tokens.copy()
+    tokens['root'] = ipython_tokens + tokens['root']
+
+    attrs = {'name': name, 'aliases': aliases, 'filenames': [],
+             '__doc__': doc, 'tokens': tokens}
+
+    return type(name, (PyLexer,), attrs)
+
+
+IPython3Lexer = build_ipy_lexer(python3=True)
+IPythonLexer = build_ipy_lexer(python3=False)
+
+
+class IPythonPartialTracebackLexer(RegexLexer):
+    """
+    Partial lexer for IPython tracebacks.
+
+    Handles all the non-python output. This works for both Python 2.x and 3.x.
+
+    """
+    name = 'IPython Partial Traceback'
+
+    tokens = {
+        'root': [
+            # Tracebacks for syntax errors have a different style.
+            # For both types of tracebacks, we mark the first line with
+            # Generic.Traceback.  For syntax errors, we mark the filename
+            # as we mark the filenames for non-syntax tracebacks.
+            #
+            # These two regexps define how IPythonConsoleLexer finds a
+            # traceback.
+            #
+            ## Non-syntax traceback
+            (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)),
+            ## Syntax traceback
+            (r'^(  File)(.*)(, line )(\d+\n)',
+             bygroups(Generic.Traceback, Name.Namespace,
+                      Generic.Traceback, Literal.Number.Integer)),
+
+            # (Exception Identifier)(Whitespace)(Traceback Message)
+            (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)',
+             bygroups(Name.Exception, Generic.Whitespace, Text)),
+            # (Module/Filename)(Text)(Callee)(Function Signature)
+            # Better options for callee and function signature?
+            (r'(.*)( in )(.*)(\(.*\)\n)',
+             bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)),
+            # Regular line: (Whitespace)(Line Number)(Python Code)
+            (r'(\s*?)(\d+)(.*?\n)',
+             bygroups(Generic.Whitespace, Literal.Number.Integer, Other)),
+            # Emphasized line: (Arrow)(Line Number)(Python Code)
+            # Using Exception token so arrow color matches the Exception.
+            (r'(-*>?\s?)(\d+)(.*?\n)',
+             bygroups(Name.Exception, Literal.Number.Integer, Other)),
+            # (Exception Identifier)(Message)
+            (r'(?u)(^[^\d\W]\w*)(:.*?\n)',
+             bygroups(Name.Exception, Text)),
+            # Tag everything else as Other, will be handled later.
+            (r'.*\n', Other),
+        ],
+    }
+
+
+class IPythonTracebackLexer(DelegatingLexer):
+    """
+    IPython traceback lexer.
+
+    For doctests, the tracebacks can be snipped as much as desired with the
+    exception of the lines that designate a traceback. For non-syntax error
+    tracebacks, this is the line of hyphens. For syntax error tracebacks,
+    this is the line which lists the File and line number.
+
+    """
+    # The lexer inherits from DelegatingLexer.  The "root" lexer is an
+    # appropriate IPython lexer, which depends on the value of the boolean
+    # `python3`.  First, we parse with the partial IPython traceback lexer.
+    # Then, any code marked with the "Other" token is delegated to the root
+    # lexer.
+    #
+    name = 'IPython Traceback'
+    aliases = ['ipythontb']
+
+    def __init__(self, **options):
+        self.python3 = get_bool_opt(options, 'python3', False)
+        if self.python3:
+            self.aliases = ['ipython3tb']
+        else:
+            self.aliases = ['ipython2tb', 'ipythontb']
+
+        if self.python3:
+            IPyLexer = IPython3Lexer
+        else:
+            IPyLexer = IPythonLexer
+
+        DelegatingLexer.__init__(self, IPyLexer,
+                                 IPythonPartialTracebackLexer, **options)
+
+class IPythonConsoleLexer(Lexer):
+    """
+    An IPython console lexer for IPython code-blocks and doctests, such as:
+
+    .. code-block:: rst
+
+        .. code-block:: ipythonconsole
+
+            In [1]: a = 'foo'
+
+            In [2]: a
+            Out[2]: 'foo'
+
+            In [3]: print a
+            foo
+
+            In [4]: 1 / 0
+
+
+    Support is also provided for IPython exceptions:
+
+    .. code-block:: rst
+
+        .. code-block:: ipythonconsole
+
+            In [1]: raise Exception
+
+            ---------------------------------------------------------------------------
+            Exception                                 Traceback (most recent call last)
+            <ipython-input-1-fca2ab0ca76b> in <module>()
+            ----> 1 raise Exception
+
+            Exception:
+
+    """
+    name = 'IPython console session'
+    aliases = ['ipythonconsole']
+    mimetypes = ['text/x-ipython-console']
+
+    # The regexps used to determine what is input and what is output.
+    # The default prompts for IPython are:
+    #
+    #     in           = 'In [#]: '
+    #     continuation = '   .D.: '
+    #     template     = 'Out[#]: '
+    #
+    # Where '#' is the 'prompt number' or 'execution count' and 'D'
+    # is a number of dots matching the width of the execution count.
+    #
+    in1_regex = r'In \[[0-9]+\]: '
+    in2_regex = r'   \.\.+\.: '
+    out_regex = r'Out\[[0-9]+\]: '
+
+    #: The regex to determine when a traceback starts.
+    ipytb_start = re.compile(r'^(\^C)?(-+\n)|^(  File)(.*)(, line )(\d+\n)')
+
+    def __init__(self, **options):
+        """Initialize the IPython console lexer.
+
+        Parameters
+        ----------
+        python3 : bool
+            If `True`, then the console inputs are parsed using a Python 3
+            lexer. Otherwise, they are parsed using a Python 2 lexer.
+        in1_regex : RegexObject
+            The compiled regular expression used to detect the start
+            of inputs. Although the IPython configuration setting may have a
+            trailing whitespace, do not include it in the regex. If `None`,
+            then the default input prompt is assumed.
+        in2_regex : RegexObject
+            The compiled regular expression used to detect the continuation
+            of inputs. Although the IPython configuration setting may have a
+            trailing whitespace, do not include it in the regex. If `None`,
+            then the default input prompt is assumed.
+        out_regex : RegexObject
+            The compiled regular expression used to detect outputs. If `None`,
+            then the default output prompt is assumed.
+
+        """
+        self.python3 = get_bool_opt(options, 'python3', False)
+        if self.python3:
+            self.aliases = ['ipython3console']
+        else:
+            self.aliases = ['ipython2console', 'ipythonconsole']
+
+        in1_regex = options.get('in1_regex', self.in1_regex)
+        in2_regex = options.get('in2_regex', self.in2_regex)
+        out_regex = options.get('out_regex', self.out_regex)
+
+        # So that we can work with input and output prompts which have been
+        # rstrip'd (possibly by editors) we also need rstrip'd variants. If
+        # we do not do this, then such prompts will be tagged as 'output'.
+        # The reason we can't just use the rstrip'd variants instead is that
+        # we want any whitespace associated with the prompt to be inserted
+        # with the token. This allows formatted code to be modified so as to
+        # hide the appearance of prompts, with the whitespace included. One
+        # example use of this is in copybutton.js from the standard lib
+        # Python docs.
+        in1_regex_rstrip = in1_regex.rstrip() + '\n'
+        in2_regex_rstrip = in2_regex.rstrip() + '\n'
+        out_regex_rstrip = out_regex.rstrip() + '\n'
+
+        # Compile and save them all.
+        attrs = ['in1_regex', 'in2_regex', 'out_regex',
+                 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip']
+        for attr in attrs:
+            self.__setattr__(attr, re.compile(locals()[attr]))
+
+        Lexer.__init__(self, **options)
+
+        if self.python3:
+            pylexer = IPython3Lexer
+            tblexer = IPythonTracebackLexer
+        else:
+            pylexer = IPythonLexer
+            tblexer = IPythonTracebackLexer
+
+        self.pylexer = pylexer(**options)
+        self.tblexer = tblexer(**options)
+
+        self.reset()
+
+    def reset(self):
+        self.mode = 'output'
+        self.index = 0
+        self.buffer = u''
+        self.insertions = []
+
+    def buffered_tokens(self):
+        """
+        Generator of unprocessed tokens after doing insertions and before
+        changing to a new state.
+
+        """
+        if self.mode == 'output':
+            tokens = [(0, Generic.Output, self.buffer)]
+        elif self.mode == 'input':
+            tokens = self.pylexer.get_tokens_unprocessed(self.buffer)
+        else: # traceback
+            tokens = self.tblexer.get_tokens_unprocessed(self.buffer)
+
+        for i, t, v in do_insertions(self.insertions, tokens):
+            # All token indexes are relative to the buffer.
+            yield self.index + i, t, v
+
+        # Clear it all
+        self.index += len(self.buffer)
+        self.buffer = u''
+        self.insertions = []
+
+    def get_mci(self, line):
+        """
+        Parses the line and returns a 3-tuple: (mode, code, insertion).
+
+        `mode` is the next mode (or state) of the lexer, and is always equal
+        to 'input', 'output', or 'tb'.
+
+        `code` is a portion of the line that should be added to the buffer
+        corresponding to the next mode and eventually lexed by another lexer.
+        For example, `code` could be Python code if `mode` were 'input'.
+
+        `insertion` is a 3-tuple (index, token, text) representing an
+        unprocessed "token" that will be inserted into the stream of tokens
+        that are created from the buffer once we change modes. This is usually
+        the input or output prompt.
+
+        In general, the next mode depends on current mode and on the contents
+        of `line`.
+
+        """
+        # To reduce the number of regex match checks, we have multiple
+        # 'if' blocks instead of 'if-elif' blocks.
+
+        # Check for possible end of input
+        in2_match = self.in2_regex.match(line)
+        in2_match_rstrip = self.in2_regex_rstrip.match(line)
+        if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \
+           in2_match_rstrip:
+            end_input = True
+        else:
+            end_input = False
+        if end_input and self.mode != 'tb':
+            # Only look for an end of input when not in tb mode.
+            # An ellipsis could appear within the traceback.
+            mode = 'output'
+            code = u''
+            insertion = (0, Generic.Prompt, line)
+            return mode, code, insertion
+
+        # Check for output prompt
+        out_match = self.out_regex.match(line)
+        out_match_rstrip = self.out_regex_rstrip.match(line)
+        if out_match or out_match_rstrip:
+            mode = 'output'
+            if out_match:
+                idx = out_match.end()
+            else:
+                idx = out_match_rstrip.end()
+            code = line[idx:]
+            # Use the 'heading' token for output.  We cannot use Generic.Error
+            # since it would conflict with exceptions.
+            insertion = (0, Generic.Heading, line[:idx])
+            return mode, code, insertion
+
+
+        # Check for input or continuation prompt (non stripped version)
+        in1_match = self.in1_regex.match(line)
+        if in1_match or (in2_match and self.mode != 'tb'):
+            # New input or when not in tb, continued input.
+            # We do not check for continued input when in tb since it is
+            # allowable to replace a long stack with an ellipsis.
+            mode = 'input'
+            if in1_match:
+                idx = in1_match.end()
+            else: # in2_match
+                idx = in2_match.end()
+            code = line[idx:]
+            insertion = (0, Generic.Prompt, line[:idx])
+            return mode, code, insertion
+
+        # Check for input or continuation prompt (stripped version)
+        in1_match_rstrip = self.in1_regex_rstrip.match(line)
+        if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'):
+            # New input or when not in tb, continued input.
+            # We do not check for continued input when in tb since it is
+            # allowable to replace a long stack with an ellipsis.
+            mode = 'input'
+            if in1_match_rstrip:
+                idx = in1_match_rstrip.end()
+            else: # in2_match
+                idx = in2_match_rstrip.end()
+            code = line[idx:]
+            insertion = (0, Generic.Prompt, line[:idx])
+            return mode, code, insertion
+
+        # Check for traceback
+        if self.ipytb_start.match(line):
+            mode = 'tb'
+            code = line
+            insertion = None
+            return mode, code, insertion
+
+        # All other stuff...
+        if self.mode in ('input', 'output'):
+            # We assume all other text is output. Multiline input that
+            # does not use the continuation marker cannot be detected.
+            # For example, the 3 in the following is clearly output:
+            #
+            #    In [1]: print 3
+            #    3
+            #
+            # But the following second line is part of the input:
+            #
+            #    In [2]: while True:
+            #        print True
+            #
+            # In both cases, the 2nd line will be 'output'.
+            #
+            mode = 'output'
+        else:
+            mode = 'tb'
+
+        code = line
+        insertion = None
+
+        return mode, code, insertion
+
+    def get_tokens_unprocessed(self, text):
+        self.reset()
+        for match in line_re.finditer(text):
+            line = match.group()
+            mode, code, insertion = self.get_mci(line)
+
+            if mode != self.mode:
+                # Yield buffered tokens before transitioning to new mode.
+                for token in self.buffered_tokens():
+                    yield token
+                self.mode = mode
+
+            if insertion:
+                self.insertions.append((len(self.buffer), [insertion]))
+            self.buffer += code
+
+        for token in self.buffered_tokens():
+            yield token
+
+class IPyLexer(Lexer):
+    """
+    Primary lexer for all IPython-like code.
+
+    This is a simple helper lexer.  If the first line of the text begins with
+    "In \[[0-9]+\]:", then the entire text is parsed with an IPython console
+    lexer. If not, then the entire text is parsed with an IPython lexer.
+
+    The goal is to reduce the number of lexers that are registered
+    with Pygments.
+
+    """
+    name = 'IPy session'
+    aliases = ['ipy']
+
+    def __init__(self, **options):
+        self.python3 = get_bool_opt(options, 'python3', False)
+        if self.python3:
+            self.aliases = ['ipy3']
+        else:
+            self.aliases = ['ipy2', 'ipy']
+
+        Lexer.__init__(self, **options)
+
+        self.IPythonLexer = IPythonLexer(**options)
+        self.IPythonConsoleLexer = IPythonConsoleLexer(**options)
+
+    def get_tokens_unprocessed(self, text):
+        # Search for the input prompt anywhere...this allows code blocks to
+        # begin with comments as well.
+        if re.match(r'.*(In \[[0-9]+\]:)', text.strip(), re.DOTALL):
+            lex = self.IPythonConsoleLexer
+        else:
+            lex = self.IPythonLexer
+        for token in lex.get_tokens_unprocessed(text):
+            yield token
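
To see how the vendored module behaves end to end, here is a minimal sketch (not part of the diff above) that feeds an IPython console transcript through `IPyLexer` with one of Pygments' stock formatters; the `session` string is invented for illustration:

```python
from pygments import highlight
from pygments.formatters import TerminalFormatter

from IPython.lib.lexers import IPyLexer

# A console transcript: the "In [N]:" prompt is what IPyLexer keys on
# when deciding to delegate to the console lexer rather than the plain
# IPython lexer.
session = """\
In [1]: a = 'foo'

In [2]: a
Out[2]: 'foo'
"""

print(highlight(session, IPyLexer(), TerminalFormatter()))
```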
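`IPythonConsoleLexer.__init__` reads its options with `get_bool_opt` and `options.get`, so the Python version and the three prompt regexes can be overridden per instance. A sketch of driving it directly, assuming the default prompts:

```python
from IPython.lib.lexers import IPythonConsoleLexer

# python3=True parses the console inputs with the Python 3 lexer; custom
# prompt patterns could likewise be passed as in1_regex/in2_regex/out_regex
# (without the trailing space, as the docstring above instructs).
lexer = IPythonConsoleLexer(python3=True)

for index, token, text in lexer.get_tokens_unprocessed("In [1]: 1 + 1\nOut[1]: 2\n"):
    print(index, token, repr(text))
```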
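The module docstring notes that `IPyLexer` is "probably the only lexer that needs to be explicitly added to Pygments": Pygments only auto-discovers its bundled lexers (or ones registered through package entry points), so a consumer normally registers the class with its documentation tool. A hypothetical Sphinx `conf.py` hook, for instance:

```python
from IPython.lib.lexers import IPyLexer

def setup(app):
    # Make ``.. code-block:: ipy`` highlight with the combined lexer.
    # Sphinx >= 2.1 accepts the lexer class; older versions expected an
    # instance, i.e. app.add_lexer('ipy', IPyLexer()).
    app.add_lexer('ipy', IPyLexer)
```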