diff options
author | Mikhail Borisov <borisov.mikhail@gmail.com> | 2022-02-10 16:45:39 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:39 +0300 |
commit | a6a92afe03e02795227d2641b49819b687f088f8 (patch) | |
tree | f6984a1d27d5a7ec88a6fdd6e20cd5b7693b6ece /contrib/python/ipython/py2/IPython/lib/lexers.py | |
parent | c6dc8b8bd530985bc4cce0137e9a5de32f1087cb (diff) | |
download | ydb-a6a92afe03e02795227d2641b49819b687f088f8.tar.gz |
Restoring authorship annotation for Mikhail Borisov <borisov.mikhail@gmail.com>. Commit 1 of 2.
Diffstat (limited to 'contrib/python/ipython/py2/IPython/lib/lexers.py')
-rw-r--r-- | contrib/python/ipython/py2/IPython/lib/lexers.py | 1000 |
1 files changed, 500 insertions, 500 deletions
diff --git a/contrib/python/ipython/py2/IPython/lib/lexers.py b/contrib/python/ipython/py2/IPython/lib/lexers.py index 9160ae1245..ec43856115 100644 --- a/contrib/python/ipython/py2/IPython/lib/lexers.py +++ b/contrib/python/ipython/py2/IPython/lib/lexers.py @@ -1,517 +1,517 @@ -# -*- coding: utf-8 -*- -""" -Defines a variety of Pygments lexers for highlighting IPython code. - -This includes: - - IPythonLexer, IPython3Lexer - Lexers for pure IPython (python + magic/shell commands) - - IPythonPartialTracebackLexer, IPythonTracebackLexer - Supports 2.x and 3.x via keyword `python3`. The partial traceback - lexer reads everything but the Python code appearing in a traceback. - The full lexer combines the partial lexer with an IPython lexer. - - IPythonConsoleLexer - A lexer for IPython console sessions, with support for tracebacks. - - IPyLexer - A friendly lexer which examines the first line of text and from it, - decides whether to use an IPython lexer or an IPython console lexer. - This is probably the only lexer that needs to be explicitly added - to Pygments. - -""" -#----------------------------------------------------------------------------- -# Copyright (c) 2013, the IPython Development Team. -# -# Distributed under the terms of the Modified BSD License. -# -# The full license is in the file COPYING.txt, distributed with this software. -#----------------------------------------------------------------------------- - -# Standard library -import re - -# Third party +# -*- coding: utf-8 -*- +""" +Defines a variety of Pygments lexers for highlighting IPython code. + +This includes: + + IPythonLexer, IPython3Lexer + Lexers for pure IPython (python + magic/shell commands) + + IPythonPartialTracebackLexer, IPythonTracebackLexer + Supports 2.x and 3.x via keyword `python3`. The partial traceback + lexer reads everything but the Python code appearing in a traceback. + The full lexer combines the partial lexer with an IPython lexer. + + IPythonConsoleLexer + A lexer for IPython console sessions, with support for tracebacks. + + IPyLexer + A friendly lexer which examines the first line of text and from it, + decides whether to use an IPython lexer or an IPython console lexer. + This is probably the only lexer that needs to be explicitly added + to Pygments. + +""" +#----------------------------------------------------------------------------- +# Copyright (c) 2013, the IPython Development Team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +# Standard library +import re + +# Third party from pygments.lexers import BashLexer, Python3Lexer try: # PythonLexer was renamed to Python2Lexer in pygments 2.5 from pygments.lexers import Python2Lexer except ImportError: from pygments.lexers import PythonLexer as Python2Lexer -from pygments.lexer import ( - Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using, -) -from pygments.token import ( +from pygments.lexer import ( + Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, using, +) +from pygments.token import ( Generic, Keyword, Literal, Name, Operator, Other, Text, Error, -) -from pygments.util import get_bool_opt - -# Local - -line_re = re.compile('.*?\n') - -__all__ = ['build_ipy_lexer', 'IPython3Lexer', 'IPythonLexer', - 'IPythonPartialTracebackLexer', 'IPythonTracebackLexer', - 'IPythonConsoleLexer', 'IPyLexer'] - -ipython_tokens = [ - (r"(?s)(\s*)(%%)(\w+)(.*)", bygroups(Text, Operator, Keyword, Text)), - (r'(?s)(^\s*)(%%!)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(BashLexer))), - (r"(%%?)(\w+)(\?\??)$", bygroups(Operator, Keyword, Operator)), - (r"\b(\?\??)(\s*)$", bygroups(Operator, Text)), - (r'(%)(sx|sc|system)(.*)(\n)', bygroups(Operator, Keyword, - using(BashLexer), Text)), - (r'(%)(\w+)(.*\n)', bygroups(Operator, Keyword, Text)), - (r'^(!!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)), - (r'(!)(?!=)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)), - (r'^(\s*)(\?\??)(\s*%{0,2}[\w\.\*]*)', bygroups(Text, Operator, Text)), - (r'(\s*%{0,2}[\w\.\*]*)(\?\??)(\s*)$', bygroups(Text, Operator, Text)), -] - -def build_ipy_lexer(python3): - """Builds IPython lexers depending on the value of `python3`. - - The lexer inherits from an appropriate Python lexer and then adds - information about IPython specific keywords (i.e. magic commands, - shell commands, etc.) - - Parameters - ---------- - python3 : bool - If `True`, then build an IPython lexer from a Python 3 lexer. - - """ - # It would be nice to have a single IPython lexer class which takes - # a boolean `python3`. But since there are two Python lexer classes, - # we will also have two IPython lexer classes. - if python3: - PyLexer = Python3Lexer - name = 'IPython3' - aliases = ['ipython3'] - doc = """IPython3 Lexer""" - else: +) +from pygments.util import get_bool_opt + +# Local + +line_re = re.compile('.*?\n') + +__all__ = ['build_ipy_lexer', 'IPython3Lexer', 'IPythonLexer', + 'IPythonPartialTracebackLexer', 'IPythonTracebackLexer', + 'IPythonConsoleLexer', 'IPyLexer'] + +ipython_tokens = [ + (r"(?s)(\s*)(%%)(\w+)(.*)", bygroups(Text, Operator, Keyword, Text)), + (r'(?s)(^\s*)(%%!)([^\n]*\n)(.*)', bygroups(Text, Operator, Text, using(BashLexer))), + (r"(%%?)(\w+)(\?\??)$", bygroups(Operator, Keyword, Operator)), + (r"\b(\?\??)(\s*)$", bygroups(Operator, Text)), + (r'(%)(sx|sc|system)(.*)(\n)', bygroups(Operator, Keyword, + using(BashLexer), Text)), + (r'(%)(\w+)(.*\n)', bygroups(Operator, Keyword, Text)), + (r'^(!!)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)), + (r'(!)(?!=)(.+)(\n)', bygroups(Operator, using(BashLexer), Text)), + (r'^(\s*)(\?\??)(\s*%{0,2}[\w\.\*]*)', bygroups(Text, Operator, Text)), + (r'(\s*%{0,2}[\w\.\*]*)(\?\??)(\s*)$', bygroups(Text, Operator, Text)), +] + +def build_ipy_lexer(python3): + """Builds IPython lexers depending on the value of `python3`. + + The lexer inherits from an appropriate Python lexer and then adds + information about IPython specific keywords (i.e. magic commands, + shell commands, etc.) + + Parameters + ---------- + python3 : bool + If `True`, then build an IPython lexer from a Python 3 lexer. + + """ + # It would be nice to have a single IPython lexer class which takes + # a boolean `python3`. But since there are two Python lexer classes, + # we will also have two IPython lexer classes. + if python3: + PyLexer = Python3Lexer + name = 'IPython3' + aliases = ['ipython3'] + doc = """IPython3 Lexer""" + else: PyLexer = Python2Lexer - name = 'IPython' - aliases = ['ipython2', 'ipython'] - doc = """IPython Lexer""" - - tokens = PyLexer.tokens.copy() - tokens['root'] = ipython_tokens + tokens['root'] - - attrs = {'name': name, 'aliases': aliases, 'filenames': [], - '__doc__': doc, 'tokens': tokens} - - return type(name, (PyLexer,), attrs) - - -IPython3Lexer = build_ipy_lexer(python3=True) -IPythonLexer = build_ipy_lexer(python3=False) - - -class IPythonPartialTracebackLexer(RegexLexer): - """ - Partial lexer for IPython tracebacks. - - Handles all the non-python output. This works for both Python 2.x and 3.x. - - """ - name = 'IPython Partial Traceback' - - tokens = { - 'root': [ - # Tracebacks for syntax errors have a different style. - # For both types of tracebacks, we mark the first line with - # Generic.Traceback. For syntax errors, we mark the filename - # as we mark the filenames for non-syntax tracebacks. - # - # These two regexps define how IPythonConsoleLexer finds a - # traceback. - # - ## Non-syntax traceback - (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)), - ## Syntax traceback - (r'^( File)(.*)(, line )(\d+\n)', - bygroups(Generic.Traceback, Name.Namespace, - Generic.Traceback, Literal.Number.Integer)), - - # (Exception Identifier)(Whitespace)(Traceback Message) - (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)', - bygroups(Name.Exception, Generic.Whitespace, Text)), - # (Module/Filename)(Text)(Callee)(Function Signature) - # Better options for callee and function signature? - (r'(.*)( in )(.*)(\(.*\)\n)', - bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)), - # Regular line: (Whitespace)(Line Number)(Python Code) - (r'(\s*?)(\d+)(.*?\n)', - bygroups(Generic.Whitespace, Literal.Number.Integer, Other)), - # Emphasized line: (Arrow)(Line Number)(Python Code) - # Using Exception token so arrow color matches the Exception. - (r'(-*>?\s?)(\d+)(.*?\n)', - bygroups(Name.Exception, Literal.Number.Integer, Other)), - # (Exception Identifier)(Message) - (r'(?u)(^[^\d\W]\w*)(:.*?\n)', - bygroups(Name.Exception, Text)), - # Tag everything else as Other, will be handled later. - (r'.*\n', Other), - ], - } - - -class IPythonTracebackLexer(DelegatingLexer): - """ - IPython traceback lexer. - - For doctests, the tracebacks can be snipped as much as desired with the - exception to the lines that designate a traceback. For non-syntax error - tracebacks, this is the line of hyphens. For syntax error tracebacks, - this is the line which lists the File and line number. - - """ - # The lexer inherits from DelegatingLexer. The "root" lexer is an - # appropriate IPython lexer, which depends on the value of the boolean - # `python3`. First, we parse with the partial IPython traceback lexer. - # Then, any code marked with the "Other" token is delegated to the root - # lexer. - # - name = 'IPython Traceback' - aliases = ['ipythontb'] - - def __init__(self, **options): - self.python3 = get_bool_opt(options, 'python3', False) - if self.python3: - self.aliases = ['ipython3tb'] - else: - self.aliases = ['ipython2tb', 'ipythontb'] - - if self.python3: - IPyLexer = IPython3Lexer - else: - IPyLexer = IPythonLexer - - DelegatingLexer.__init__(self, IPyLexer, - IPythonPartialTracebackLexer, **options) - -class IPythonConsoleLexer(Lexer): - """ - An IPython console lexer for IPython code-blocks and doctests, such as: - - .. code-block:: rst - - .. code-block:: ipythonconsole - - In [1]: a = 'foo' - - In [2]: a - Out[2]: 'foo' - - In [3]: print a - foo - - In [4]: 1 / 0 - - - Support is also provided for IPython exceptions: - - .. code-block:: rst - - .. code-block:: ipythonconsole - - In [1]: raise Exception - - --------------------------------------------------------------------------- - Exception Traceback (most recent call last) - <ipython-input-1-fca2ab0ca76b> in <module>() - ----> 1 raise Exception - - Exception: - - """ - name = 'IPython console session' - aliases = ['ipythonconsole'] - mimetypes = ['text/x-ipython-console'] - - # The regexps used to determine what is input and what is output. - # The default prompts for IPython are: - # + name = 'IPython' + aliases = ['ipython2', 'ipython'] + doc = """IPython Lexer""" + + tokens = PyLexer.tokens.copy() + tokens['root'] = ipython_tokens + tokens['root'] + + attrs = {'name': name, 'aliases': aliases, 'filenames': [], + '__doc__': doc, 'tokens': tokens} + + return type(name, (PyLexer,), attrs) + + +IPython3Lexer = build_ipy_lexer(python3=True) +IPythonLexer = build_ipy_lexer(python3=False) + + +class IPythonPartialTracebackLexer(RegexLexer): + """ + Partial lexer for IPython tracebacks. + + Handles all the non-python output. This works for both Python 2.x and 3.x. + + """ + name = 'IPython Partial Traceback' + + tokens = { + 'root': [ + # Tracebacks for syntax errors have a different style. + # For both types of tracebacks, we mark the first line with + # Generic.Traceback. For syntax errors, we mark the filename + # as we mark the filenames for non-syntax tracebacks. + # + # These two regexps define how IPythonConsoleLexer finds a + # traceback. + # + ## Non-syntax traceback + (r'^(\^C)?(-+\n)', bygroups(Error, Generic.Traceback)), + ## Syntax traceback + (r'^( File)(.*)(, line )(\d+\n)', + bygroups(Generic.Traceback, Name.Namespace, + Generic.Traceback, Literal.Number.Integer)), + + # (Exception Identifier)(Whitespace)(Traceback Message) + (r'(?u)(^[^\d\W]\w*)(\s*)(Traceback.*?\n)', + bygroups(Name.Exception, Generic.Whitespace, Text)), + # (Module/Filename)(Text)(Callee)(Function Signature) + # Better options for callee and function signature? + (r'(.*)( in )(.*)(\(.*\)\n)', + bygroups(Name.Namespace, Text, Name.Entity, Name.Tag)), + # Regular line: (Whitespace)(Line Number)(Python Code) + (r'(\s*?)(\d+)(.*?\n)', + bygroups(Generic.Whitespace, Literal.Number.Integer, Other)), + # Emphasized line: (Arrow)(Line Number)(Python Code) + # Using Exception token so arrow color matches the Exception. + (r'(-*>?\s?)(\d+)(.*?\n)', + bygroups(Name.Exception, Literal.Number.Integer, Other)), + # (Exception Identifier)(Message) + (r'(?u)(^[^\d\W]\w*)(:.*?\n)', + bygroups(Name.Exception, Text)), + # Tag everything else as Other, will be handled later. + (r'.*\n', Other), + ], + } + + +class IPythonTracebackLexer(DelegatingLexer): + """ + IPython traceback lexer. + + For doctests, the tracebacks can be snipped as much as desired with the + exception to the lines that designate a traceback. For non-syntax error + tracebacks, this is the line of hyphens. For syntax error tracebacks, + this is the line which lists the File and line number. + + """ + # The lexer inherits from DelegatingLexer. The "root" lexer is an + # appropriate IPython lexer, which depends on the value of the boolean + # `python3`. First, we parse with the partial IPython traceback lexer. + # Then, any code marked with the "Other" token is delegated to the root + # lexer. + # + name = 'IPython Traceback' + aliases = ['ipythontb'] + + def __init__(self, **options): + self.python3 = get_bool_opt(options, 'python3', False) + if self.python3: + self.aliases = ['ipython3tb'] + else: + self.aliases = ['ipython2tb', 'ipythontb'] + + if self.python3: + IPyLexer = IPython3Lexer + else: + IPyLexer = IPythonLexer + + DelegatingLexer.__init__(self, IPyLexer, + IPythonPartialTracebackLexer, **options) + +class IPythonConsoleLexer(Lexer): + """ + An IPython console lexer for IPython code-blocks and doctests, such as: + + .. code-block:: rst + + .. code-block:: ipythonconsole + + In [1]: a = 'foo' + + In [2]: a + Out[2]: 'foo' + + In [3]: print a + foo + + In [4]: 1 / 0 + + + Support is also provided for IPython exceptions: + + .. code-block:: rst + + .. code-block:: ipythonconsole + + In [1]: raise Exception + + --------------------------------------------------------------------------- + Exception Traceback (most recent call last) + <ipython-input-1-fca2ab0ca76b> in <module>() + ----> 1 raise Exception + + Exception: + + """ + name = 'IPython console session' + aliases = ['ipythonconsole'] + mimetypes = ['text/x-ipython-console'] + + # The regexps used to determine what is input and what is output. + # The default prompts for IPython are: + # # in = 'In [#]: ' # continuation = ' .D.: ' # template = 'Out[#]: ' - # + # # Where '#' is the 'prompt number' or 'execution count' and 'D' # D is a number of dots matching the width of the execution count # - in1_regex = r'In \[[0-9]+\]: ' - in2_regex = r' \.\.+\.: ' - out_regex = r'Out\[[0-9]+\]: ' - - #: The regex to determine when a traceback starts. - ipytb_start = re.compile(r'^(\^C)?(-+\n)|^( File)(.*)(, line )(\d+\n)') - - def __init__(self, **options): - """Initialize the IPython console lexer. - - Parameters - ---------- - python3 : bool - If `True`, then the console inputs are parsed using a Python 3 - lexer. Otherwise, they are parsed using a Python 2 lexer. - in1_regex : RegexObject - The compiled regular expression used to detect the start - of inputs. Although the IPython configuration setting may have a - trailing whitespace, do not include it in the regex. If `None`, - then the default input prompt is assumed. - in2_regex : RegexObject - The compiled regular expression used to detect the continuation - of inputs. Although the IPython configuration setting may have a - trailing whitespace, do not include it in the regex. If `None`, - then the default input prompt is assumed. - out_regex : RegexObject - The compiled regular expression used to detect outputs. If `None`, - then the default output prompt is assumed. - - """ - self.python3 = get_bool_opt(options, 'python3', False) - if self.python3: - self.aliases = ['ipython3console'] - else: - self.aliases = ['ipython2console', 'ipythonconsole'] - - in1_regex = options.get('in1_regex', self.in1_regex) - in2_regex = options.get('in2_regex', self.in2_regex) - out_regex = options.get('out_regex', self.out_regex) - - # So that we can work with input and output prompts which have been - # rstrip'd (possibly by editors) we also need rstrip'd variants. If - # we do not do this, then such prompts will be tagged as 'output'. - # The reason can't just use the rstrip'd variants instead is because - # we want any whitespace associated with the prompt to be inserted - # with the token. This allows formatted code to be modified so as hide - # the appearance of prompts, with the whitespace included. One example - # use of this is in copybutton.js from the standard lib Python docs. - in1_regex_rstrip = in1_regex.rstrip() + '\n' - in2_regex_rstrip = in2_regex.rstrip() + '\n' - out_regex_rstrip = out_regex.rstrip() + '\n' - - # Compile and save them all. - attrs = ['in1_regex', 'in2_regex', 'out_regex', - 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip'] - for attr in attrs: - self.__setattr__(attr, re.compile(locals()[attr])) - - Lexer.__init__(self, **options) - - if self.python3: - pylexer = IPython3Lexer - tblexer = IPythonTracebackLexer - else: - pylexer = IPythonLexer - tblexer = IPythonTracebackLexer - - self.pylexer = pylexer(**options) - self.tblexer = tblexer(**options) - - self.reset() - - def reset(self): - self.mode = 'output' - self.index = 0 - self.buffer = u'' - self.insertions = [] - - def buffered_tokens(self): - """ - Generator of unprocessed tokens after doing insertions and before - changing to a new state. - - """ - if self.mode == 'output': - tokens = [(0, Generic.Output, self.buffer)] - elif self.mode == 'input': - tokens = self.pylexer.get_tokens_unprocessed(self.buffer) - else: # traceback - tokens = self.tblexer.get_tokens_unprocessed(self.buffer) - - for i, t, v in do_insertions(self.insertions, tokens): - # All token indexes are relative to the buffer. - yield self.index + i, t, v - - # Clear it all - self.index += len(self.buffer) - self.buffer = u'' - self.insertions = [] - - def get_mci(self, line): - """ - Parses the line and returns a 3-tuple: (mode, code, insertion). - - `mode` is the next mode (or state) of the lexer, and is always equal - to 'input', 'output', or 'tb'. - - `code` is a portion of the line that should be added to the buffer - corresponding to the next mode and eventually lexed by another lexer. - For example, `code` could be Python code if `mode` were 'input'. - - `insertion` is a 3-tuple (index, token, text) representing an - unprocessed "token" that will be inserted into the stream of tokens - that are created from the buffer once we change modes. This is usually - the input or output prompt. - - In general, the next mode depends on current mode and on the contents - of `line`. - - """ - # To reduce the number of regex match checks, we have multiple - # 'if' blocks instead of 'if-elif' blocks. - - # Check for possible end of input - in2_match = self.in2_regex.match(line) - in2_match_rstrip = self.in2_regex_rstrip.match(line) - if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \ - in2_match_rstrip: - end_input = True - else: - end_input = False - if end_input and self.mode != 'tb': - # Only look for an end of input when not in tb mode. - # An ellipsis could appear within the traceback. - mode = 'output' - code = u'' - insertion = (0, Generic.Prompt, line) - return mode, code, insertion - - # Check for output prompt - out_match = self.out_regex.match(line) - out_match_rstrip = self.out_regex_rstrip.match(line) - if out_match or out_match_rstrip: - mode = 'output' - if out_match: - idx = out_match.end() - else: - idx = out_match_rstrip.end() - code = line[idx:] - # Use the 'heading' token for output. We cannot use Generic.Error - # since it would conflict with exceptions. - insertion = (0, Generic.Heading, line[:idx]) - return mode, code, insertion - - - # Check for input or continuation prompt (non stripped version) - in1_match = self.in1_regex.match(line) - if in1_match or (in2_match and self.mode != 'tb'): - # New input or when not in tb, continued input. - # We do not check for continued input when in tb since it is - # allowable to replace a long stack with an ellipsis. - mode = 'input' - if in1_match: - idx = in1_match.end() - else: # in2_match - idx = in2_match.end() - code = line[idx:] - insertion = (0, Generic.Prompt, line[:idx]) - return mode, code, insertion - - # Check for input or continuation prompt (stripped version) - in1_match_rstrip = self.in1_regex_rstrip.match(line) - if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'): - # New input or when not in tb, continued input. - # We do not check for continued input when in tb since it is - # allowable to replace a long stack with an ellipsis. - mode = 'input' - if in1_match_rstrip: - idx = in1_match_rstrip.end() - else: # in2_match - idx = in2_match_rstrip.end() - code = line[idx:] - insertion = (0, Generic.Prompt, line[:idx]) - return mode, code, insertion - - # Check for traceback - if self.ipytb_start.match(line): - mode = 'tb' - code = line - insertion = None - return mode, code, insertion - - # All other stuff... - if self.mode in ('input', 'output'): - # We assume all other text is output. Multiline input that - # does not use the continuation marker cannot be detected. - # For example, the 3 in the following is clearly output: - # - # In [1]: print 3 - # 3 - # - # But the following second line is part of the input: - # - # In [2]: while True: - # print True - # - # In both cases, the 2nd line will be 'output'. - # - mode = 'output' - else: - mode = 'tb' - - code = line - insertion = None - - return mode, code, insertion - - def get_tokens_unprocessed(self, text): - self.reset() - for match in line_re.finditer(text): - line = match.group() - mode, code, insertion = self.get_mci(line) - - if mode != self.mode: - # Yield buffered tokens before transitioning to new mode. - for token in self.buffered_tokens(): - yield token - self.mode = mode - - if insertion: - self.insertions.append((len(self.buffer), [insertion])) - self.buffer += code - + in1_regex = r'In \[[0-9]+\]: ' + in2_regex = r' \.\.+\.: ' + out_regex = r'Out\[[0-9]+\]: ' + + #: The regex to determine when a traceback starts. + ipytb_start = re.compile(r'^(\^C)?(-+\n)|^( File)(.*)(, line )(\d+\n)') + + def __init__(self, **options): + """Initialize the IPython console lexer. + + Parameters + ---------- + python3 : bool + If `True`, then the console inputs are parsed using a Python 3 + lexer. Otherwise, they are parsed using a Python 2 lexer. + in1_regex : RegexObject + The compiled regular expression used to detect the start + of inputs. Although the IPython configuration setting may have a + trailing whitespace, do not include it in the regex. If `None`, + then the default input prompt is assumed. + in2_regex : RegexObject + The compiled regular expression used to detect the continuation + of inputs. Although the IPython configuration setting may have a + trailing whitespace, do not include it in the regex. If `None`, + then the default input prompt is assumed. + out_regex : RegexObject + The compiled regular expression used to detect outputs. If `None`, + then the default output prompt is assumed. + + """ + self.python3 = get_bool_opt(options, 'python3', False) + if self.python3: + self.aliases = ['ipython3console'] + else: + self.aliases = ['ipython2console', 'ipythonconsole'] + + in1_regex = options.get('in1_regex', self.in1_regex) + in2_regex = options.get('in2_regex', self.in2_regex) + out_regex = options.get('out_regex', self.out_regex) + + # So that we can work with input and output prompts which have been + # rstrip'd (possibly by editors) we also need rstrip'd variants. If + # we do not do this, then such prompts will be tagged as 'output'. + # The reason can't just use the rstrip'd variants instead is because + # we want any whitespace associated with the prompt to be inserted + # with the token. This allows formatted code to be modified so as hide + # the appearance of prompts, with the whitespace included. One example + # use of this is in copybutton.js from the standard lib Python docs. + in1_regex_rstrip = in1_regex.rstrip() + '\n' + in2_regex_rstrip = in2_regex.rstrip() + '\n' + out_regex_rstrip = out_regex.rstrip() + '\n' + + # Compile and save them all. + attrs = ['in1_regex', 'in2_regex', 'out_regex', + 'in1_regex_rstrip', 'in2_regex_rstrip', 'out_regex_rstrip'] + for attr in attrs: + self.__setattr__(attr, re.compile(locals()[attr])) + + Lexer.__init__(self, **options) + + if self.python3: + pylexer = IPython3Lexer + tblexer = IPythonTracebackLexer + else: + pylexer = IPythonLexer + tblexer = IPythonTracebackLexer + + self.pylexer = pylexer(**options) + self.tblexer = tblexer(**options) + + self.reset() + + def reset(self): + self.mode = 'output' + self.index = 0 + self.buffer = u'' + self.insertions = [] + + def buffered_tokens(self): + """ + Generator of unprocessed tokens after doing insertions and before + changing to a new state. + + """ + if self.mode == 'output': + tokens = [(0, Generic.Output, self.buffer)] + elif self.mode == 'input': + tokens = self.pylexer.get_tokens_unprocessed(self.buffer) + else: # traceback + tokens = self.tblexer.get_tokens_unprocessed(self.buffer) + + for i, t, v in do_insertions(self.insertions, tokens): + # All token indexes are relative to the buffer. + yield self.index + i, t, v + + # Clear it all + self.index += len(self.buffer) + self.buffer = u'' + self.insertions = [] + + def get_mci(self, line): + """ + Parses the line and returns a 3-tuple: (mode, code, insertion). + + `mode` is the next mode (or state) of the lexer, and is always equal + to 'input', 'output', or 'tb'. + + `code` is a portion of the line that should be added to the buffer + corresponding to the next mode and eventually lexed by another lexer. + For example, `code` could be Python code if `mode` were 'input'. + + `insertion` is a 3-tuple (index, token, text) representing an + unprocessed "token" that will be inserted into the stream of tokens + that are created from the buffer once we change modes. This is usually + the input or output prompt. + + In general, the next mode depends on current mode and on the contents + of `line`. + + """ + # To reduce the number of regex match checks, we have multiple + # 'if' blocks instead of 'if-elif' blocks. + + # Check for possible end of input + in2_match = self.in2_regex.match(line) + in2_match_rstrip = self.in2_regex_rstrip.match(line) + if (in2_match and in2_match.group().rstrip() == line.rstrip()) or \ + in2_match_rstrip: + end_input = True + else: + end_input = False + if end_input and self.mode != 'tb': + # Only look for an end of input when not in tb mode. + # An ellipsis could appear within the traceback. + mode = 'output' + code = u'' + insertion = (0, Generic.Prompt, line) + return mode, code, insertion + + # Check for output prompt + out_match = self.out_regex.match(line) + out_match_rstrip = self.out_regex_rstrip.match(line) + if out_match or out_match_rstrip: + mode = 'output' + if out_match: + idx = out_match.end() + else: + idx = out_match_rstrip.end() + code = line[idx:] + # Use the 'heading' token for output. We cannot use Generic.Error + # since it would conflict with exceptions. + insertion = (0, Generic.Heading, line[:idx]) + return mode, code, insertion + + + # Check for input or continuation prompt (non stripped version) + in1_match = self.in1_regex.match(line) + if in1_match or (in2_match and self.mode != 'tb'): + # New input or when not in tb, continued input. + # We do not check for continued input when in tb since it is + # allowable to replace a long stack with an ellipsis. + mode = 'input' + if in1_match: + idx = in1_match.end() + else: # in2_match + idx = in2_match.end() + code = line[idx:] + insertion = (0, Generic.Prompt, line[:idx]) + return mode, code, insertion + + # Check for input or continuation prompt (stripped version) + in1_match_rstrip = self.in1_regex_rstrip.match(line) + if in1_match_rstrip or (in2_match_rstrip and self.mode != 'tb'): + # New input or when not in tb, continued input. + # We do not check for continued input when in tb since it is + # allowable to replace a long stack with an ellipsis. + mode = 'input' + if in1_match_rstrip: + idx = in1_match_rstrip.end() + else: # in2_match + idx = in2_match_rstrip.end() + code = line[idx:] + insertion = (0, Generic.Prompt, line[:idx]) + return mode, code, insertion + + # Check for traceback + if self.ipytb_start.match(line): + mode = 'tb' + code = line + insertion = None + return mode, code, insertion + + # All other stuff... + if self.mode in ('input', 'output'): + # We assume all other text is output. Multiline input that + # does not use the continuation marker cannot be detected. + # For example, the 3 in the following is clearly output: + # + # In [1]: print 3 + # 3 + # + # But the following second line is part of the input: + # + # In [2]: while True: + # print True + # + # In both cases, the 2nd line will be 'output'. + # + mode = 'output' + else: + mode = 'tb' + + code = line + insertion = None + + return mode, code, insertion + + def get_tokens_unprocessed(self, text): + self.reset() + for match in line_re.finditer(text): + line = match.group() + mode, code, insertion = self.get_mci(line) + + if mode != self.mode: + # Yield buffered tokens before transitioning to new mode. + for token in self.buffered_tokens(): + yield token + self.mode = mode + + if insertion: + self.insertions.append((len(self.buffer), [insertion])) + self.buffer += code + for token in self.buffered_tokens(): yield token -class IPyLexer(Lexer): - """ - Primary lexer for all IPython-like code. - - This is a simple helper lexer. If the first line of the text begins with - "In \[[0-9]+\]:", then the entire text is parsed with an IPython console - lexer. If not, then the entire text is parsed with an IPython lexer. - - The goal is to reduce the number of lexers that are registered - with Pygments. - - """ - name = 'IPy session' - aliases = ['ipy'] - - def __init__(self, **options): - self.python3 = get_bool_opt(options, 'python3', False) - if self.python3: - self.aliases = ['ipy3'] - else: - self.aliases = ['ipy2', 'ipy'] - - Lexer.__init__(self, **options) - - self.IPythonLexer = IPythonLexer(**options) - self.IPythonConsoleLexer = IPythonConsoleLexer(**options) - - def get_tokens_unprocessed(self, text): - # Search for the input prompt anywhere...this allows code blocks to - # begin with comments as well. - if re.match(r'.*(In \[[0-9]+\]:)', text.strip(), re.DOTALL): - lex = self.IPythonConsoleLexer - else: - lex = self.IPythonLexer - for token in lex.get_tokens_unprocessed(text): - yield token - +class IPyLexer(Lexer): + """ + Primary lexer for all IPython-like code. + + This is a simple helper lexer. If the first line of the text begins with + "In \[[0-9]+\]:", then the entire text is parsed with an IPython console + lexer. If not, then the entire text is parsed with an IPython lexer. + + The goal is to reduce the number of lexers that are registered + with Pygments. + + """ + name = 'IPy session' + aliases = ['ipy'] + + def __init__(self, **options): + self.python3 = get_bool_opt(options, 'python3', False) + if self.python3: + self.aliases = ['ipy3'] + else: + self.aliases = ['ipy2', 'ipy'] + + Lexer.__init__(self, **options) + + self.IPythonLexer = IPythonLexer(**options) + self.IPythonConsoleLexer = IPythonConsoleLexer(**options) + + def get_tokens_unprocessed(self, text): + # Search for the input prompt anywhere...this allows code blocks to + # begin with comments as well. + if re.match(r'.*(In \[[0-9]+\]:)', text.strip(), re.DOTALL): + lex = self.IPythonConsoleLexer + else: + lex = self.IPythonLexer + for token in lex.get_tokens_unprocessed(text): + yield token + |