diff options
author | arcadia-devtools <arcadia-devtools@yandex-team.ru> | 2022-06-09 14:39:19 +0300 |
---|---|---|
committer | arcadia-devtools <arcadia-devtools@yandex-team.ru> | 2022-06-09 14:39:19 +0300 |
commit | c04b663c7bb4b750deeb8f48f620497ec13da8fa (patch) | |
tree | 151ebc8bfdd2ad918caf5e6e2d8013e14272ddf8 /contrib/python/Pygments/py3/pygments/lexers/lisp.py | |
parent | 0d55ca22c507d18c2f35718687e0b06d9915397b (diff) | |
download | ydb-c04b663c7bb4b750deeb8f48f620497ec13da8fa.tar.gz |
intermediate changes
ref:2d4f292087954c9344efdabb7b2a67f466263c65
Diffstat (limited to 'contrib/python/Pygments/py3/pygments/lexers/lisp.py')
-rw-r--r-- | contrib/python/Pygments/py3/pygments/lexers/lisp.py | 314 |
1 files changed, 212 insertions, 102 deletions
diff --git a/contrib/python/Pygments/py3/pygments/lexers/lisp.py b/contrib/python/Pygments/py3/pygments/lexers/lisp.py index 5628e336ca4..6dab1fdaea7 100644 --- a/contrib/python/Pygments/py3/pygments/lexers/lisp.py +++ b/contrib/python/Pygments/py3/pygments/lexers/lisp.py @@ -4,7 +4,7 @@ Lexers for Lispy languages. - :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS. + :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS. :license: BSD, see LICENSE for details. """ @@ -12,21 +12,19 @@ import re from pygments.lexer import RegexLexer, include, bygroups, words, default from pygments.token import Text, Comment, Operator, Keyword, Name, String, \ - Number, Punctuation, Literal, Error + Number, Punctuation, Literal, Error, Whitespace from pygments.lexers.python import PythonLexer +from pygments.lexers._scheme_builtins import scheme_keywords, scheme_builtins + __all__ = ['SchemeLexer', 'CommonLispLexer', 'HyLexer', 'RacketLexer', 'NewLispLexer', 'EmacsLispLexer', 'ShenLexer', 'CPSALexer', 'XtlangLexer', 'FennelLexer'] - class SchemeLexer(RegexLexer): """ - A Scheme lexer, parsing a stream and outputting the tokens - needed to highlight scheme code. - This lexer could be most probably easily subclassed to parse - other LISP-Dialects like Common Lisp, Emacs Lisp or AutoLisp. + A Scheme lexer. This parser is checked with pastes from the LISP pastebin at http://paste.lisp.org/ to cover as much syntax as possible. @@ -36,65 +34,154 @@ class SchemeLexer(RegexLexer): .. versionadded:: 0.6 """ name = 'Scheme' + url = 'http://www.scheme-reports.org/' aliases = ['scheme', 'scm'] filenames = ['*.scm', '*.ss'] mimetypes = ['text/x-scheme', 'application/x-scheme'] flags = re.DOTALL | re.MULTILINE - # list of known keywords and builtins taken form vim 6.4 scheme.vim - # syntax file. 
- keywords = ( - 'lambda', 'define', 'if', 'else', 'cond', 'and', 'or', 'case', 'let', - 'let*', 'letrec', 'begin', 'do', 'delay', 'set!', '=>', 'quote', - 'quasiquote', 'unquote', 'unquote-splicing', 'define-syntax', - 'let-syntax', 'letrec-syntax', 'syntax-rules' - ) - builtins = ( - '*', '+', '-', '/', '<', '<=', '=', '>', '>=', 'abs', 'acos', 'angle', - 'append', 'apply', 'asin', 'assoc', 'assq', 'assv', 'atan', - 'boolean?', 'caaaar', 'caaadr', 'caaar', 'caadar', 'caaddr', 'caadr', - 'caar', 'cadaar', 'cadadr', 'cadar', 'caddar', 'cadddr', 'caddr', - 'cadr', 'call-with-current-continuation', 'call-with-input-file', - 'call-with-output-file', 'call-with-values', 'call/cc', 'car', - 'cdaaar', 'cdaadr', 'cdaar', 'cdadar', 'cdaddr', 'cdadr', 'cdar', - 'cddaar', 'cddadr', 'cddar', 'cdddar', 'cddddr', 'cdddr', 'cddr', - 'cdr', 'ceiling', 'char->integer', 'char-alphabetic?', 'char-ci<=?', - 'char-ci<?', 'char-ci=?', 'char-ci>=?', 'char-ci>?', 'char-downcase', - 'char-lower-case?', 'char-numeric?', 'char-ready?', 'char-upcase', - 'char-upper-case?', 'char-whitespace?', 'char<=?', 'char<?', 'char=?', - 'char>=?', 'char>?', 'char?', 'close-input-port', 'close-output-port', - 'complex?', 'cons', 'cos', 'current-input-port', 'current-output-port', - 'denominator', 'display', 'dynamic-wind', 'eof-object?', 'eq?', - 'equal?', 'eqv?', 'eval', 'even?', 'exact->inexact', 'exact?', 'exp', - 'expt', 'floor', 'for-each', 'force', 'gcd', 'imag-part', - 'inexact->exact', 'inexact?', 'input-port?', 'integer->char', - 'integer?', 'interaction-environment', 'lcm', 'length', 'list', - 'list->string', 'list->vector', 'list-ref', 'list-tail', 'list?', - 'load', 'log', 'magnitude', 'make-polar', 'make-rectangular', - 'make-string', 'make-vector', 'map', 'max', 'member', 'memq', 'memv', - 'min', 'modulo', 'negative?', 'newline', 'not', 'null-environment', - 'null?', 'number->string', 'number?', 'numerator', 'odd?', - 'open-input-file', 'open-output-file', 'output-port?', 'pair?', - 
'peek-char', 'port?', 'positive?', 'procedure?', 'quotient', - 'rational?', 'rationalize', 'read', 'read-char', 'real-part', 'real?', - 'remainder', 'reverse', 'round', 'scheme-report-environment', - 'set-car!', 'set-cdr!', 'sin', 'sqrt', 'string', 'string->list', - 'string->number', 'string->symbol', 'string-append', 'string-ci<=?', - 'string-ci<?', 'string-ci=?', 'string-ci>=?', 'string-ci>?', - 'string-copy', 'string-fill!', 'string-length', 'string-ref', - 'string-set!', 'string<=?', 'string<?', 'string=?', 'string>=?', - 'string>?', 'string?', 'substring', 'symbol->string', 'symbol?', - 'tan', 'transcript-off', 'transcript-on', 'truncate', 'values', - 'vector', 'vector->list', 'vector-fill!', 'vector-length', - 'vector-ref', 'vector-set!', 'vector?', 'with-input-from-file', - 'with-output-to-file', 'write', 'write-char', 'zero?' - ) # valid names for identifiers # well, names can only not consist fully of numbers # but this should be good enough for now valid_name = r'[\w!$%&*+,/:<=>?@^~|-]+' + # Use within verbose regexes + token_end = r''' + (?= + \s # whitespace + | ; # comment + | \#[;|!] # fancy comments + | [)\]] # end delimiters + | $ # end of file + ) + ''' + + # Recognizing builtins. + def get_tokens_unprocessed(self, text): + for index, token, value in super().get_tokens_unprocessed(text): + if token is Name.Function or token is Name.Variable: + if value in scheme_keywords: + yield index, Keyword, value + elif value in scheme_builtins: + yield index, Name.Builtin, value + else: + yield index, token, value + else: + yield index, token, value + + # Scheme has funky syntactic rules for numbers. These are all + # valid number literals: 5.0e55|14, 14/13, -1+5j, +1@5, #b110, + # #o#Iinf.0-nan.0i. This is adapted from the formal grammar given + # in http://www.r6rs.org/final/r6rs.pdf, section 4.2.1. Take a + # deep breath ... + + # It would be simpler if we could just not bother about invalid + # numbers like #b35. 
But we cannot parse 'abcdef' without #x as a + # number. + + number_rules = {} + for base in (2, 8, 10, 16): + if base == 2: + digit = r'[01]' + radix = r'( \#[bB] )' + elif base == 8: + digit = r'[0-7]' + radix = r'( \#[oO] )' + elif base == 10: + digit = r'[0-9]' + radix = r'( (\#[dD])? )' + elif base == 16: + digit = r'[0-9a-fA-F]' + radix = r'( \#[xX] )' + + # Radix, optional exactness indicator. + prefix = rf''' + ( + {radix} (\#[iIeE])? + | \#[iIeE] {radix} + ) + ''' + + # Simple unsigned number or fraction. + ureal = rf''' + ( + {digit}+ + ( / {digit}+ )? + ) + ''' + + # Add decimal numbers. + if base == 10: + decimal = r''' + ( + # Decimal part + ( + [0-9]+ ([.][0-9]*)? + | [.][0-9]+ + ) + + # Optional exponent + ( + [eEsSfFdDlL] [+-]? [0-9]+ + )? + + # Optional mantissa width + ( + \|[0-9]+ + )? + ) + ''' + ureal = rf''' + ( + {decimal} (?!/) + | {ureal} + ) + ''' + + naninf = r'(nan.0|inf.0)' + + real = rf''' + ( + [+-] {naninf} # Sign mandatory + | [+-]? {ureal} # Sign optional + ) + ''' + + complex_ = rf''' + ( + {real}? [+-] ({naninf}|{ureal})? i + | {real} (@ {real})? + + ) + ''' + + num = rf'''(?x) + ( + {prefix} + {complex_} + ) + # Need to ensure we have a full token. 1+ is not a + # number followed by something else, but a function + # name. + {token_end} + ''' + + number_rules[base] = num + + # If you have a headache now, say thanks to RnRS editors. + + # Doing it this way is simpler than splitting the number(10) + # regex in a floating-point and a no-floating-point version. + def decimal_cb(self, match): + if '.' in match.group(): + token_type = Number.Float # includes [+-](inf|nan).0 + else: + token_type = Number.Integer + yield match.start(), token_type, match.group() + + # -- + # The 'scheme-root' state parses as many expressions as needed, always # delegating to the 'scheme-value' state. The latter parses one complete # expression and immediately pops back. This is needed for the LilyPondLexer. 
@@ -119,25 +206,27 @@ class SchemeLexer(RegexLexer): (r';.*?$', Comment.Single), # multi-line comment (r'#\|', Comment.Multiline, 'multiline-comment'), - # commented form (entire sexpr folliwng) - (r'#;\s*\(', Comment, 'commented-form'), + # commented form (entire sexpr following) + (r'#;[([]', Comment, 'commented-form'), + # commented datum + (r'#;', Comment, 'commented-datum'), # signifies that the program text that follows is written with the # lexical and datum syntax described in r6rs (r'#!r6rs', Comment), # whitespaces - usually not relevant - (r'\s+', Text), + (r'\s+', Whitespace), # numbers - (r'-?\d+\.\d+', Number.Float, '#pop'), - (r'-?\d+', Number.Integer, '#pop'), - # support for uncommon kinds of numbers - - # have to figure out what the characters mean - # (r'(#e|#i|#b|#o|#d|#x)[\d.]+', Number), + (number_rules[2], Number.Bin, '#pop'), + (number_rules[8], Number.Oct, '#pop'), + (number_rules[10], decimal_cb, '#pop'), + (number_rules[16], Number.Hex, '#pop'), - # strings, symbols and characters - (r'"(\\\\|\\[^\\]|[^"\\])*"', String, "#pop"), + # strings, symbols, keywords and characters + (r'"', String, 'string'), (r"'" + valid_name, String.Symbol, "#pop"), + (r'#:' + valid_name, Keyword.Declaration, '#pop'), (r"#\\([()/'\"._!§$%& ?=+-]|[a-zA-Z0-9]+)", String.Char, "#pop"), # constants @@ -146,23 +235,16 @@ class SchemeLexer(RegexLexer): # special operators (r"('|#|`|,@|,|\.)", Operator), - # highlight the keywords - ('(%s)' % '|'.join(re.escape(entry) + ' ' for entry in keywords), - Keyword, - '#pop'), - # first variable in a quoted string like # '(this is syntactic sugar) (r"(?<='\()" + valid_name, Name.Variable, '#pop'), (r"(?<=#\()" + valid_name, Name.Variable, '#pop'), - # highlight the builtins - (r"(?<=\()(%s)" % '|'.join(re.escape(entry) + ' ' for entry in builtins), - Name.Builtin, - '#pop'), - - # the remaining functions + # Functions -- note that this also catches variables + # defined in let/let*, but there is little that can + # be done 
about it. (r'(?<=\()' + valid_name, Name.Function, '#pop'), + # find the remaining variables (valid_name, Name.Variable, '#pop'), @@ -170,11 +252,11 @@ class SchemeLexer(RegexLexer): # Push scheme-root to enter a state that will parse as many things # as needed in the parentheses. - (r'\(|\[', Punctuation, 'scheme-root'), + (r'[([]', Punctuation, 'scheme-root'), # Pop one 'value', one 'scheme-root', and yet another 'value', so # we get back to a state parsing expressions as needed in the # enclosing context. - (r'\)|\]', Punctuation, '#pop:3'), + (r'[)\]]', Punctuation, '#pop:3'), ], 'multiline-comment': [ (r'#\|', Comment.Multiline, '#push'), @@ -183,10 +265,30 @@ class SchemeLexer(RegexLexer): (r'[|#]', Comment.Multiline), ], 'commented-form': [ - (r'\(', Comment, '#push'), - (r'\)', Comment, '#pop'), - (r'[^()]+', Comment), + (r'[([]', Comment, '#push'), + (r'[)\]]', Comment, '#pop'), + (r'[^()[\]]+', Comment), ], + 'commented-datum': [ + (rf'(?x).*?{token_end}', Comment, '#pop'), + ], + 'string': [ + # Pops back from 'string', and pops 'value' as well. + ('"', String, '#pop:2'), + # Hex escape sequences, R6RS-style. + (r'\\x[0-9a-fA-F]+;', String.Escape), + # We try R6RS style first, but fall back to Guile-style. + (r'\\x[0-9a-fA-F]{2}', String.Escape), + # Other special escape sequences implemented by Guile. + (r'\\u[0-9a-fA-F]{4}', String.Escape), + (r'\\U[0-9a-fA-F]{6}', String.Escape), + # Escape sequences are not overly standardized. Recognizing + # a single character after the backslash should be good enough. + # NB: we have DOTALL. + (r'\\.', String.Escape), + # The rest + (r'[^\\"]+', String), + ] } @@ -197,6 +299,7 @@ class CommonLispLexer(RegexLexer): .. 
versionadded:: 0.9 """ name = 'Common Lisp' + url = 'https://lisp-lang.org/' aliases = ['common-lisp', 'cl', 'lisp'] filenames = ['*.cl', '*.lisp'] mimetypes = ['text/x-common-lisp'] @@ -271,7 +374,7 @@ class CommonLispLexer(RegexLexer): ], 'body': [ # whitespace - (r'\s+', Text), + (r'\s+', Whitespace), # single-line comment (r';.*$', Comment.Single), @@ -370,11 +473,12 @@ class CommonLispLexer(RegexLexer): class HyLexer(RegexLexer): """ - Lexer for `Hy <http://hylang.org/>`_ source code. + Lexer for Hy source code. .. versionadded:: 2.0 """ name = 'Hy' + url = 'http://hylang.org/' aliases = ['hylang'] filenames = ['*.hy'] mimetypes = ['text/x-hy', 'application/x-hy'] @@ -419,7 +523,8 @@ class HyLexer(RegexLexer): (r';.*$', Comment.Single), # whitespaces - usually not relevant - (r'[,\s]+', Text), + (r',+', Text), + (r'\s+', Whitespace), # numbers (r'-?\d+\.\d+', Number.Float), @@ -483,13 +588,14 @@ class HyLexer(RegexLexer): class RacketLexer(RegexLexer): """ - Lexer for `Racket <http://racket-lang.org/>`_ source code (formerly + Lexer for Racket source code (formerly known as PLT Scheme). .. versionadded:: 1.6 """ name = 'Racket' + url = 'http://racket-lang.org/' aliases = ['racket', 'rkt'] filenames = ['*.rkt', '*.rktd', '*.rktl'] mimetypes = ['text/x-racket', 'application/x-racket'] @@ -1299,7 +1405,7 @@ class RacketLexer(RegexLexer): (r'#\|', Comment.Multiline, 'block-comment'), # Whitespaces - (r'(?u)\s+', Text), + (r'(?u)\s+', Whitespace), # Numbers: Keep in mind Racket reader hash prefixes, which # can denote the base or the type. These don't map neatly @@ -1348,7 +1454,7 @@ class RacketLexer(RegexLexer): (r'#(true|false|[tTfF])', Name.Constant, '#pop'), # Keyword argument names (e.g. 
#:keyword) - (r'(?u)#:%s' % _symbol, Keyword.Declaration, '#pop'), + (r'#:%s' % _symbol, Keyword.Declaration, '#pop'), # Reader extensions (r'(#lang |#!)(\S+)', @@ -1377,9 +1483,9 @@ class RacketLexer(RegexLexer): (r'quasiquote(?=[%s])' % _delimiters, Keyword, ('#pop', 'quasiquoted-datum')), (_opening_parenthesis, Punctuation, ('#pop', 'unquoted-list')), - (words(_keywords, prefix='(?u)', suffix='(?=[%s])' % _delimiters), + (words(_keywords, suffix='(?=[%s])' % _delimiters), Keyword, '#pop'), - (words(_builtins, prefix='(?u)', suffix='(?=[%s])' % _delimiters), + (words(_builtins, suffix='(?=[%s])' % _delimiters), Name.Builtin, '#pop'), (_symbol, Name, '#pop'), include('datum*') @@ -1425,17 +1531,18 @@ class RacketLexer(RegexLexer): class NewLispLexer(RegexLexer): """ - For `newLISP. <http://www.newlisp.org/>`_ source code (version 10.3.0). + For newLISP source code (version 10.3.0). .. versionadded:: 1.5 """ name = 'NewLisp' + url = 'http://www.newlisp.org/' aliases = ['newlisp'] filenames = ['*.lsp', '*.nl', '*.kif'] mimetypes = ['text/x-newlisp', 'application/x-newlisp'] - flags = re.IGNORECASE | re.MULTILINE | re.UNICODE + flags = re.IGNORECASE | re.MULTILINE # list of built-in functions for newLISP version 10.3 builtins = ( @@ -1512,7 +1619,7 @@ class NewLispLexer(RegexLexer): (r'#.*$', Comment.Single), # whitespace - (r'\s+', Text), + (r'\s+', Whitespace), # strings, symbols and characters (r'"(\\\\|\\[^\\]|[^"\\])*"', String), @@ -2124,7 +2231,7 @@ class EmacsLispLexer(RegexLexer): ], 'body': [ # whitespace - (r'\s+', Text), + (r'\s+', Whitespace), # single-line comment (r';.*$', Comment.Single), @@ -2201,11 +2308,12 @@ class EmacsLispLexer(RegexLexer): class ShenLexer(RegexLexer): """ - Lexer for `Shen <http://shenlanguage.org/>`_ source code. + Lexer for Shen source code. .. 
versionadded:: 2.1 """ name = 'Shen' + url = 'http://shenlanguage.org/' aliases = ['shen'] filenames = ['*.shen'] mimetypes = ['text/x-shen', 'application/x-shen'] @@ -2271,7 +2379,7 @@ class ShenLexer(RegexLexer): 'root': [ (r'(?s)\\\*.*?\*\\', Comment.Multiline), # \* ... *\ (r'\\\\.*', Comment.Single), # \\ ... - (r'\s+', Text), + (r'\s+', Whitespace), (r'_{5,}', Punctuation), (r'={5,}', Punctuation), (r'(;|:=|\||--?>|<--?)', Punctuation), @@ -2293,7 +2401,7 @@ class ShenLexer(RegexLexer): return tokens def _relevant(self, token): - return token not in (Text, Comment.Single, Comment.Multiline) + return token not in (Text, Whitespace, Comment.Single, Comment.Multiline) def _process_declarations(self, tokens): opening_paren = False @@ -2398,7 +2506,7 @@ class CPSALexer(RegexLexer): (r';.*$', Comment.Single), # whitespaces - usually not relevant - (r'\s+', Text), + (r'\s+', Whitespace), # numbers (r'-?\d+\.\d+', Number.Float), @@ -2442,8 +2550,7 @@ class CPSALexer(RegexLexer): class XtlangLexer(RegexLexer): - """An xtlang lexer for the `Extempore programming environment - <http://extempore.moso.com.au>`_. + """An xtlang lexer for the Extempore programming environment. This is a mixture of Scheme and xtlang, really. Keyword lists are taken from the Extempore Emacs mode @@ -2452,6 +2559,7 @@ class XtlangLexer(RegexLexer): .. versionadded:: 2.2 """ name = 'xtlang' + url = 'http://extempore.moso.com.au' aliases = ['extempore'] filenames = ['*.xtm'] mimetypes = [] @@ -2611,7 +2719,7 @@ class XtlangLexer(RegexLexer): (r';.*$', Comment.Single), # whitespaces - usually not relevant - (r'\s+', Text), + (r'\s+', Whitespace), # numbers (r'-?\d+\.\d+', Number.Float), @@ -2646,7 +2754,7 @@ class XtlangLexer(RegexLexer): class FennelLexer(RegexLexer): - """A lexer for the `Fennel programming language <https://fennel-lang.org>`_. + """A lexer for the Fennel programming language. 
Fennel compiles to Lua, so all the Lua builtins are recognized as well as the special forms that are particular to the Fennel compiler. @@ -2654,6 +2762,7 @@ class FennelLexer(RegexLexer): .. versionadded:: 2.3 """ name = 'Fennel' + url = 'https://fennel-lang.org' aliases = ['fennel', 'fnl'] filenames = ['*.fnl'] @@ -2692,7 +2801,8 @@ class FennelLexer(RegexLexer): # the only comment form is a semicolon; goes to the end of the line (r';.*$', Comment.Single), - (r'[,\s]+', Text), + (r',+', Text), + (r'\s+', Whitespace), (r'-?\d+\.\d+', Number.Float), (r'-?\d+', Number.Integer), |