# -*- coding: utf-8 -*- 
""" 
    pygments.lexers.special 
    ~~~~~~~~~~~~~~~~~~~~~~~ 
 
    Special lexers. 
 
    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details. 
""" 
 
import re 
 
from pygments.lexer import Lexer 
from pygments.token import Token, Error, Text 
from pygments.util import get_choice_opt, text_type, BytesIO 
 
 
__all__ = ['TextLexer', 'RawTokenLexer'] 
 
 
class TextLexer(Lexer): 
    """ 
    "Null" lexer, doesn't highlight anything. 
    """ 
    name = 'Text only' 
    aliases = ['text'] 
    filenames = ['*.txt'] 
    mimetypes = ['text/plain'] 
    priority = 0.01
 
    def get_tokens_unprocessed(self, text): 
        yield 0, Text, text 
 
    def analyse_text(text):
        # note: defined without ``self`` on purpose; Pygments' LexerMeta
        # wraps ``analyse_text`` definitions into static analysator functions
        return TextLexer.priority
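
# Usage sketch (illustrative only; assumes the public ``pygments.highlight``
# API and ``NullFormatter``, both outside this module):
#
#     from pygments import highlight
#     from pygments.formatters import NullFormatter
#     print(highlight(u'no markup here', TextLexer(), NullFormatter()))
#
# TextLexer yields the entire input as one Text token, so the output is the
# input essentially unchanged.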
 

_ttype_cache = {}  # maps dotted token type names to token type objects
 
line_re = re.compile(b'.*?\n') 
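
# For reference, the stream parsed by RawTokenLexer below is the line-based
# format emitted by ``pygments.formatters.RawTokenFormatter``: one token per
# line, the token type's dotted name, a tab, and the repr() of the token
# value.  A sketch of what such lines look like (not verbatim output):
#
#     Token.Keyword\tu'def'
#     Token.Name.Function\tu'foo'
#     Token.Text\tu'\n'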
 
 
class RawTokenLexer(Lexer): 
    """ 
    Recreate a token stream formatted with the `RawTokenFormatter`.  This 
    lexer raises exceptions during parsing if the token stream in the 
    file is malformed. 
 
    Additional options accepted: 
 
    `compress` 
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with 
        the given compression algorithm before lexing (default: ``""``). 
    """ 
    name = 'Raw token data' 
    aliases = ['raw'] 
    filenames = [] 
    mimetypes = ['application/x-pygments-tokens'] 
 
    def __init__(self, **options): 
        self.compress = get_choice_opt(options, 'compress', 
                                       ['', 'none', 'gz', 'bz2'], '') 
        Lexer.__init__(self, **options) 
 
    def get_tokens(self, text): 
        if isinstance(text, text_type): 
            # raw token stream never has any non-ASCII characters 
            text = text.encode('ascii') 
        if self.compress == 'gz': 
            import gzip 
            gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text)) 
            text = gzipfile.read() 
        elif self.compress == 'bz2': 
            import bz2 
            text = bz2.decompress(text) 
 
        # do not call Lexer.get_tokens() because we do not want Unicode 
        # decoding to occur, and stripping is not optional. 
        text = text.strip(b'\n') + b'\n' 
        for i, t, v in self.get_tokens_unprocessed(text): 
            yield t, v 
 
    def get_tokens_unprocessed(self, text):
        length = 0
        for match in line_re.finditer(text):
            try:
                ttypestr, val = match.group().split(b'\t', 1)
            except ValueError:
                # no tab separator -- not a valid raw token line; emit the
                # whole line as an Error token
                val = match.group().decode('ascii', 'replace')
                ttype = Error
            else:
                ttype = _ttype_cache.get(ttypestr)
                if not ttype:
                    # resolve e.g. 'Token.Name.Function' by walking
                    # attribute accesses down from Token
                    ttype = Token
                    ttypes = ttypestr.split('.')[1:]
                    for ttype_ in ttypes:
                        if not ttype_ or not ttype_[0].isupper():
                            raise ValueError('malformed token name')
                        ttype = getattr(ttype, ttype_)
                    _ttype_cache[ttypestr] = ttype
                # strip the u'...' repr delimiters and the trailing newline,
                # then undo repr()'s escaping
                val = val[2:-2].decode('unicode-escape')
            yield length, ttype, val
            length += len(val)
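

if __name__ == '__main__':
    # Round-trip sketch (illustrative only; assumes ``RawTokenFormatter``
    # and ``PythonLexer`` from the wider Pygments distribution): format some
    # source as a raw token stream, then recover the tokens with
    # RawTokenLexer.
    from pygments import highlight
    from pygments.formatters import RawTokenFormatter
    from pygments.lexers import PythonLexer

    raw = highlight(u'print(1 + 1)\n', PythonLexer(), RawTokenFormatter())
    for ttype, value in RawTokenLexer().get_tokens(raw):
        print('%s\t%r' % (ttype, value))

    # The ``compress`` options pair up: a stream written with
    # RawTokenFormatter(compress='gz') is read back with
    # RawTokenLexer(compress='gz').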