"""Patched version of standard library tokenize, to deal with various bugs. 
 
Patches 
 
- Relevant parts of Gareth Rees' patch for Python issue #12691 (untokenizing), 
  manually applied. 
- Newlines in comments and blank lines should be either NL or NEWLINE, depending 
  on whether they are in a multi-line statement. Filed as Python issue #17061. 
 
------------------------------------------------------------------------------- 
Tokenization help for Python programs. 
 
generate_tokens(readline) is a generator that breaks a stream of 
text into Python tokens.  It accepts a readline-like method which is called 
repeatedly to get the next line of input (or "" for EOF).  It generates 
5-tuples with these members: 
 
    the token type (see token.py) 
    the token (a string) 
    the starting (row, column) indices of the token (a 2-tuple of ints) 
    the ending (row, column) indices of the token (a 2-tuple of ints) 
    the original line (string) 
 
It is designed to match the working of the Python tokenizer exactly, except 
that it produces COMMENT tokens for comments and gives type OP for all 
operators.
 
Older entry points 
    tokenize_loop(readline, tokeneater) 
    tokenize(readline, tokeneater=printtoken) 
are the same, except instead of generating tokens, tokeneater is a callback 
function to which the 5 fields described above are passed as 5 arguments, 
each time a new token is found.""" 
from __future__ import print_function 
 
__author__ = 'Ka-Ping Yee <ping@lfw.org>' 
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 
               'Skip Montanaro, Raymond Hettinger') 
 
import string, re 
from token import * 
 
import token 
__all__ = [x for x in dir(token) if not x.startswith("_")] 
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"] 
del x 
del token 
 
__all__ += ["TokenError"] 
 
COMMENT = N_TOKENS 
tok_name[COMMENT] = 'COMMENT' 
NL = N_TOKENS + 1 
tok_name[NL] = 'NL' 
N_TOKENS += 2 
 
def group(*choices): return '(' + '|'.join(choices) + ')' 
def any(*choices): return group(*choices) + '*' 
def maybe(*choices): return group(*choices) + '?' 
 
Whitespace = r'[ \f\t]*' 
Comment = r'#[^\r\n]*' 
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 
Name = r'[a-zA-Z_]\w*' 
 
Hexnumber = r'0[xX][\da-fA-F]+[lL]?' 
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?' 
Binnumber = r'0[bB][01]+[lL]?' 
Decnumber = r'[1-9]\d*[lL]?' 
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) 
Exponent = r'[eE][-+]?\d+' 
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) 
Expfloat = r'\d+' + Exponent 
Floatnumber = group(Pointfloat, Expfloat) 
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') 
Number = group(Imagnumber, Floatnumber, Intnumber) 
 
# Tail end of ' string. 
Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 
# Tail end of " string. 
Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 
# Tail end of ''' string. 
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 
# Tail end of """ string. 
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""') 
# Single-line ' or " string. 
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 
 
# Because of leftmost-then-longest match semantics, be sure to put the 
# longest operators first (e.g., if = came before ==, == would get 
# recognized as two instances of =). 
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", 
                 r"//=?", 
                 r"[+\-*/%&|^=<>]=?", 
                 r"~") 
 
Bracket = '[][(){}]' 
Special = group(r'\r?\n', r'[:;.,`@]') 
Funny = group(Operator, Bracket, Special) 
 
PlainToken = group(Number, Funny, String, Name) 
Token = Ignore + PlainToken 
 
# First (or only) line of ' or " string. 
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + 
                group("'", r'\\\r?\n'), 
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + 
                group('"', r'\\\r?\n')) 
PseudoExtras = group(r'\\\r?\n', Comment, Triple) 
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) 
 
tokenprog, pseudoprog, single3prog, double3prog = map( 
    re.compile, (Token, PseudoToken, Single3, Double3)) 
endprogs = {"'": re.compile(Single), '"': re.compile(Double), 
            "'''": single3prog, '"""': double3prog, 
            "r'''": single3prog, 'r"""': double3prog, 
            "u'''": single3prog, 'u"""': double3prog, 
            "ur'''": single3prog, 'ur"""': double3prog, 
            "R'''": single3prog, 'R"""': double3prog, 
            "U'''": single3prog, 'U"""': double3prog, 
            "uR'''": single3prog, 'uR"""': double3prog, 
            "Ur'''": single3prog, 'Ur"""': double3prog, 
            "UR'''": single3prog, 'UR"""': double3prog, 
            "b'''": single3prog, 'b"""': double3prog, 
            "br'''": single3prog, 'br"""': double3prog, 
            "B'''": single3prog, 'B"""': double3prog, 
            "bR'''": single3prog, 'bR"""': double3prog, 
            "Br'''": single3prog, 'Br"""': double3prog, 
            "BR'''": single3prog, 'BR"""': double3prog, 
            'r': None, 'R': None, 'u': None, 'U': None, 
            'b': None, 'B': None} 
 
triple_quoted = {} 
for t in ("'''", '"""', 
          "r'''", 'r"""', "R'''", 'R"""', 
          "u'''", 'u"""', "U'''", 'U"""', 
          "ur'''", 'ur"""', "Ur'''", 'Ur"""', 
          "uR'''", 'uR"""', "UR'''", 'UR"""', 
          "b'''", 'b"""', "B'''", 'B"""', 
          "br'''", 'br"""', "Br'''", 'Br"""', 
          "bR'''", 'bR"""', "BR'''", 'BR"""'): 
    triple_quoted[t] = t 
single_quoted = {} 
for t in ("'", '"', 
          "r'", 'r"', "R'", 'R"', 
          "u'", 'u"', "U'", 'U"', 
          "ur'", 'ur"', "Ur'", 'Ur"', 
          "uR'", 'uR"', "UR'", 'UR"', 
          "b'", 'b"', "B'", 'B"', 
          "br'", 'br"', "Br'", 'Br"', 
          "bR'", 'bR"', "BR'", 'BR"' ): 
    single_quoted[t] = t 
 
tabsize = 8 
 
class TokenError(Exception): pass 
 
class StopTokenizing(Exception): pass 
 
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing 
    srow, scol = srow_scol 
    erow, ecol = erow_ecol 
    print("%d,%d-%d,%d:\t%s\t%s" % \ 
        (srow, scol, erow, ecol, tok_name[type], repr(token))) 
 
def tokenize(readline, tokeneater=printtoken): 
    """ 
    The tokenize() function accepts two parameters: one representing the 
    input stream, and one providing an output mechanism for tokenize(). 
 
    The first parameter, readline, must be a callable object which provides 
    the same interface as the readline() method of built-in file objects. 
    Each call to the function should return one line of input as a string. 
 
    The second parameter, tokeneater, must also be a callable object. It is 
    called once for each token, with five arguments, corresponding to the 
    tuples generated by generate_tokens(). 
    """ 
    try: 
        tokenize_loop(readline, tokeneater) 
    except StopTokenizing: 
        pass 
 
# backwards compatible interface 
def tokenize_loop(readline, tokeneater): 
    for token_info in generate_tokens(readline): 
        tokeneater(*token_info) 
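
# Usage sketch (illustrative only; this helper is an addition and is never
# called by the module): collecting tokens through the tokeneater-callback
# interface described in the tokenize() docstring above.
def _example_tokenize_callback(source):
    """Return (token name, token string) pairs seen by a tokeneater callback."""
    seen = []
    def eater(type, token, start, end, line):
        seen.append((tok_name[type], token))
    tokenize(iter(source.splitlines(True)).next, eater)
    return seen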
 
class Untokenizer: 
 
    def __init__(self): 
        self.tokens = [] 
        self.prev_row = 1 
        self.prev_col = 0 
 
    def add_whitespace(self, tok_type, start):
        row, col = start 
        assert row >= self.prev_row 
        col_offset = col - self.prev_col 
        if col_offset > 0: 
            self.tokens.append(" " * col_offset) 
        elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER): 
            # Line was backslash-continued 
            self.tokens.append(" ") 
 
    def untokenize(self, tokens): 
        iterable = iter(tokens) 
        for t in iterable: 
            if len(t) == 2: 
                self.compat(t, iterable) 
                break 
            tok_type, token, start, end = t[:4] 
            self.add_whitespace(tok_type, start)
            self.tokens.append(token) 
            self.prev_row, self.prev_col = end 
            if tok_type in (NEWLINE, NL): 
                self.prev_row += 1 
                self.prev_col = 0 
        return "".join(self.tokens) 
 
    def compat(self, token, iterable): 
        # This import is here to avoid problems when the itertools 
        # module is not built yet and tokenize is imported. 
        from itertools import chain 
        startline = False 
        prevstring = False 
        indents = [] 
        toks_append = self.tokens.append 
        for tok in chain([token], iterable): 
            toknum, tokval = tok[:2] 
 
            if toknum in (NAME, NUMBER): 
                tokval += ' ' 
 
            # Insert a space between two consecutive strings 
            if toknum == STRING: 
                if prevstring: 
                    tokval = ' ' + tokval 
                prevstring = True 
            else: 
                prevstring = False 
 
            if toknum == INDENT: 
                indents.append(tokval) 
                continue 
            elif toknum == DEDENT: 
                indents.pop() 
                continue 
            elif toknum in (NEWLINE, NL): 
                startline = True 
            elif startline and indents: 
                toks_append(indents[-1]) 
                startline = False 
            toks_append(tokval) 
 
def untokenize(iterable): 
    """Transform tokens back into Python source code. 
 
    Each element returned by the iterable must be a token sequence 
    with at least two elements, a token number and token value.  If 
    only two tokens are passed, the resulting output is poor. 
 
    Round-trip invariant for full input: 
        Untokenized source will match input source exactly 
 
    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)] 
        newcode = untokenize(t1) 
        readline = iter(newcode.splitlines(1)).next 
        t2 = [tok[:2] for tok in generate_tokens(readline)] 
        assert t1 == t2 
    """ 
    ut = Untokenizer() 
    return ut.untokenize(iterable) 
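
# Round-trip sketch (illustrative only; this helper is an addition and is never
# called by the module): exercising the limited-input invariant from the
# untokenize() docstring on an in-memory source string.
def _example_untokenize_roundtrip(source):
    """Return True if 2-tuple tokens survive an untokenize/re-tokenize cycle."""
    t1 = [tok[:2] for tok in generate_tokens(iter(source.splitlines(True)).next)]
    newcode = untokenize(t1)
    t2 = [tok[:2] for tok in generate_tokens(iter(newcode.splitlines(True)).next)]
    return t1 == t2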
 
def generate_tokens(readline): 
    """ 
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the 
    readline() method of built-in file objects. Each call to the function 
    should return one line of input as a string.  Alternately, readline 
    can be a callable function terminating with StopIteration: 
        readline = open(myfile).next    # Example of alternate readline 
 
    The generator produces 5-tuples with these members: the token type; the 
    token string; a 2-tuple (srow, scol) of ints specifying the row and 
    column where the token begins in the source; a 2-tuple (erow, ecol) of 
    ints specifying the row and column where the token ends in the source; 
    and the line on which the token was found. The line passed is the 
    logical line; continuation lines are included. 
    """ 
    lnum = parenlev = continued = 0 
    namechars, numchars = string.ascii_letters + '_', '0123456789' 
    contstr, needcont = '', 0 
    contline = None 
    indents = [0] 
 
    while 1:                                   # loop over lines in stream 
        try: 
            line = readline() 
        except StopIteration: 
            line = '' 
        lnum += 1 
        pos, max = 0, len(line) 
 
        if contstr:                            # continued string 
            if not line: 
                raise TokenError("EOF in multi-line string", strstart) 
            endmatch = endprog.match(line) 
            if endmatch: 
                pos = end = endmatch.end(0) 
                yield (STRING, contstr + line[:end], 
                       strstart, (lnum, end), contline + line) 
                contstr, needcont = '', 0 
                contline = None 
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 
                yield (ERRORTOKEN, contstr + line, 
                           strstart, (lnum, len(line)), contline) 
                contstr = '' 
                contline = None 
                continue 
            else: 
                contstr = contstr + line 
                contline = contline + line 
                continue 
 
        elif parenlev == 0 and not continued:  # new statement 
            if not line: break 
            column = 0 
            while pos < max:                   # measure leading whitespace 
                if line[pos] == ' ': 
                    column += 1 
                elif line[pos] == '\t': 
                    column = (column//tabsize + 1)*tabsize 
                elif line[pos] == '\f': 
                    column = 0 
                else: 
                    break 
                pos += 1 
            if pos == max: 
                break 
 
            if line[pos] in '#\r\n':           # skip comments or blank lines 
                if line[pos] == '#': 
                    comment_token = line[pos:].rstrip('\r\n') 
                    nl_pos = pos + len(comment_token) 
                    yield (COMMENT, comment_token, 
                           (lnum, pos), (lnum, pos + len(comment_token)), line) 
                    yield (NEWLINE, line[nl_pos:], 
                           (lnum, nl_pos), (lnum, len(line)), line) 
                else: 
                    yield (NEWLINE, line[pos:], 
                           (lnum, pos), (lnum, len(line)), line) 
                continue 
 
            if column > indents[-1]:           # count indents or dedents 
                indents.append(column) 
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 
            while column < indents[-1]: 
                if column not in indents: 
                    raise IndentationError( 
                        "unindent does not match any outer indentation level", 
                        ("<tokenize>", lnum, pos, line)) 
                indents = indents[:-1] 
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line) 
 
        else:                                  # continued statement 
            if not line: 
                raise TokenError("EOF in multi-line statement", (lnum, 0)) 
            continued = 0 
 
        while pos < max: 
            pseudomatch = pseudoprog.match(line, pos) 
            if pseudomatch:                                # scan for tokens 
                start, end = pseudomatch.span(1) 
                spos, epos, pos = (lnum, start), (lnum, end), end 
                token, initial = line[start:end], line[start] 
 
                if initial in numchars or \ 
                   (initial == '.' and token != '.'):      # ordinary number 
                    yield (NUMBER, token, spos, epos, line) 
                elif initial in '\r\n': 
                    yield (NL if parenlev > 0 else NEWLINE, 
                           token, spos, epos, line) 
                elif initial == '#': 
                    assert not token.endswith("\n") 
                    yield (COMMENT, token, spos, epos, line) 
                elif token in triple_quoted: 
                    endprog = endprogs[token] 
                    endmatch = endprog.match(line, pos) 
                    if endmatch:                           # all on one line 
                        pos = endmatch.end(0) 
                        token = line[start:pos] 
                        yield (STRING, token, spos, (lnum, pos), line) 
                    else: 
                        strstart = (lnum, start)           # multiple lines 
                        contstr = line[start:] 
                        contline = line 
                        break 
                elif initial in single_quoted or \ 
                    token[:2] in single_quoted or \ 
                    token[:3] in single_quoted: 
                    if token[-1] == '\n':                  # continued string 
                        strstart = (lnum, start) 
                        endprog = (endprogs[initial] or endprogs[token[1]] or 
                                   endprogs[token[2]]) 
                        contstr, needcont = line[start:], 1 
                        contline = line 
                        break 
                    else:                                  # ordinary string 
                        yield (STRING, token, spos, epos, line) 
                elif initial in namechars:                 # ordinary name 
                    yield (NAME, token, spos, epos, line) 
                elif initial == '\\':                      # continued stmt 
                    continued = 1 
                else: 
                    if initial in '([{': 
                        parenlev += 1 
                    elif initial in ')]}': 
                        parenlev -= 1 
                    yield (OP, token, spos, epos, line) 
            else: 
                yield (ERRORTOKEN, line[pos], 
                           (lnum, pos), (lnum, pos+1), line) 
                pos += 1 
 
    for indent in indents[1:]:                 # pop remaining indent levels 
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '') 
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') 
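
# Usage sketch (illustrative only; this helper is an addition and is never
# called by the module): iterating over the 5-tuples that generate_tokens()
# yields for a small in-memory source string.
def _example_generate_tokens(source="x = (1 +\n     2)\n"):
    """Print each token as (name, string, start, end)."""
    readline = iter(source.splitlines(True)).next
    for tok_type, tok_string, start, end, logical_line in generate_tokens(readline):
        print(tok_name[tok_type], repr(tok_string), start, end)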
 
if __name__ == '__main__':                     # testing 
    import sys 
    if len(sys.argv) > 1: 
        tokenize(open(sys.argv[1]).readline) 
    else: 
        tokenize(sys.stdin.readline)