# -*- coding: utf-8 -*-    # This file contains Unicode characters.

from textwrap import dedent

import pytest

from parso.utils import split_lines, parse_version_string
from parso.python.token import PythonTokenTypes
from parso.python import tokenize
from parso import parse
from parso.python.tokenize import PythonToken


# To make it easier to access some of the token types, just put them here.
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
NUMBER = PythonTokenTypes.NUMBER
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
OP = PythonTokenTypes.OP
ENDMARKER = PythonTokenTypes.ENDMARKER
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


def _get_token_list(string, version=None):
    # Use the currently running Python version if none is given.
    version_info = parse_version_string(version)
    return list(tokenize.tokenize(string, version_info=version_info))
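

# A minimal illustrative sketch (this particular test only serves as a usage
# example): every token yielded by `_get_token_list` behaves like a 4-tuple
# of (type, string, start_pos, prefix) and also exposes the attributes
# `.type`, `.string`, `.start_pos`, `.prefix` and `.end_pos`, which is why
# the tests below freely mix tuple unpacking and attribute access.
def test_token_tuple_shape():
    typ, value, start_pos, prefix = _get_token_list('a\n')[0]
    assert typ == NAME
    assert value == 'a'
    assert start_pos == (1, 0)
    assert prefix == ''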


def test_end_pos_one_line():
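    # The string leaf `"huhu"` sits entirely on line 3; its end_pos column is
    # right after the closing quote.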
    parsed = parse(dedent('''
    def testit():
        a = "huhu"
    '''))
    simple_stmt = next(parsed.iter_funcdefs()).get_suite().children[-1]
    string = simple_stmt.children[0].get_rhs()
    assert string.end_pos == (3, 14)


def test_end_pos_multi_line():
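    # The triple-quoted string spans two lines, so its end_pos is on line 4,
    # right after the closing quotes.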
    parsed = parse(dedent('''
    def testit():
        a = """huhu
    asdfasdf""" + "h"
    '''))
    expr_stmt = next(parsed.iter_funcdefs()).get_suite().children[1].children[0]
    string_leaf = expr_stmt.get_rhs().children[0]
    assert string_leaf.end_pos == (4, 11)


def test_simple_no_whitespace():
    # Test a simple one line string, no preceding whitespace
    simple_docstring = '"""simple one line docstring"""'
    token_list = _get_token_list(simple_docstring)
    _, value, _, prefix = token_list[0]
    assert prefix == ''
    assert value == '"""simple one line docstring"""'


def test_simple_with_whitespace():
    # Test a simple one line string with preceding whitespace and newline
    simple_docstring = '  """simple one line docstring""" \r\n'
    token_list = _get_token_list(simple_docstring)
    assert token_list[0][0] == INDENT
    typ, value, start_pos, prefix = token_list[1]
    assert prefix == '  '
    assert value == '"""simple one line docstring"""'
    assert typ == STRING
    typ, value, start_pos, prefix = token_list[2]
    assert prefix == ' '
    assert typ == NEWLINE


def test_function_whitespace():
    # Test function definition whitespace identification
    fundef = dedent('''
    def test_whitespace(*args, **kwargs):
        x = 1
        if x > 0:
            print(True)
    ''')
    token_list = _get_token_list(fundef)
    for _, value, _, prefix in token_list:
        if value == 'test_whitespace':
            assert prefix == ' '
        if value == '(':
            assert prefix == ''
        if value == '*':
            assert prefix == ''
        if value == '**':
            assert prefix == ' '
        if value == 'print':
            assert prefix == '        '
        if value == 'if':
            assert prefix == '    '


def test_tokenize_multiline_I():
    # Make sure a multiline string containing newlines has the end marker on
    # the next line.
    fundef = '''""""\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (2, 0), '')]


def test_tokenize_multiline_II():
    # Make sure a multiline string without newlines has the end marker on the
    # same line.
    fundef = '''""""'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (1, 4), '')]


def test_tokenize_multiline_III():
    # Make sure a multiline string containing newlines has the end marker on
    # the next line, even if there are several newlines.
    fundef = '''""""\n\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (3, 0), '')]


def test_identifier_contains_unicode():
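    # Non-ASCII characters that are valid in identifiers should produce a
    # single NAME token for the function name.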
    fundef = dedent('''
    def 我あφ():
        pass
    ''')
    token_list = _get_token_list(fundef)
    unicode_token = token_list[1]
    assert unicode_token[0] == NAME


def test_quoted_strings():
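    # Each prefixed string literal should survive parsing as a single
    # `string` leaf with the prefix kept in its value.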
    string_tokens = [
        'u"test"',
        'u"""test"""',
        'U"""test"""',
        "u'''test'''",
        "U'''test'''",
    ]

    for s in string_tokens:
        module = parse('''a = %s\n''' % s)
        simple_stmt = module.children[0]
        expr_stmt = simple_stmt.children[0]
        assert len(expr_stmt.children) == 3
        string_tok = expr_stmt.children[2]
        assert string_tok.type == 'string'
        assert string_tok.value == s


def test_ur_literals():
    """
    Decided to parse `u''` literals regardless of Python version. This makes
    probably sense:

    - Python 3+ doesn't support it, but it doesn't hurt
      not be. While this is incorrect, it's just incorrect for one "old" and in
      the future not very important version.
    - All the other Python versions work very well with it.
    """
    def check(literal, is_literal=True):
        token_list = _get_token_list(literal)
        typ, result_literal, _, _ = token_list[0]
        if is_literal:
            if typ != FSTRING_START:
                assert typ == STRING
                assert result_literal == literal
        else:
            assert typ == NAME

    check('u""')
    check('ur""', is_literal=False)
    check('Ur""', is_literal=False)
    check('UR""', is_literal=False)
    check('bR""')
    check('Rb""')

    check('fr""')
    check('rF""')
    check('f""')
    check('F""')


def test_error_literal():
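    # An unterminated string literal becomes an ERRORTOKEN; the surrounding
    # tokens are still produced normally.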
    error_token, newline, endmarker = _get_token_list('"\n')
    assert error_token.type == ERRORTOKEN
    assert error_token.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''

    bracket, error_token, endmarker = _get_token_list('( """')
    assert error_token.type == ERRORTOKEN
    assert error_token.prefix == ' '
    assert error_token.string == '"""'
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''


def test_endmarker_end_pos():
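    # The ENDMARKER must end exactly at the end of the source, whether or not
    # the code ends with a newline.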
    def check(code):
        tokens = _get_token_list(code)
        lines = split_lines(code)
        assert tokens[-1].end_pos == (len(lines), len(lines[-1]))

    check('#c')
    check('#c\n')
    check('a\n')
    check('a')
    check(r'a\\n')
    check('a\\')


@pytest.mark.parametrize(
    ('code', 'types'), [
        # Indentation
        (' foo', [INDENT, NAME, DEDENT]),
        ('  foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('  foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),

        # Name stuff
        ('1foo1', [NUMBER, NAME]),
        ('மெல்லினம்', [NAME]),
        ('²', [ERRORTOKEN]),
        ('ä²ö', [NAME, ERRORTOKEN, NAME]),
        ('ää²¹öö', [NAME, ERRORTOKEN, NAME]),
        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),
        (dedent('''\
            class BaseCache:
                    a
                def
                    b
                def
                    c
            '''), [NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE,
                   ERROR_DEDENT, NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
                   NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT, DEDENT]),
        ('  )\n foo', [INDENT, OP, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('a\n b\n  )\n c', [NAME, NEWLINE, INDENT, NAME, NEWLINE, INDENT, OP,
                            NEWLINE, DEDENT, NAME, DEDENT]),
        (' 1 \\\ndef', [INDENT, NUMBER, NAME, DEDENT]),
    ]
)
def test_token_types(code, types):
    actual_types = [t.type for t in _get_token_list(code)]
    assert actual_types == types + [ENDMARKER]


def test_error_string():
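    # Like test_error_literal, but with leading whitespace: an INDENT comes
    # first and the space ends up in the error token's prefix.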
    indent, t1, newline, token, endmarker = _get_token_list(' "\n')
    assert t1.type == ERRORTOKEN
    assert t1.prefix == ' '
    assert t1.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.prefix == ''
    assert endmarker.string == ''


def test_indent_error_recovery():
    code = dedent("""\
                        str(
        from x import a
        def
        """)
    lst = _get_token_list(code)
    expected = [
        # `str(`
        INDENT, NAME, OP,
        # `from x`
        NAME, NAME,
        # `import a` on the same line as the previous `from x`
        NAME, NAME, NEWLINE,
        # A dedent happens, because there's an import now and the import
        # statement "breaks" out of the opening paren on the first line.
        DEDENT,
        # `def`
        NAME, NEWLINE, ENDMARKER]
    assert [t.type for t in lst] == expected


def test_error_token_after_dedent():
    code = dedent("""\
        class C:
            pass
        $foo
        """)
    lst = _get_token_list(code)
    expected = [
        NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
        # $foo\n
        ERRORTOKEN, NAME, NEWLINE, ENDMARKER
    ]
    assert [t.type for t in lst] == expected


def test_brackets_no_indentation():
    """
    There used to be an issue that the parentheses counting would go below
    zero. This should not happen.
    """
    code = dedent("""\
        }
        {
          }
        """)
    lst = _get_token_list(code)
    assert [t.type for t in lst] == [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER]


def test_form_feed():
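    # The form feed counts as indentation whitespace: it triggers an INDENT
    # and ends up in the following token's prefix.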
    indent, error_token, dedent_, endmarker = _get_token_list(dedent('''\
        \f"""'''))
    assert error_token.prefix == '\f'
    assert error_token.string == '"""'
    assert endmarker.prefix == ''
    assert indent.type == INDENT
    assert dedent_.type == DEDENT


def test_carriage_return():
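    # A lone `\r` after the backslash still works as a line continuation, so
    # no NEWLINE token is emitted between `=` and `class`.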
    lst = _get_token_list(' =\\\rclass')
    assert [t.type for t in lst] == [INDENT, OP, NAME, DEDENT, ENDMARKER]


def test_backslash():
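    # A backslash continuation followed by nothing but a comment line
    # produces no tokens at all; the whole source ends up in the ENDMARKER's
    # prefix.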
    code = '\\\n# 1 \n'
    endmarker, = _get_token_list(code)
    assert endmarker.prefix == code


@pytest.mark.parametrize(
    ('code', 'types'), [
        # f-strings
        ('f"', [FSTRING_START]),
        ('f""', [FSTRING_START, FSTRING_END]),
        ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
        ('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP]),
        (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),

        # format spec
        (r'f"Some {x:.2f}{y}"', [FSTRING_START, FSTRING_STRING, OP, NAME, OP,
                                 FSTRING_STRING, OP, OP, NAME, OP, FSTRING_END]),

        # multiline f-string
        ('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        ('f"""abc{\n123}def"""', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside of an fstring_string
        ('f"abc\\\ndef"', [
            FSTRING_START, FSTRING_STRING, FSTRING_END
        ]),
        ('f"\\\n{123}\\\n"', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside of an fstring_expr
        ('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END]),

        # a line continuation inside a format spec
        ('f"{123:.2\\\nf}"', [
            FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END
        ]),

        # a newline without a line continuation inside a single-line string is
        # wrong, and will generate an ERRORTOKEN
        ('f"abc\ndef"', [
            FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN
        ]),

        # a more complex example
        (r'print(f"Some {x:.2f}a{y}")', [
            NAME, OP, FSTRING_START, FSTRING_STRING, OP, NAME, OP,
            FSTRING_STRING, OP, FSTRING_STRING, OP, NAME, OP, FSTRING_END, OP
        ]),
        # issue #86, a string-like in an f-string expression
        ('f"{ ""}"', [
            FSTRING_START, OP, FSTRING_END, STRING
        ]),
        ('f"{ f""}"', [
            FSTRING_START, OP, NAME, FSTRING_END, STRING
        ]),
    ]
)
def test_fstring_token_types(code, types, each_version):
    actual_types = [t.type for t in _get_token_list(code, each_version)]
    assert types + [ENDMARKER] == actual_types


@pytest.mark.parametrize(
    ('code', 'types'), [
        # issue #87: when `:=` is not wrapped in parentheses, the `:` should
        # be tokenized as a format spec marker and the rest as part of the
        # format spec
        ('f"{x:=10}"', [
            FSTRING_START, OP, NAME, OP, FSTRING_STRING, OP, FSTRING_END
        ]),
        ('f"{(x:=10)}"', [
            FSTRING_START, OP, OP, NAME, OP, NUMBER, OP, OP, FSTRING_END
        ]),
    ]
)
def test_fstring_assignment_expression(code, types, version_ge_py38):
    actual_types = [t.type for t in _get_token_list(code, version_ge_py38)]
    assert types + [ENDMARKER] == actual_types


def test_fstring_end_error_pos(version_ge_py38):
    f_start, f_string, bracket, f_end, endmarker = \
        _get_token_list('f" { "', version_ge_py38)
    assert f_start.start_pos == (1, 0)
    assert f_string.start_pos == (1, 2)
    assert bracket.start_pos == (1, 3)
    assert f_end.start_pos == (1, 5)
    assert endmarker.start_pos == (1, 6)