contrib/python/Pygments/py3/pygments/regexopt.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91

""" 
    pygments.regexopt 
    ~~~~~~~~~~~~~~~~~ 
 
    An algorithm that generates optimized regexes for matching long lists of 
    literal strings. 
 
    :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details. 
""" 
 
import re 
from re import escape 
from os.path import commonprefix 
from itertools import groupby 
from operator import itemgetter 
 
CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
FIRST_ELEMENT = itemgetter(0) 
 
 
def make_charset(letters): 
    return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' 
 
 
def regex_opt_inner(strings, open_paren): 
    """Return a regex that matches any string in the sorted list of strings.""" 
    close_paren = open_paren and ')' or '' 
    # print strings, repr(open_paren) 
    if not strings: 
        # print '-> nothing left' 
        return '' 
    first = strings[0] 
    if len(strings) == 1: 
        # print '-> only 1 string' 
        return open_paren + escape(first) + close_paren 
    if not first: 
        # print '-> first string empty' 
        return open_paren + regex_opt_inner(strings[1:], '(?:') \ 
            + '?' + close_paren 
    if len(first) == 1: 
        # multiple one-char strings? make a charset 
        oneletter = [] 
        rest = [] 
        for s in strings: 
            if len(s) == 1: 
                oneletter.append(s) 
            else: 
                rest.append(s) 
        if len(oneletter) > 1:  # do we have more than one oneletter string? 
            if rest: 
                # print '-> 1-character + rest' 
                return open_paren + regex_opt_inner(rest, '') + '|' \ 
                    + make_charset(oneletter) + close_paren 
            # print '-> only 1-character' 
            return open_paren + make_charset(oneletter) + close_paren
    prefix = commonprefix(strings) 
    if prefix: 
        plen = len(prefix) 
        # we have a prefix for all strings 
        # print '-> prefix:', prefix 
        return open_paren + escape(prefix) \ 
            + regex_opt_inner([s[plen:] for s in strings], '(?:') \ 
            + close_paren 
    # is there a suffix? 
    strings_rev = [s[::-1] for s in strings] 
    suffix = commonprefix(strings_rev) 
    if suffix: 
        slen = len(suffix) 
        # print '-> suffix:', suffix[::-1] 
        return open_paren \ 
            + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ 
            + escape(suffix[::-1]) + close_paren 
    # recurse on common 1-string prefixes 
    # print '-> last resort' 
    return open_paren + \ 
        '|'.join(regex_opt_inner(list(group[1]), '') 
                 for group in groupby(strings, lambda s: s[0] == first[0])) \ 
        + close_paren 
 
 
def regex_opt(strings, prefix='', suffix=''): 
    """Return a compiled regex that matches any string in the given list. 
 
    The strings to match must be literal strings, not regexes.  They will be 
    regex-escaped. 
 
    *prefix* and *suffix* are pre- and appended to the final regex. 
    """ 
    strings = sorted(strings) 
    return prefix + regex_opt_inner(strings, '(') + suffix