aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/python/Pygments/py3/pygments/regexopt.py
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/python/Pygments/py3/pygments/regexopt.py
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/python/Pygments/py3/pygments/regexopt.py')
-rw-r--r--contrib/python/Pygments/py3/pygments/regexopt.py91
1 files changed, 91 insertions, 0 deletions
diff --git a/contrib/python/Pygments/py3/pygments/regexopt.py b/contrib/python/Pygments/py3/pygments/regexopt.py
new file mode 100644
index 0000000000..cb2c8e21a9
--- /dev/null
+++ b/contrib/python/Pygments/py3/pygments/regexopt.py
@@ -0,0 +1,91 @@
+"""
+ pygments.regexopt
+ ~~~~~~~~~~~~~~~~~
+
+ An algorithm that generates optimized regexes for matching long lists of
+ literal strings.
+
+ :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
+ :license: BSD, see LICENSE for details.
+"""
+
+import re
+from re import escape
+from os.path import commonprefix
+from itertools import groupby
+from operator import itemgetter
+
+CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
+FIRST_ELEMENT = itemgetter(0)
+
+
+def make_charset(letters):
+ return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'
+
+
+def regex_opt_inner(strings, open_paren):
+ """Return a regex that matches any string in the sorted list of strings."""
+ close_paren = open_paren and ')' or ''
+ # print strings, repr(open_paren)
+ if not strings:
+ # print '-> nothing left'
+ return ''
+ first = strings[0]
+ if len(strings) == 1:
+ # print '-> only 1 string'
+ return open_paren + escape(first) + close_paren
+ if not first:
+ # print '-> first string empty'
+ return open_paren + regex_opt_inner(strings[1:], '(?:') \
+ + '?' + close_paren
+ if len(first) == 1:
+ # multiple one-char strings? make a charset
+ oneletter = []
+ rest = []
+ for s in strings:
+ if len(s) == 1:
+ oneletter.append(s)
+ else:
+ rest.append(s)
+ if len(oneletter) > 1: # do we have more than one oneletter string?
+ if rest:
+ # print '-> 1-character + rest'
+ return open_paren + regex_opt_inner(rest, '') + '|' \
+ + make_charset(oneletter) + close_paren
+ # print '-> only 1-character'
+ return open_paren + make_charset(oneletter) + close_paren
+ prefix = commonprefix(strings)
+ if prefix:
+ plen = len(prefix)
+ # we have a prefix for all strings
+ # print '-> prefix:', prefix
+ return open_paren + escape(prefix) \
+ + regex_opt_inner([s[plen:] for s in strings], '(?:') \
+ + close_paren
+ # is there a suffix?
+ strings_rev = [s[::-1] for s in strings]
+ suffix = commonprefix(strings_rev)
+ if suffix:
+ slen = len(suffix)
+ # print '-> suffix:', suffix[::-1]
+ return open_paren \
+ + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
+ + escape(suffix[::-1]) + close_paren
+ # recurse on common 1-string prefixes
+ # print '-> last resort'
+ return open_paren + \
+ '|'.join(regex_opt_inner(list(group[1]), '')
+ for group in groupby(strings, lambda s: s[0] == first[0])) \
+ + close_paren
+
+
+def regex_opt(strings, prefix='', suffix=''):
+ """Return a compiled regex that matches any string in the given list.
+
+ The strings to match must be literal strings, not regexes. They will be
+ regex-escaped.
+
+ *prefix* and *suffix* are pre- and appended to the final regex.
+ """
+ strings = sorted(strings)
+ return prefix + regex_opt_inner(strings, '(') + suffix