aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/python/pyre2/py3/src
diff options
context:
space:
mode:
authorrobot-piglet <robot-piglet@yandex-team.com>2023-12-02 01:45:21 +0300
committerrobot-piglet <robot-piglet@yandex-team.com>2023-12-02 02:42:50 +0300
commit9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch)
tree9f88a486917d371d099cd712efd91b4c122d209d /contrib/python/pyre2/py3/src
parent32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff)
downloadydb-9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c.tar.gz
Intermediate changes
Diffstat (limited to 'contrib/python/pyre2/py3/src')
-rw-r--r--contrib/python/pyre2/py3/src/_re2macros.h13
-rw-r--r--contrib/python/pyre2/py3/src/compile.pxi234
-rw-r--r--contrib/python/pyre2/py3/src/includes.pxi109
-rw-r--r--contrib/python/pyre2/py3/src/match.pxi280
-rw-r--r--contrib/python/pyre2/py3/src/pattern.pxi650
-rw-r--r--contrib/python/pyre2/py3/src/re2.pyx458
6 files changed, 1744 insertions, 0 deletions
diff --git a/contrib/python/pyre2/py3/src/_re2macros.h b/contrib/python/pyre2/py3/src/_re2macros.h
new file mode 100644
index 0000000000..b9ac82af6b
--- /dev/null
+++ b/contrib/python/pyre2/py3/src/_re2macros.h
@@ -0,0 +1,13 @@
+#ifndef __RE2MACROS_H
+#define __RE2MACROS_H
+
+#include <stdio.h>
+#include "re2/stringpiece.h"
+
+/* Allocate an array of n default-constructed re2::StringPiece objects.
+ * The array comes from new[], so the caller owns it and must release it
+ * with delete[]. */
+static inline re2::StringPiece * new_StringPiece_array(int n)
+{
+    return new re2::StringPiece[n];
+}
+
+#endif
diff --git a/contrib/python/pyre2/py3/src/compile.pxi b/contrib/python/pyre2/py3/src/compile.pxi
new file mode 100644
index 0000000000..887a2778cd
--- /dev/null
+++ b/contrib/python/pyre2/py3/src/compile.pxi
@@ -0,0 +1,234 @@
+
+def compile(pattern, int flags=0, int max_mem=8388608):
+    """Return a compiled pattern object for ``pattern``, using a cache.
+
+    Results are memoized in the module-level ``_cache``.  The key is made of
+    the pattern's type, the pattern itself, ``flags`` and ``max_mem``.
+    ``max_mem`` belongs in the key because ``_compile`` passes it to RE2 as
+    the memory budget: without it, a result produced under one budget
+    (including a fallback pattern created because the budget was too small)
+    would be handed back for a request made with a different budget.
+
+    When the cache already holds ``_MAXCACHE`` entries, one entry is dropped
+    with ``_cache.popitem()`` before the new result is stored.
+    """
+    cachekey = (type(pattern), pattern, flags, max_mem)
+    if cachekey in _cache:
+        return _cache[cachekey]
+    p = _compile(pattern, flags, max_mem)
+
+    if len(_cache) >= _MAXCACHE:
+        _cache.popitem()
+    _cache[cachekey] = p
+    return p
+
+
+def _compile(object pattern, int flags=0, int max_mem=8388608):
+    """Compile a regular expression pattern, returning a pattern object.
+
+    An already compiled ``Pattern`` or ``SREPattern`` is returned unchanged
+    (``ValueError`` is raised if ``flags`` is non-zero).  Otherwise the
+    pattern is converted to bytes, rewritten by ``_prepare_pattern`` and
+    compiled by RE2 with ``max_mem`` as its memory budget.
+
+    Patterns that cannot be handled here go through ``fallback``, which
+    raises, warns or silently builds a ``PythonRePattern`` depending on
+    ``current_notification``.  This applies to ``re.LOCALE``,
+    backreferences, ``\\W``/``\\S`` inside a character class, and the RE2
+    error codes ``ErrorBadPerlOp``, ``ErrorRepeatSize`` and
+    ``ErrorPatternTooLarge``.  Any other RE2 error raises ``RegexError``.
+    """
+    def fallback(pattern, flags, error_msg):
+        """Raise error, warn, or simply return fallback from re module."""
+        if current_notification == FALLBACK_EXCEPTION:
+            raise RegexError(error_msg)
+        elif current_notification == FALLBACK_WARNING:
+            warnings.warn("WARNING: Using re module. Reason: %s" % error_msg)
+        try:
+            result = PythonRePattern(pattern, flags)
+        except re.error as err:
+            raise RegexError(*err.args)
+        return result
+
+    cdef StringPiece * s
+    cdef Options opts
+    cdef RE2 * re_pattern
+    cdef int error_code
+    cdef int encoded = 0
+    cdef object original_pattern
+    cdef Pattern pypattern
+    cdef map[cpp_string, int] named_groups
+
+    if isinstance(pattern, (Pattern, SREPattern)):
+        if flags:
+            raise ValueError(
+                    'Cannot process flags argument with a compiled pattern')
+        return pattern
+
+    original_pattern = pattern
+    if flags & _L:
+        return fallback(original_pattern, flags, "re.LOCALE not supported")
+    pattern = unicode_to_bytes(pattern, &encoded, -1)
+    newflags = flags
+    if not PY2:
+        if not encoded and flags & _U:  # re.UNICODE
+            # UNICODE may be combined with a bytes pattern, but the pattern
+            # is then assumed to be valid UTF-8.
+            pass
+        elif encoded and not (flags & ASCII):  # re.ASCII (not in Python 2)
+            newflags = flags | _U  # re.UNICODE
+        elif encoded and flags & ASCII:
+            newflags = flags & ~_U  # re.UNICODE
+    try:
+        pattern = _prepare_pattern(pattern, newflags)
+    except BackreferencesException:
+        return fallback(original_pattern, flags, "Backreferences not supported")
+    except CharClassProblemException:
+        # Backslashes are doubled so the message does not depend on invalid
+        # escape sequences; the resulting text is unchanged.
+        return fallback(original_pattern, flags,
+                "\\W and \\S not supported inside character classes")
+
+    # Set the options given the flags above.
+    if flags & _I:
+        opts.set_case_sensitive(0)
+
+    opts.set_max_mem(max_mem)
+    opts.set_log_errors(0)
+    if flags & _U or encoded:
+        opts.set_encoding(EncodingUTF8)
+    else:  # re.UNICODE flag not passed, and pattern is bytes,
+        # so allow matching of arbitrary byte sequences.
+        opts.set_encoding(EncodingLatin1)
+
+    s = new StringPiece(<char *><bytes>pattern, len(pattern))
+    with nogil:
+        re_pattern = new RE2(s[0], opts)
+    # The StringPiece is only an argument to the constructor; release it
+    # right away so that no later exit path can leak it.
+    del s
+
+    if not re_pattern.ok():
+        # Something went wrong with the compilation.
+        error_msg = cpp_to_unicode(re_pattern.error())
+        error_code = re_pattern.error_code()
+        del re_pattern
+        if error_code not in (ErrorBadPerlOp, ErrorRepeatSize,
+                # ErrorBadEscape,
+                ErrorPatternTooLarge):
+            # Raise an error because these will not be fixed by using the
+            # ``re`` module.
+            raise RegexError(error_msg)
+        # fallback() raises when exceptions are requested, warns when
+        # warnings are requested, and converts re.error to RegexError.
+        return fallback(original_pattern, flags, error_msg)
+
+    pypattern = Pattern()
+    named_groups = re_pattern.NamedCapturingGroups()
+    pypattern.pattern = original_pattern
+    pypattern.re_pattern = re_pattern
+    pypattern.groups = re_pattern.NumberOfCapturingGroups()
+    pypattern.encoded = encoded
+    pypattern.flags = flags
+    pypattern.groupindex = {}
+    for it in named_groups:
+        pypattern.groupindex[cpp_to_unicode(it.first)] = it.second
+
+    if flags & DEBUG:
+        print(repr(pypattern._dump_pattern()))
+    return pypattern
+
+
+def _prepare_pattern(bytes pattern, int flags):
+    """Translate pattern to RE2 syntax.
+
+    Walks the pattern byte by byte and returns a new ``bytes`` object:
+
+    * ``_S`` / ``_M`` flags become a leading ``(?s)``, ``(?m)`` or ``(?sm)``.
+    * With ``_X``, whitespace and ``#`` comments outside character classes
+      are dropped.
+    * ``\\Z`` becomes ``\\z``; ``\\b`` inside a character class becomes
+      ``\\010``.
+    * With ``_U``, ``\\d \\w \\s \\D \\W \\S`` are rewritten in terms of
+      Unicode property classes.
+
+    Raises ``BackreferencesException`` for ``\\8``, ``\\9`` and for octal or
+    ``\\x`` escapes that are not followed by the expected digits,
+    ``CharClassProblemException`` for ``\\W``/``\\S`` inside a character
+    class when ``_U`` is set, and ``RegexError`` for an unterminated
+    character class or a backslash at the very end of the pattern.
+    """
+    cdef bytearray result = bytearray()
+    cdef unsigned char * cstring = pattern
+    cdef unsigned char this, that
+    cdef int size = len(pattern)
+    cdef int n = 0
+
+    if flags & (_S | _M):
+        result.extend(b'(?')
+        if flags & _S:
+            result.extend(b's')
+        if flags & _M:
+            result.extend(b'm')
+        result.extend(b')')
+    while n < size:
+        this = cstring[n]
+        if flags & _X:
+            if this in b' \t\n\r\f\v':
+                n += 1
+                continue
+            elif this == b'#':
+                # Skip the comment up to and including the newline.
+                while True:
+                    n += 1
+                    if n >= size:
+                        break
+                    this = cstring[n]
+                    if this == b'\n':
+                        break
+                n += 1
+                continue
+
+        if this != b'[' and this != b'\\':
+            result.append(this)
+            n += 1
+            continue
+        elif this == b'[':
+            result.append(this)
+            while True:
+                n += 1
+                if n >= size:
+                    raise RegexError("unexpected end of regular expression")
+                this = cstring[n]
+                if this == b']':
+                    result.append(this)
+                    break
+                elif this == b'\\':
+                    n += 1
+                    if n >= size:
+                        # Backslash is the last byte: do not read past the
+                        # end of the pattern.
+                        raise RegexError(
+                                "unexpected end of regular expression")
+                    that = cstring[n]
+                    if that == b'b':
+                        result.extend(br'\010')
+                    elif flags & _U:
+                        if that == b'd':
+                            result.extend(br'\p{Nd}')
+                        elif that == b'w':
+                            result.extend(br'_\p{L}\p{Nd}')
+                        elif that == b's':
+                            result.extend(br'\s\p{Z}')
+                        elif that == b'D':
+                            result.extend(br'\P{Nd}')
+                        elif that == b'W':
+                            # Since \w and \s are made out of several character
+                            # groups, I don't see a way to convert their
+                            # complements into a group without rewriting the
+                            # whole expression, which seems too complicated.
+                            raise CharClassProblemException()
+                        elif that == b'S':
+                            raise CharClassProblemException()
+                        else:
+                            result.append(this)
+                            result.append(that)
+                    else:
+                        result.append(this)
+                        result.append(that)
+                else:
+                    result.append(this)
+        elif this == b'\\':
+            n += 1
+            if n >= size:
+                # A trailing backslash used to be emitted together with the
+                # NUL terminator that follows the bytes buffer, i.e. as an
+                # escaped NUL character.  Report it as an error instead.
+                raise RegexError("bad escape (end of pattern)")
+            that = cstring[n]
+            if b'8' <= that <= b'9':
+                raise BackreferencesException()
+            elif isoct(that):
+                if (n + 2 < size and isoct(cstring[n + 1])
+                        and isoct(cstring[n + 2])):
+                    # all clear, this is an octal escape
+                    result.extend(cstring[n - 1:n + 3])
+                    n += 2
+                else:
+                    raise BackreferencesException()
+            elif that == b'x':
+                if (n + 2 < size and ishex(cstring[n + 1])
+                        and ishex(cstring[n + 2])):
+                    # hex escape
+                    result.extend(cstring[n - 1:n + 3])
+                    n += 2
+                else:
+                    raise BackreferencesException()
+            elif that == b'Z':
+                result.extend(b'\\z')
+            elif flags & _U:
+                if that == b'd':
+                    result.extend(br'\p{Nd}')
+                elif that == b'w':
+                    result.extend(br'[_\p{L}\p{Nd}]')
+                elif that == b's':
+                    result.extend(br'[\s\p{Z}]')
+                elif that == b'D':
+                    result.extend(br'[^\p{Nd}]')
+                elif that == b'W':
+                    result.extend(br'[^_\p{L}\p{Nd}]')
+                elif that == b'S':
+                    result.extend(br'[^\s\p{Z}]')
+                else:
+                    result.append(this)
+                    result.append(that)
+            else:
+                result.append(this)
+                result.append(that)
+        n += 1
+    return bytes(result)
diff --git a/contrib/python/pyre2/py3/src/includes.pxi b/contrib/python/pyre2/py3/src/includes.pxi
new file mode 100644
index 0000000000..8c35b6d4b2
--- /dev/null
+++ b/contrib/python/pyre2/py3/src/includes.pxi
@@ -0,0 +1,109 @@
+cimport cpython.unicode
+from libcpp.map cimport map
+from libcpp.string cimport string as cpp_string
+from cython.operator cimport postincrement, dereference
+from cpython.buffer cimport Py_buffer, PyBUF_SIMPLE, PyObject_CheckBuffer, \
+ PyObject_GetBuffer, PyBuffer_Release
+from cpython.version cimport PY_MAJOR_VERSION
+
+
+cdef extern from *:
+ cdef void emit_if_narrow_unicode "#if !defined(Py_UNICODE_WIDE) && PY_VERSION_HEX < 0x03030000 //" ()
+ cdef void emit_endif "#endif //" ()
+
+
+cdef extern from "Python.h":
+ int PyObject_CheckReadBuffer(object)
+ int PyObject_AsReadBuffer(object, const void **, Py_ssize_t *)
+
+
+cdef extern from "re2/stringpiece.h" namespace "re2":
+ cdef cppclass StringPiece:
+ StringPiece()
+ StringPiece(const char *)
+ StringPiece(const char *, int)
+ const char * data()
+ int copy(char * buf, size_t n, size_t pos)
+ int length()
+
+
+cdef extern from "re2/re2.h" namespace "re2":
+ cdef enum Anchor:
+ UNANCHORED "RE2::UNANCHORED"
+ ANCHOR_START "RE2::ANCHOR_START"
+ ANCHOR_BOTH "RE2::ANCHOR_BOTH"
+
+ ctypedef Anchor re2_Anchor "RE2::Anchor"
+
+ cdef enum ErrorCode:
+ NoError "RE2::NoError"
+ ErrorInternal "RE2::ErrorInternal"
+ # Parse errors
+ ErrorBadEscape "RE2::ErrorBadEscape" # bad escape sequence
+ ErrorBadCharClass "RE2::ErrorBadCharClass" # bad character class
+ ErrorBadCharRange "RE2::ErrorBadCharRange" # bad character class range
+ ErrorMissingBracket "RE2::ErrorMissingBracket" # missing closing ]
+ ErrorMissingParen "RE2::ErrorMissingParen" # missing closing )
+ ErrorTrailingBackslash "RE2::ErrorTrailingBackslash" # trailing \ at end of regexp
+ ErrorRepeatArgument "RE2::ErrorRepeatArgument" # repeat argument missing, e.g. "*"
+ ErrorRepeatSize "RE2::ErrorRepeatSize" # bad repetition argument
+ ErrorRepeatOp "RE2::ErrorRepeatOp" # bad repetition operator
+ ErrorBadPerlOp "RE2::ErrorBadPerlOp" # bad perl operator
+ ErrorBadUTF8 "RE2::ErrorBadUTF8" # invalid UTF-8 in regexp
+ ErrorBadNamedCapture "RE2::ErrorBadNamedCapture" # bad named capture group
+ ErrorPatternTooLarge "RE2::ErrorPatternTooLarge" # pattern too large (compile failed)
+
+ cdef enum Encoding:
+ EncodingUTF8 "RE2::Options::EncodingUTF8"
+ EncodingLatin1 "RE2::Options::EncodingLatin1"
+
+ ctypedef Encoding re2_Encoding "RE2::Options::Encoding"
+
+ cdef cppclass Options "RE2::Options":
+ Options()
+ void set_posix_syntax(int b)
+ void set_longest_match(int b)
+ void set_log_errors(int b)
+ void set_max_mem(int m)
+ void set_literal(int b)
+ void set_never_nl(int b)
+ void set_case_sensitive(int b)
+ void set_perl_classes(int b)
+ void set_word_boundary(int b)
+ void set_one_line(int b)
+ int case_sensitive()
+ void set_encoding(re2_Encoding encoding)
+
+ cdef cppclass RE2:
+ RE2(const StringPiece pattern, Options option) nogil
+ RE2(const StringPiece pattern) nogil
+ int Match(const StringPiece text, int startpos, int endpos,
+ Anchor anchor, StringPiece * match, int nmatch) nogil
+ int Replace(cpp_string *str, const RE2 pattern,
+ const StringPiece rewrite) nogil
+ int GlobalReplace(cpp_string *str, const RE2 pattern,
+ const StringPiece rewrite) nogil
+ int NumberOfCapturingGroups()
+ int ok()
+ const cpp_string pattern()
+ cpp_string error()
+ ErrorCode error_code()
+ const map[cpp_string, int]& NamedCapturingGroups()
+
+ # hack for static methods
+ cdef int Replace "RE2::Replace"(
+ cpp_string *str, const RE2 pattern,
+ const StringPiece rewrite) nogil
+ cdef int GlobalReplace "RE2::GlobalReplace"(
+ cpp_string *str,
+ const RE2 pattern,
+ const StringPiece rewrite) nogil
+
+
+cdef extern from "_re2macros.h":
+ StringPiece * new_StringPiece_array(int) nogil
+
+
+cdef extern from *:
+ # StringPiece * new_StringPiece_array "new re2::StringPiece[n]" (int) nogil
+ void delete_StringPiece_array "delete[]" (StringPiece *) nogil
diff --git a/contrib/python/pyre2/py3/src/match.pxi b/contrib/python/pyre2/py3/src/match.pxi
new file mode 100644
index 0000000000..3eaae74b47
--- /dev/null
+++ b/contrib/python/pyre2/py3/src/match.pxi
@@ -0,0 +1,280 @@
+cdef class Match:
+    """Result of matching a ``Pattern`` against a string.
+
+    The object owns a C++ array of ``StringPiece`` slots (``matches``) that
+    RE2 fills in; ``_make_spans`` and ``_init_groups`` turn those slots into
+    the Python-level ``regs`` tuple and the cached group values.  When
+    ``encoded`` is non-zero, group values are stored as UTF-8 bytes and
+    decoded on access.
+    """
+    cdef readonly Pattern re
+    cdef readonly object string
+    cdef readonly int pos
+    cdef readonly int endpos
+    cdef readonly tuple regs
+
+    cdef StringPiece * matches
+    cdef int encoded
+    cdef int nmatches
+    cdef int _lastindex
+    cdef tuple _groups
+    cdef dict _named_groups
+
+    property lastindex:
+        def __get__(self):
+            """Index of the last matched group, or None if there is none."""
+            return None if self._lastindex < 1 else self._lastindex
+
+    property lastgroup:
+        def __get__(self):
+            """Name of the last matched group, or None if it has no name."""
+            if self._lastindex < 1:
+                return None
+            for name, n in self.re.groupindex.items():
+                if n == self._lastindex:
+                    return name
+            return None
+
+    def __init__(self, Pattern pattern_object, int num_groups):
+        """Allocate ``num_groups + 1`` StringPiece slots for ``pattern_object``."""
+        self._lastindex = -1
+        self._groups = None
+        self.pos = 0
+        self.endpos = -1
+        self.matches = new_StringPiece_array(num_groups + 1)
+        self.nmatches = num_groups
+        self.re = pattern_object
+
+    cdef _init_groups(self):
+        """Build ``_groups`` from ``matches`` and determine ``_lastindex``."""
+        cdef list groups = []
+        cdef int i
+        cdef const char * last_end = NULL
+        cdef const char * cur_end = NULL
+
+        for i in range(self.nmatches):
+            if self.matches[i].data() == NULL:
+                groups.append(None)
+            else:
+                if i > 0:
+                    cur_end = self.matches[i].data() + self.matches[i].length()
+
+                    if last_end == NULL:
+                        last_end = cur_end
+                        self._lastindex = i
+                    else:
+                        # The rules for last group are a bit complicated:
+                        # if two groups end at the same point, the earlier one
+                        # is considered last, so we don't switch our selection
+                        # unless the end point has moved.
+                        if cur_end > last_end:
+                            last_end = cur_end
+                            self._lastindex = i
+                groups.append(
+                        self.matches[i].data()[:self.matches[i].length()])
+        self._groups = tuple(groups)
+
+    cdef bytes _group(self, object groupnum):
+        """Return the raw value of a group given by index or by name.
+
+        Raises IndexError for an unknown name or an index outside
+        ``0 .. nmatches - 1``.  Negative indices are rejected rather than
+        being interpreted as counting from the end of the group tuple.
+        """
+        cdef int idx
+        if isinstance(groupnum, int):
+            idx = groupnum
+            if idx < 0 or idx > self.nmatches - 1:
+                raise IndexError("no such group %d; available groups: %r"
+                        % (idx, list(range(self.nmatches))))
+            return self._groups[idx]
+        groupdict = self._groupdict()
+        if groupnum not in groupdict:
+            raise IndexError("no such group %r; available groups: %r"
+                    % (groupnum, list(groupdict)))
+        return groupdict[groupnum]
+
+    cdef dict _groupdict(self):
+        """Return (and cache) the mapping of group name to raw group value."""
+        if self._named_groups is None:
+            self._named_groups = {name: self._groups[n]
+                    for name, n in self.re.groupindex.items()}
+        return self._named_groups
+
+    def groups(self, default=None):
+        """Return all groups except group 0, with ``default`` for unmatched ones."""
+        if self.encoded:
+            return tuple([default if g is None else g.decode('utf8')
+                    for g in self._groups[1:]])
+        return tuple([default if g is None else g
+                for g in self._groups[1:]])
+
+    def group(self, *args):
+        """Return one group (default: group 0) or a tuple of several groups."""
+        if len(args) == 0:
+            groupnum = 0
+        elif len(args) == 1:
+            groupnum = args[0]
+        else:  # len(args) > 1:
+            return tuple([self.group(i) for i in args])
+        if self.encoded:
+            result = self._group(groupnum)
+            return None if result is None else result.decode('utf8')
+        return self._group(groupnum)
+
+    def groupdict(self):
+        """Return a dict mapping each group name to its value (or None)."""
+        result = self._groupdict()
+        if self.encoded:
+            return {a: None if b is None else b.decode('utf8')
+                    for a, b in result.items()}
+        return result
+
+    def expand(self, object template):
+        """Expand a template with groups."""
+        cdef bytearray result = bytearray()
+        if isinstance(template, unicode):
+            if not PY2 and not self.encoded:
+                raise ValueError(
+                        'cannot expand unicode template on bytes pattern')
+            templ = template.encode('utf8')
+        else:
+            if not PY2 and self.encoded:
+                raise ValueError(
+                        'cannot expand bytes template on unicode pattern')
+            templ = bytes(template)
+        self._expand(templ, result)
+        return result.decode('utf8') if self.encoded else bytes(result)
+
+    cdef _expand(self, bytes templ, bytearray result):
+        """Expand template by appending to an existing bytearray.
+        Everything remains UTF-8 encoded.
+
+        Raises RegexError for invalid group references, malformed group
+        names, and a backslash at the very end of the template."""
+        cdef char * cstring
+        cdef int n = 0, prev = 0, size
+
+        # NB: cstring is used to get single characters, to avoid difference in
+        # Python 2/3 behavior of bytes objects.
+        cstring = templ
+        size = len(templ)
+        while True:
+            prev = n
+            n = templ.find(b'\\', prev)
+            if n == -1:
+                result.extend(templ[prev:])
+                break
+            result.extend(templ[prev:n])
+            n += 1
+            if n >= size:
+                # The backslash was the last byte.  Without this check the
+                # code below would read the NUL terminator and append a
+                # spurious NUL byte to the output.
+                raise RegexError('bad escape (end of pattern)')
+            if (n + 2 < size and cstring[n] == b'x'
+                    and ishex(cstring[n + 1]) and ishex(cstring[n + 2])):
+                # hex char reference \x1f
+                result.append(int(templ[n + 1:n + 3], base=16) & 255)
+                n += 3
+            elif (n + 2 < size and isoct(cstring[n]) and isoct(cstring[n + 1])
+                    and isoct(cstring[n + 2])):
+                # octal char reference \123
+                result.append(int(templ[n:n + 3], base=8) & 255)
+                n += 3
+            elif cstring[n] == b'0':
+                if n + 1 < size and isoct(cstring[n + 1]):
+                    # 2 character octal: \01
+                    result.append(int(templ[n:n + 2], base=8))
+                    n += 2
+                else:  # nul-terminator literal \0
+                    result.append(b'\0')
+                    n += 1
+            elif b'0' <= cstring[n] <= b'9':  # numeric group reference
+                if n + 1 < size and isdigit(cstring[n + 1]):
+                    # 2 digit group ref \12
+                    groupno = int(templ[n:n + 2])
+                    n += 2
+                else:
+                    # 1 digit group ref \1
+                    groupno = int(templ[n:n + 1])
+                    n += 1
+                if groupno <= self.re.groups:
+                    groupval = self._group(groupno)
+                    if groupval is not None:
+                        result.extend(groupval)
+                else:
+                    raise RegexError('invalid group reference.')
+            elif cstring[n] == b'g':  # named group reference
+                n += 1
+                if n >= size or cstring[n] != b'<':
+                    raise RegexError('missing group name')
+                n += 1
+                start = n
+                while cstring[n] != b'>':
+                    if not isident(cstring[n]):
+                        raise RegexError('bad character in group name')
+                    n += 1
+                    if n >= size:
+                        raise RegexError('unterminated group name')
+                if templ[start:n].isdigit():
+                    name = int(templ[start:n])
+                elif isdigit(cstring[start]):
+                    raise RegexError('bad character in group name')
+                else:
+                    name = templ[start:n]
+                    if self.encoded:
+                        name = name.decode('utf8')
+                groupval = self._group(name)
+                if groupval is not None:
+                    result.extend(groupval)
+                n += 1
+            else:
+                if cstring[n] == b'n':
+                    result.append(b'\n')
+                elif cstring[n] == b'r':
+                    result.append(b'\r')
+                elif cstring[n] == b't':
+                    result.append(b'\t')
+                elif cstring[n] == b'v':
+                    result.append(b'\v')
+                elif cstring[n] == b'f':
+                    result.append(b'\f')
+                elif cstring[n] == b'a':
+                    result.append(b'\a')
+                elif cstring[n] == b'b':
+                    result.append(b'\b')
+                elif cstring[n] == b'\\':
+                    result.append(b'\\')
+                else:  # copy verbatim
+                    result.append(b'\\')
+                    result.append(cstring[n])
+                n += 1
+        return bytes(result)
+
+    def start(self, group=0):
+        """Return the start offset of ``group`` (first element of its span)."""
+        return self.span(group)[0]
+
+    def end(self, group=0):
+        """Return the end offset of ``group`` (second element of its span)."""
+        return self.span(group)[1]
+
+    def span(self, group=0):
+        """Return the ``(start, end)`` pair stored in ``regs`` for ``group``.
+
+        ``group`` is an index or a group name.  Raises IndexError for an
+        unknown name or an index outside ``0 .. len(regs) - 1``; negative
+        indices are rejected rather than counting from the end.
+        """
+        if isinstance(group, int):
+            if not 0 <= group < len(self.regs):
+                raise IndexError("no such group %d; available groups: %r"
+                        % (group, list(range(len(self.regs)))))
+            return self.regs[group]
+        else:
+            self._groupdict()
+            if group not in self.re.groupindex:
+                raise IndexError("no such group %r; available groups: %r"
+                        % (group, list(self.re.groupindex)))
+            return self.regs[self.re.groupindex[group]]
+
+    cdef _make_spans(self, char * cstring, int size, int * cpos, int * upos):
+        """Fill ``regs`` with one ``(start, end)`` pair per match slot.
+
+        Offsets are measured from ``cstring``; slots whose data pointer is
+        NULL get ``(-1, -1)``.  When ``encoded == 2`` the pairs are passed
+        through ``_convert_spans`` before being stored.
+        """
+        cdef int start, end
+        cdef StringPiece * piece
+
+        spans = []
+        for i in range(self.nmatches):
+            if self.matches[i].data() == NULL:
+                spans.append((-1, -1))
+            else:
+                piece = &self.matches[i]
+                start = piece.data() - cstring
+                end = start + piece.length()
+                spans.append((start, end))
+
+        if self.encoded == 2:
+            spans = self._convert_spans(spans, cstring, size, cpos, upos)
+
+        self.regs = tuple(spans)
+
+    cdef list _convert_spans(self, spans,
+            char * cstring, int size, int * cpos, int * upos):
+        """Map every span offset through ``unicodeindices`` and rebuild the list.
+
+        NOTE(review): presumably this converts UTF-8 byte offsets into
+        character offsets; confirm against ``unicodeindices``.
+        """
+        cdef map[int, int] positions
+        cdef int x, y
+        for x, y in spans:
+            positions[x] = x
+            positions[y] = y
+        unicodeindices(positions, cstring, size, cpos, upos)
+        return [(positions[x], positions[y]) for x, y in spans]
+
+    def __dealloc__(self):
+        # Release the StringPiece array allocated in __init__.
+        delete_StringPiece_array(self.matches)
+
+    def __repr__(self):
+        return '<re2.Match object; span=%r, match=%r>' % (
+                self.span(), self.group())
diff --git a/contrib/python/pyre2/py3/src/pattern.pxi b/contrib/python/pyre2/py3/src/pattern.pxi
new file mode 100644
index 0000000000..b8439d2007
--- /dev/null
+++ b/contrib/python/pyre2/py3/src/pattern.pxi
@@ -0,0 +1,650 @@
+cdef class Pattern:
+ cdef readonly object pattern # original pattern in Python format
+ cdef readonly int flags
+ cdef readonly int groups # number of groups
+ cdef readonly dict groupindex # name => group number
+ cdef object __weakref__
+
+ cdef bint encoded # True if this was originally a Unicode pattern
+ cdef RE2 * re_pattern
+
+    def search(self, object string, int pos=0, int endpos=-1):
+        """Look for the pattern anywhere in ``string``.
+
+        Delegates to ``_search`` with ``UNANCHORED`` anchoring and returns
+        its result: a Match instance, or None when nothing matches."""
+        found = self._search(string, pos, endpos, UNANCHORED)
+        return found
+
+    def match(self, object string, int pos=0, int endpos=-1):
+        """Try to match the pattern at the start of ``string``.
+
+        Delegates to ``_search`` with ``ANCHOR_START`` anchoring and returns
+        its result unchanged."""
+        found = self._search(string, pos, endpos, ANCHOR_START)
+        return found
+
+    def fullmatch(self, object string, int pos=0, int endpos=-1):
+        """fullmatch(string[, pos[, endpos]]) --> Match object or None.
+
+        Match the pattern against the entire string by delegating to
+        ``_search`` with ``ANCHOR_BOTH`` anchoring."""
+        found = self._search(string, pos, endpos, ANCHOR_BOTH)
+        return found
+
+    cdef _search(self, object string, int pos, int endpos,
+            re2_Anchor anchoring):
+        """Run one RE2 match over ``string`` with the given anchoring.
+
+        Returns a populated Match instance, or None when the range is empty
+        (``0 <= endpos <= pos``), when ``pos`` lies beyond the end of the
+        buffer, or when RE2 reports no match.  Raises TypeError when no
+        character buffer can be obtained from ``string``."""
+        cdef char * cstring
+        cdef Py_ssize_t size
+        cdef Py_buffer buf
+        cdef int matched
+        cdef int encoded = 0
+        cdef StringPiece * text
+        cdef Match m = Match(self, self.groups + 1)
+        cdef int cpos = 0, upos = pos
+
+        if endpos >= 0 and pos >= endpos:
+            return None
+
+        data = unicode_to_bytes(string, &encoded, self.encoded)
+        if pystring_to_cstring(data, &cstring, &size, &buf) == -1:
+            raise TypeError('expected string or buffer')
+        try:
+            if encoded == 2 and (pos or endpos != -1):
+                utf8indices(cstring, size, &pos, &endpos)
+                cpos = pos
+            if pos > size:
+                return None
+            if 0 <= endpos < size:
+                size = endpos
+
+            text = new StringPiece(cstring, size)
+            with nogil:
+                matched = self.re_pattern.Match(
+                        text[0], pos, size, anchoring,
+                        m.matches, self.groups + 1)
+            del text
+            if not matched:
+                return None
+
+            # Fill in the Python-visible state of the Match object.
+            m.encoded = encoded
+            m.nmatches = self.groups + 1
+            m.string = string
+            m.pos = pos
+            m.endpos = size if endpos == -1 else endpos
+            m._make_spans(cstring, size, &cpos, &upos)
+            m._init_groups()
+        finally:
+            release_cstring(&buf)
+        return m
+
+    def contains(self, object string, int pos=0, int endpos=-1):
+        """contains(string[, pos[, endpos]]) --> bool.
+
+        Report whether the pattern matches anywhere in ``string``.  No group
+        information is requested from RE2, so only True or False comes back.
+        Raises TypeError when no character buffer can be obtained."""
+        cdef char * cstring
+        cdef Py_ssize_t size
+        cdef Py_buffer buf
+        cdef int matched
+        cdef int encoded = 0
+        cdef StringPiece * text
+
+        if endpos >= 0 and pos >= endpos:
+            return False
+
+        data = unicode_to_bytes(string, &encoded, self.encoded)
+        if pystring_to_cstring(data, &cstring, &size, &buf) == -1:
+            raise TypeError('expected string or buffer')
+        try:
+            if encoded == 2 and (pos or endpos != -1):
+                utf8indices(cstring, size, &pos, &endpos)
+            if pos > size:
+                return False
+            if 0 <= endpos < size:
+                size = endpos
+
+            text = new StringPiece(cstring, size)
+            with nogil:
+                matched = self.re_pattern.Match(
+                        text[0], pos, size, UNANCHORED, NULL, 0)
+            del text
+        finally:
+            release_cstring(&buf)
+        return matched != 0
+
+ def count(self, object string, int pos=0, int endpos=-1):
+ """Return number of non-overlapping matches of pattern in string.
+
+ Matching starts at ``pos``; a non-negative ``endpos`` smaller than the
+ buffer size truncates the searched range. Raises TypeError when no
+ character buffer can be obtained from ``string``."""
+ cdef char * cstring
+ cdef Py_ssize_t size
+ cdef Py_buffer buf
+ cdef int retval
+ cdef int encoded = 0
+ cdef int result = 0
+ cdef StringPiece * sp = NULL
+ cdef StringPiece * matches = NULL
+
+ bytestr = unicode_to_bytes(string, &encoded, self.encoded)
+ if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
+ raise TypeError('expected string or buffer')
+ try:
+ # NOTE(review): presumably utf8indices turns character offsets into
+ # byte offsets for encoded == 2 input; confirm against its definition.
+ if encoded == 2 and (pos or endpos != -1):
+ utf8indices(cstring, size, &pos, &endpos)
+ if pos > size:
+ return 0
+ if 0 <= endpos < size:
+ size = endpos
+
+ sp = new StringPiece(cstring, size)
+ # Only the overall match (slot 0) is requested, so one slot suffices.
+ matches = new_StringPiece_array(1)
+ try:
+ while True:
+ with nogil:
+ retval = self.re_pattern.Match(
+ sp[0],
+ pos,
+ size,
+ UNANCHORED,
+ matches,
+ 1)
+ if retval == 0:
+ break
+ result += 1
+ if pos == size:
+ break
+ # offset the pos to move to the next point
+ # (an empty match advances by one so the loop makes progress)
+ pos = matches[0].data() - cstring + (
+ matches[0].length() or 1)
+ finally:
+ del sp
+ delete_StringPiece_array(matches)
+ finally:
+ release_cstring(&buf)
+ return result
+
+ def findall(self, object string, int pos=0, int endpos=-1):
+ """Return all non-overlapping matches of pattern in string as a list
+ of strings.
+
+ With more than one group, each list item is a tuple holding the text
+ of every group; otherwise each item is a single string. Raises
+ TypeError when no character buffer can be obtained from ``string``."""
+ cdef char * cstring
+ cdef Py_ssize_t size
+ cdef Py_buffer buf
+ cdef int encoded = 0
+ cdef int retval
+ cdef list resultlist = []
+ cdef StringPiece * sp = NULL
+ cdef StringPiece * matches = NULL
+
+ bytestr = unicode_to_bytes(string, &encoded, self.encoded)
+ if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
+ raise TypeError('expected string or buffer')
+ try:
+ # NOTE(review): presumably utf8indices turns character offsets into
+ # byte offsets for encoded == 2 input; confirm against its definition.
+ if encoded == 2 and (pos or endpos != -1):
+ utf8indices(cstring, size, &pos, &endpos)
+ if pos > size:
+ return []
+ if 0 <= endpos < size:
+ size = endpos
+
+ sp = new StringPiece(cstring, size)
+ # One slot for the overall match plus one per capturing group.
+ matches = new_StringPiece_array(self.groups + 1)
+
+ while True:
+ with nogil:
+ retval = self.re_pattern.Match(
+ sp[0],
+ pos,
+ size,
+ UNANCHORED,
+ matches,
+ self.groups + 1)
+ if retval == 0:
+ break
+ # Several groups: collect them in a tuple, using an empty string for
+ # any group whose data pointer is NULL.
+ if self.groups > 1:
+ if encoded:
+ resultlist.append(tuple([
+ '' if matches[i].data() is NULL else
+ matches[i].data()[:matches[i].length()
+ ].decode('utf8')
+ for i in range(1, self.groups + 1)]))
+ else:
+ resultlist.append(tuple([
+ b'' if matches[i].data() is NULL
+ else matches[i].data()[:matches[i].length()]
+ for i in range(1, self.groups + 1)]))
+ else: # 0 or 1 group; return list of strings
+ # matches[self.groups] is slot 0 (whole match) when there are no
+ # groups and slot 1 (the single group) otherwise.
+ if encoded:
+ resultlist.append(matches[self.groups].data()[
+ :matches[self.groups].length()].decode('utf8'))
+ else:
+ resultlist.append(matches[self.groups].data()[
+ :matches[self.groups].length()])
+ if pos == size:
+ break
+ # offset the pos to move to the next point
+ # (an empty match advances by one so the loop makes progress)
+ pos = matches[0].data() - cstring + (matches[0].length() or 1)
+ finally:
+ del sp
+ delete_StringPiece_array(matches)
+ release_cstring(&buf)
+ return resultlist
+
+    def finditer(self, object string, int pos=0, int endpos=-1):
+        """Return an iterator over all non-overlapping matches of the pattern
+        in ``string``, each delivered as a Match object."""
+        matches = iter(self._finditer(string, pos, endpos))
+        # Consume the generator's dummy first value so that argument errors
+        # are raised here rather than on the caller's first iteration.
+        next(matches)
+        return matches
+
+ def _finditer(self, object string, int pos=0, int endpos=-1):
+ """Generator behind finditer().
+
+ The first value yielded is a dummy (None) produced after the input
+ buffer has been set up; every following value is a Match object.
+ Raises TypeError when no character buffer can be obtained."""
+ cdef char * cstring
+ cdef Py_ssize_t size
+ cdef Py_buffer buf
+ cdef int retval
+ cdef StringPiece * sp = NULL
+ cdef Match m
+ cdef int encoded = 0
+ cdef int cpos = 0, upos = pos
+
+ bytestr = unicode_to_bytes(string, &encoded, self.encoded)
+ if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
+ raise TypeError('expected string or buffer')
+ try:
+ # NOTE(review): presumably utf8indices turns character offsets into
+ # byte offsets for encoded == 2 input; confirm against its definition.
+ if encoded == 2 and (pos or endpos != -1):
+ utf8indices(cstring, size, &pos, &endpos)
+ cpos = pos
+ if pos > size:
+ return
+ if 0 <= endpos < size:
+ size = endpos
+
+ sp = new StringPiece(cstring, size)
+
+ # Dummy value: finditer() consumes it with next() so the setup above
+ # runs before the caller starts iterating.
+ yield
+ while True:
+ # A fresh Match per iteration; RE2 writes directly into its slots.
+ m = Match(self, self.groups + 1)
+ m.string = string
+ with nogil:
+ retval = self.re_pattern.Match(
+ sp[0],
+ pos,
+ size,
+ UNANCHORED,
+ m.matches,
+ self.groups + 1)
+ if retval == 0:
+ break
+ m.encoded = encoded
+ m.nmatches = self.groups + 1
+ m.pos = pos
+ if endpos == -1:
+ m.endpos = size
+ else:
+ m.endpos = endpos
+ m._make_spans(cstring, size, &cpos, &upos)
+ m._init_groups()
+ yield m
+ if pos == size:
+ break
+ # offset the pos to move to the next point
+ # (an empty match advances by one so the loop makes progress)
+ pos = m.matches[0].data() - cstring + (
+ m.matches[0].length() or 1)
+ finally:
+ del sp
+ release_cstring(&buf)
+
+ def split(self, string, int maxsplit=0):
+ """split(string[, maxsplit = 0]) --> list
+
+ Split a string by the occurrences of the pattern.
+
+ The text of every capturing group is inserted into the result after
+ the piece that precedes the match (None for groups whose data pointer
+ is NULL). A ``maxsplit`` of 0 (or a negative value, which is reset to
+ 0) means no limit. Raises TypeError when no character buffer can be
+ obtained from ``string``."""
+ cdef char * cstring
+ cdef Py_ssize_t size
+ cdef int retval
+ cdef int pos = 0
+ cdef int lookahead = 0
+ cdef int num_split = 0
+ cdef StringPiece * sp
+ cdef StringPiece * matches
+ cdef list resultlist = []
+ cdef int encoded = 0
+ cdef Py_buffer buf
+
+ if maxsplit < 0:
+ maxsplit = 0
+
+ bytestr = unicode_to_bytes(string, &encoded, self.encoded)
+ if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
+ raise TypeError('expected string or buffer')
+ matches = new_StringPiece_array(self.groups + 1)
+ sp = new StringPiece(cstring, size)
+ try:
+
+ while True:
+ # pos marks the start of the pending piece; lookahead is how far past
+ # pos the search resumes after empty matches.
+ with nogil:
+ retval = self.re_pattern.Match(
+ sp[0],
+ pos + lookahead,
+ size,
+ UNANCHORED,
+ matches,
+ self.groups + 1)
+ if retval == 0:
+ break
+
+ match_start = matches[0].data() - cstring
+ match_end = match_start + matches[0].length()
+
+ # If an empty match, just look ahead until you find something
+ if match_start == match_end:
+ if pos + lookahead == size:
+ break
+ lookahead += 1
+ continue
+
+ # Emit the text between the previous match and this one.
+ if encoded:
+ resultlist.append(
+ char_to_unicode(&sp.data()[pos], match_start - pos))
+ else:
+ resultlist.append(sp.data()[pos:match_start])
+ if self.groups > 0:
+ for group in range(self.groups):
+ if matches[group + 1].data() == NULL:
+ resultlist.append(None)
+ else:
+ if encoded:
+ resultlist.append(char_to_unicode(
+ matches[group + 1].data(),
+ matches[group + 1].length()))
+ else:
+ resultlist.append(matches[group + 1].data()[:
+ matches[group + 1].length()])
+
+ # offset the pos to move to the next point
+ pos = match_end
+ lookahead = 0
+
+ num_split += 1
+ if maxsplit and num_split >= maxsplit:
+ break
+
+ # Whatever follows the last split point forms the final piece.
+ if encoded:
+ resultlist.append(
+ char_to_unicode(&sp.data()[pos], sp.length() - pos))
+ else:
+ resultlist.append(sp.data()[pos:])
+ finally:
+ del sp
+ delete_StringPiece_array(matches)
+ release_cstring(&buf)
+ return resultlist
+
+    def sub(self, repl, string, int count=0):
+        """sub(repl, string[, count = 0]) --> newstring
+
+        Replace the leftmost non-overlapping occurrences of the pattern in
+        ``string`` with ``repl`` and return the resulting string.  The work
+        is done by ``_subn``; the substitution counter is discarded."""
+        cdef int replaced = 0
+        newstring = self._subn(repl, string, count, &replaced)
+        return newstring
+
+    def subn(self, repl, string, int count=0):
+        """subn(repl, string[, count = 0]) --> (newstring, number of subs)
+
+        Replace the leftmost non-overlapping occurrences of the pattern in
+        ``string`` with ``repl``.  Returns a tuple of the new string and the
+        number of substitutions that ``_subn`` reported."""
+        cdef int replaced = 0
+        newstring = self._subn(repl, string, count, &replaced)
+        return (newstring, replaced)
+
+ cdef _subn(self, repl, string, int count, int *num_repl):
+ """Shared implementation of sub() and subn().
+
+ Dispatches to ``_subn_callback`` when ``repl`` is callable and to
+ ``_subn_expand`` when ``count > 1`` or ``repl`` contains a backslash;
+ otherwise RE2's ``GlobalReplace`` (count == 0) or ``Replace``
+ (count == 1) is applied to a copy of ``string``. The number of
+ replacements is written to ``num_repl[0]`` and the new string is
+ returned."""
+ cdef bytes repl_b
+ cdef char * cstring
+ cdef object result
+ cdef Py_ssize_t size
+ cdef StringPiece * sp = NULL
+ cdef cpp_string * input_str = NULL
+ cdef int string_encoded = 0
+ cdef int repl_encoded = 0
+
+ if callable(repl):
+ # This is a callback, so use the custom function
+ return self._subn_callback(repl, string, count, num_repl)
+
+ repl_b = unicode_to_bytes(repl, &repl_encoded, self.encoded)
+ if not repl_encoded and not isinstance(repl, bytes):
+ repl_b = bytes(repl) # coerce buffer to bytes object
+
+ if count > 1 or (b'\\' if PY2 else <char>b'\\') in repl_b:
+ # Limit on number of substitutions or replacement string contains
+ # escape sequences; handle with Match.expand() implementation.
+ # RE2 does support simple numeric group references \1, \2,
+ # but the number of differences with Python behavior is
+ # non-trivial.
+ return self._subn_expand(repl_b, string, count, num_repl)
+ try:
+ cstring = repl_b
+ size = len(repl_b)
+ sp = new StringPiece(cstring, size)
+
+ bytestr = unicode_to_bytes(string, &string_encoded, self.encoded)
+ if not string_encoded and not isinstance(bytestr, bytes):
+ bytestr = bytes(bytestr) # coerce buffer to bytes object
+ # RE2 rewrites this std::string copy in place.
+ input_str = new cpp_string(<char *>bytestr, len(bytestr))
+ # NB: RE2 treats unmatched groups in repl as empty string;
+ # Python raises an error.
+ # NOTE: a negative count takes neither branch below, so the input is
+ # returned without any replacement and num_repl[0] is left untouched.
+ with nogil:
+ if count == 0:
+ num_repl[0] = GlobalReplace(
+ input_str, self.re_pattern[0], sp[0])
+ elif count == 1:
+ num_repl[0] = Replace(
+ input_str, self.re_pattern[0], sp[0])
+
+ # The result is text when the subject was text, or when a text
+ # replacement was actually substituted in; otherwise it stays bytes.
+ if string_encoded or (repl_encoded and num_repl[0] > 0):
+ result = cpp_to_unicode(input_str[0])
+ else:
+ result = cpp_to_bytes(input_str[0])
+ finally:
+ del input_str, sp
+ return result
+
+    cdef _subn_callback(self, callback, string, int count, int * num_repl):
+        """Perform substitutions where the replacement comes from a callable.
+
+        Repeatedly searches ``string`` for the pattern, copies the text
+        between matches into a ``bytearray``, and appends whatever
+        ``callback(match)`` returns for each match.
+
+        :param callback: callable invoked with a ``Match`` object per match.
+        :param string: subject string (unicode or bytes/buffer).
+        :param count: maximum number of substitutions; 0 or a negative
+            value means no limit.
+        :param num_repl: out-parameter incremented once per substitution.
+        :returns: the new string, decoded from UTF-8 when ``string`` was
+            unicode, bytes otherwise.
+        :raises TypeError: when no buffer can be obtained from ``string``."""
+        # This function is probably the hardest to implement correctly.
+        # This is my first attempt, but if anybody has a better solution,
+        # please help out.
+        cdef char * cstring
+        cdef Py_ssize_t size
+        cdef Py_buffer buf
+        cdef int retval
+        cdef int prevendpos = -1
+        cdef int endpos = 0
+        cdef int pos = 0
+        cdef int encoded = 0
+        cdef StringPiece * sp
+        cdef Match m
+        cdef bytearray result = bytearray()
+        # Byte / unicode offsets handed to _make_spans on every iteration;
+        # presumably they let it resume its index conversion -- TODO confirm.
+        cdef int cpos = 0, upos = 0
+
+        if count < 0:
+            count = 0
+
+        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
+        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
+            raise TypeError('expected string or buffer')
+        sp = new StringPiece(cstring, size)
+        try:
+            while True:
+                # A fresh Match object is created for every search.
+                m = Match(self, self.groups + 1)
+                m.string = string
+                with nogil:
+                    retval = self.re_pattern.Match(
+                            sp[0],
+                            pos,
+                            size,
+                            UNANCHORED,
+                            m.matches,
+                            self.groups + 1)
+                if retval == 0:
+                    break
+
+                # Byte offset at which the whole match (group 0) starts.
+                endpos = m.matches[0].data() - cstring
+                # NOTE(review): when the match starts where the previous one
+                # did, endpos is advanced by one byte without searching
+                # again, yet pos below is still derived from the length of
+                # m.matches[0].  Confirm this is right when that repeated
+                # match is non-empty or the skipped byte belongs to a
+                # multi-byte character.
+                if endpos == prevendpos:
+                    endpos += 1
+                    if endpos > size:
+                        break
+                prevendpos = endpos
+                # Copy the unmatched text preceding the match.
+                result.extend(sp.data()[pos:endpos])
+                pos = endpos + m.matches[0].length()
+
+                m.encoded = encoded
+                m.nmatches = self.groups + 1
+                m._make_spans(cstring, size, &cpos, &upos)
+                m._init_groups()
+                tmp = callback(m)
+                # A falsy return value contributes nothing to the output.
+                if tmp:
+                    result.extend(tmp.encode('utf8') if encoded else tmp)
+                else:
+                    result.extend(b'')
+
+                num_repl[0] += 1
+                if count and num_repl[0] >= count:
+                    break
+            # Append whatever follows the last processed match.
+            result.extend(sp.data()[pos:])
+        finally:
+            del sp
+            release_cstring(&buf)
+        return result.decode('utf8') if encoded else bytes(result)
+
+    cdef _subn_expand(self, bytes repl, string, int count, int * num_repl):
+        """Perform ``count`` substitutions with replacement string and
+        Match.expand.
+
+        Same search loop as ``_subn_callback``, except that each replacement
+        is produced by ``Match._expand(repl, result)`` instead of a callback.
+
+        :param repl: replacement template as bytes.
+        :param string: subject string (unicode or bytes/buffer).
+        :param count: maximum number of substitutions; 0 or a negative
+            value means no limit.
+        :param num_repl: out-parameter incremented once per substitution.
+        :returns: the new string, decoded from UTF-8 when ``string`` was
+            unicode, bytes otherwise.
+        :raises TypeError: when no buffer can be obtained from ``string``."""
+        cdef char * cstring
+        cdef Py_ssize_t size
+        cdef Py_buffer buf
+        cdef int retval
+        cdef int prevendpos = -1
+        cdef int endpos = 0
+        cdef int pos = 0
+        cdef int encoded = 0
+        cdef StringPiece * sp
+        cdef Match m
+        cdef bytearray result = bytearray()
+
+        if count < 0:
+            count = 0
+
+        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
+        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
+            raise TypeError('expected string or buffer')
+        sp = new StringPiece(cstring, size)
+        try:
+            while True:
+                # A fresh Match object is created for every search.
+                m = Match(self, self.groups + 1)
+                m.string = string
+                with nogil:
+                    retval = self.re_pattern.Match(
+                            sp[0],
+                            pos,
+                            size,
+                            UNANCHORED,
+                            m.matches,
+                            self.groups + 1)
+                if retval == 0:
+                    break
+
+                # Byte offset at which the whole match (group 0) starts.
+                endpos = m.matches[0].data() - cstring
+                # A match starting where the previous one did moves endpos
+                # one byte forward; past the end of the input the loop stops.
+                if endpos == prevendpos:
+                    endpos += 1
+                    if endpos > size:
+                        break
+                prevendpos = endpos
+                # Copy the unmatched text preceding the match.
+                result.extend(sp.data()[pos:endpos])
+                pos = endpos + m.matches[0].length()
+
+                m.encoded = encoded
+                m.nmatches = self.groups + 1
+                m._init_groups()
+                # presumably appends the expanded template to ``result``;
+                # verify against Match._expand.
+                m._expand(repl, result)
+
+                num_repl[0] += 1
+                if count and num_repl[0] >= count:
+                    break
+            # Append whatever follows the last processed match.
+            result.extend(sp.data()[pos:])
+        finally:
+            del sp
+            release_cstring(&buf)
+        return result.decode('utf8') if encoded else bytes(result)
+
+    def scanner(self, arg):
+        """Return a scanner object built by the standard ``re`` module.
+
+        RE2 is not involved: ``self.pattern`` is recompiled with
+        ``re.compile`` (``self.flags`` is not forwarded) and ``arg`` is
+        handed to the ``scanner`` method of that compiled pattern."""
+        stdlib_pattern = re.compile(self.pattern)
+        return stdlib_pattern.scanner(arg)
+
+    def _dump_pattern(self):
+        """Return the pattern text reported by the underlying RE2 object.
+
+        The result is decoded from UTF-8 when ``self.encoded`` is set and
+        returned as bytes otherwise."""
+        cdef cpp_string stored = self.re_pattern.pattern()
+        raw = cpp_to_bytes(stored)
+        return raw.decode('utf8') if self.encoded else raw
+
+    def __repr__(self):
+        """Return ``re2.compile(...)`` source text for this pattern.
+
+        The flags argument is shown only when it is non-zero."""
+        if self.flags != 0:
+            return 're2.compile(%r, %r)' % (self.pattern, self.flags)
+        return 're2.compile(%r)' % self.pattern
+
+    def __reduce__(self):
+        """Pickle support: the pattern is rebuilt by calling
+        ``compile(pattern, flags)``."""
+        ctor_args = (self.pattern, self.flags)
+        return (compile, ctor_args)
+
+    def __dealloc__(self):
+        # Delete the C++ object that ``re_pattern`` points to when this
+        # Pattern instance is deallocated.
+        del self.re_pattern
+
+
+class PythonRePattern:
+    """A wrapper for re.Pattern to support the extra methods defined by re2
+    (contains, count).
+
+    Every other method forwards to the wrapped ``re`` pattern object.
+    ``pattern``, ``flags``, ``groupindex`` and ``groups`` are exposed as
+    plain attributes."""
+
+    def __init__(self, pattern, flags=None):
+        """Compile ``pattern`` with the standard ``re`` module.
+
+        :param pattern: pattern source (str or bytes).
+        :param flags: ``re`` flags; ``None`` (the default) means no flags."""
+        # re.compile() raises TypeError when handed None as flags, so the
+        # default is normalised to 0 before compiling.
+        if flags is None:
+            flags = 0
+        self._pattern = re.compile(pattern, flags)
+        self.pattern = pattern
+        self.flags = flags
+        self.groupindex = self._pattern.groupindex
+        self.groups = self._pattern.groups
+
+    def contains(self, string):
+        """Return True if the pattern matches anywhere in ``string``."""
+        return bool(self._pattern.search(string))
+
+    def count(self, string, pos=0, endpos=9223372036854775807):
+        """Return the number of non-overlapping matches in ``string``."""
+        return len(self._pattern.findall(string, pos, endpos))
+
+    def findall(self, string, pos=0, endpos=9223372036854775807):
+        """Forward to ``re.Pattern.findall``."""
+        return self._pattern.findall(string, pos, endpos)
+
+    def finditer(self, string, pos=0, endpos=9223372036854775807):
+        """Forward to ``re.Pattern.finditer``."""
+        return self._pattern.finditer(string, pos, endpos)
+
+    def fullmatch(self, string, pos=0, endpos=9223372036854775807):
+        """Forward to ``re.Pattern.fullmatch``."""
+        return self._pattern.fullmatch(string, pos, endpos)
+
+    def match(self, string, pos=0, endpos=9223372036854775807):
+        """Forward to ``re.Pattern.match``."""
+        return self._pattern.match(string, pos, endpos)
+
+    def scanner(self, string, pos=0, endpos=9223372036854775807):
+        """Forward to ``re.Pattern.scanner``."""
+        return self._pattern.scanner(string, pos, endpos)
+
+    def search(self, string, pos=0, endpos=9223372036854775807):
+        """Forward to ``re.Pattern.search``."""
+        return self._pattern.search(string, pos, endpos)
+
+    def split(self, string, maxsplit=0):
+        """Forward to ``re.Pattern.split``."""
+        return self._pattern.split(string, maxsplit)
+
+    def sub(self, repl, string, count=0):
+        """Forward to ``re.Pattern.sub``."""
+        return self._pattern.sub(repl, string, count)
+
+    def subn(self, repl, string, count=0):
+        """Forward to ``re.Pattern.subn``."""
+        return self._pattern.subn(repl, string, count)
+
+    def __repr__(self):
+        """Return the repr of the wrapped ``re`` pattern."""
+        return repr(self._pattern)
+
+    def __reduce__(self):
+        """Pickle support: rebuild the wrapper from ``(pattern, flags)``.
+
+        The first element must be a callable; the class is used for that
+        (an instance of this class is not callable)."""
+        return (type(self), (self.pattern, self.flags))
diff --git a/contrib/python/pyre2/py3/src/re2.pyx b/contrib/python/pyre2/py3/src/re2.pyx
new file mode 100644
index 0000000000..c48101426f
--- /dev/null
+++ b/contrib/python/pyre2/py3/src/re2.pyx
@@ -0,0 +1,458 @@
+# cython: infer_types(False)
+r"""Regular expressions using Google's RE2 engine.
+
+Compared to Python's ``re``, the RE2 engine compiles regular expressions to
+deterministic finite automata, which guarantees linear-time behavior.
+
+Intended as a drop-in replacement for ``re``. Unicode is supported by encoding
+to UTF-8, and bytes strings are treated as UTF-8 when the UNICODE flag is given.
+For best performance, work with UTF-8 encoded bytes strings.
+
+Regular expressions that are not compatible with RE2 are processed with
+fallback to ``re``. Examples of features not supported by RE2:
+
+ - lookahead assertions ``(?!...)``
+ - backreferences (``\\n`` in search pattern)
+ - \W and \S not supported inside character classes
+
+On the other hand, unicode character classes are supported (e.g., ``\p{Greek}``).
+Syntax reference: https://github.com/google/re2/wiki/Syntax
+
+What follows is a reference for the regular expression syntax supported by this
+module (i.e., without requiring fallback to `re`).
+
+Regular expressions can contain both special and ordinary characters.
+Most ordinary characters, like "A", "a", or "0", are the simplest
+regular expressions; they simply match themselves.
+
+The special characters are::
+
+ "." Matches any character except a newline.
+ "^" Matches the start of the string.
+ "$" Matches the end of the string or just before the newline at
+ the end of the string.
+ "*" Matches 0 or more (greedy) repetitions of the preceding RE.
+ Greedy means that it will match as many repetitions as possible.
+ "+" Matches 1 or more (greedy) repetitions of the preceding RE.
+ "?" Matches 0 or 1 (greedy) of the preceding RE.
+ *?,+?,?? Non-greedy versions of the previous three special characters.
+ {m,n} Matches from m to n repetitions of the preceding RE.
+ {m,n}? Non-greedy version of the above.
+ "\\" Either escapes special characters or signals a special sequence.
+ [] Indicates a set of characters.
+ A "^" as the first character indicates a complementing set.
+ "|" A|B, creates an RE that will match either A or B.
+ (...) Matches the RE inside the parentheses.
+ The contents can be retrieved or matched later in the string.
+ (?:...) Non-grouping version of regular parentheses.
+ (?imsux) Set the I, M, S, U, or X flag for the RE (see below).
+
+The special sequences consist of "\\" and a character from the list
+below. If the ordinary character is not on the list, then the
+resulting RE will match the second character::
+
+ \A Matches only at the start of the string.
+ \Z Matches only at the end of the string.
+ \b Matches the empty string, but only at the start or end of a word.
+ \B Matches the empty string, but not at the start or end of a word.
+ \d Matches any decimal digit.
+ \D Matches any non-digit character.
+ \s Matches any whitespace character.
+ \S Matches any non-whitespace character.
+ \w Matches any alphanumeric character.
+ \W Matches the complement of \w.
+ \\ Matches a literal backslash.
+ \pN Unicode character class (one-letter name)
+ \p{Greek} Unicode character class
+ \PN negated Unicode character class (one-letter name)
+ \P{Greek} negated Unicode character class
+
+This module exports the following functions::
+
+ count Count all occurrences of a pattern in a string.
+ match Match a regular expression pattern to the beginning of a string.
+ fullmatch Match a regular expression pattern to all of a string.
+ search Search a string for a pattern and return Match object.
+ contains Same as search, but only return bool.
+ sub Substitute occurrences of a pattern found in a string.
+ subn Same as sub, but also return the number of substitutions made.
+ split Split a string by the occurrences of a pattern.
+ findall Find all occurrences of a pattern in a string.
+ finditer Return an iterator yielding a match object for each match.
+ compile Compile a pattern into a RegexObject.
+ purge Clear the regular expression cache.
+ escape Backslash all non-alphanumerics in a string.
+
+Some of the functions in this module take flags as optional parameters::
+
+ A ASCII Make \w, \W, \b, \B, \d, \D match the corresponding ASCII
+ character categories (rather than the whole Unicode
+ categories, which is the default).
+ I IGNORECASE Perform case-insensitive matching.
+ M MULTILINE "^" matches the beginning of lines (after a newline)
+ as well as the string.
+ "$" matches the end of lines (before a newline) as well
+ as the end of the string.
+ S DOTALL "." matches any character at all, including the newline.
+ X VERBOSE Ignore whitespace and comments for nicer looking RE's.
+ U UNICODE Enable Unicode character classes and make \w, \W, \b, \B,
+ Unicode-aware (default for unicode patterns).
+
+This module also defines an exception 'RegexError' (also available under the
+alias 'error').
+
+"""
+
+include "includes.pxi"
+
+import re
+import sys
+import warnings
+from re import error as RegexError
+
+# ``error`` and ``RegexError`` both refer to the standard library's re.error.
+error = re.error
+
+# Import re flags to be compatible.
+I, M, S, U, X, L = re.I, re.M, re.S, re.U, re.X, re.L
+IGNORECASE = re.IGNORECASE
+MULTILINE = re.MULTILINE
+DOTALL = re.DOTALL
+UNICODE = re.UNICODE
+VERBOSE = re.VERBOSE
+LOCALE = re.LOCALE
+DEBUG = re.DEBUG
+ASCII = 256  # Python 3
+
+# Levels accepted by set_fallback_notification(); presumably they select
+# whether a fallback to ``re`` is silent, warned about, or turned into an
+# exception -- verify against the compile code.
+FALLBACK_QUIETLY = 0
+FALLBACK_WARNING = 1
+FALLBACK_EXCEPTION = 2
+
+# VERSION_HEX packs VERSION one byte per component (0x17 == 23).
+VERSION = (0, 2, 23)
+VERSION_HEX = 0x000217
+
+# C-level integer copies of the flag values above.
+cdef int _I = I, _M = M, _S = S, _U = U, _X = X, _L = L
+# Current notification level; updated by set_fallback_notification().
+cdef int current_notification = FALLBACK_QUIETLY
+cdef bint PY2 = PY_MAJOR_VERSION == 2
+
+# Type of compiled re object from Python stdlib
+SREPattern = type(re.compile(''))
+
+# _cache is the compiled-pattern cache consulted by compile(), keyed by
+# (type(pattern), pattern, flags).  _cache_repl is only cleared by purge()
+# in this part of the file.
+_cache = {}
+_cache_repl = {}
+
+# compile() drops one cached entry before inserting once _cache holds this
+# many patterns.
+_MAXCACHE = 100
+
+
+include "compile.pxi"
+include "pattern.pxi"
+include "match.pxi"
+
+
+def purge():
+    """Empty both module-level caches (``_cache`` and ``_cache_repl``)."""
+    for cache in (_cache, _cache_repl):
+        cache.clear()
+
+
+def search(pattern, string, int flags=0):
+    """Compile ``pattern`` with ``flags`` and scan ``string`` for the first
+    match; return a ``Match`` object, or ``None`` when nothing matches."""
+    compiled = compile(pattern, flags)
+    return compiled.search(string)
+
+
+def match(pattern, string, int flags=0):
+    """Compile ``pattern`` with ``flags`` and apply it at the start of
+    ``string``; return a ``Match`` object, or ``None`` when it does not
+    match there."""
+    compiled = compile(pattern, flags)
+    return compiled.match(string)
+
+
+def fullmatch(pattern, string, int flags=0):
+    """Compile ``pattern`` with ``flags`` and apply it to the whole of
+    ``string``; return a ``Match`` object, or ``None`` when the entire
+    string does not match."""
+    compiled = compile(pattern, flags)
+    return compiled.fullmatch(string)
+
+
+def contains(pattern, string, int flags=0):
+    """Compile ``pattern`` with ``flags`` and report, as True or False,
+    whether it matches anywhere in ``string``."""
+    compiled = compile(pattern, flags)
+    return compiled.contains(string)
+
+
+def finditer(pattern, string, int flags=0):
+    """Return an iterator over all non-overlapping matches in ``string``.
+
+    Each item produced is a ``Match`` object.  Empty matches are part of
+    the result."""
+    compiled = compile(pattern, flags)
+    return compiled.finditer(string)
+
+
+def findall(pattern, string, int flags=0):
+    """Return a list of all non-overlapping matches in ``string``.
+
+    Each match is represented as a string, or as a tuple when the pattern
+    has two or more groups.  Empty matches are part of the result."""
+    compiled = compile(pattern, flags)
+    return compiled.findall(string)
+
+
+def count(pattern, string, int flags=0):
+    """Return how many non-overlapping matches ``string`` contains.
+
+    Empty matches are counted as well."""
+    compiled = compile(pattern, flags)
+    return compiled.count(string)
+
+
+def split(pattern, string, int maxsplit=0, int flags=0):
+    """Split ``string`` at the occurrences of ``pattern`` and return the
+    list of resulting substrings; ``maxsplit`` is passed on to the compiled
+    pattern's ``split`` method."""
+    compiled = compile(pattern, flags)
+    return compiled.split(string, maxsplit)
+
+
+def sub(pattern, repl, string, int count=0, int flags=0):
+    """Return ``string`` with the leftmost non-overlapping occurrences of
+    ``pattern`` replaced by ``repl``.
+
+    ``repl`` is either a string or a callable.  Backslash escapes in a
+    string replacement are processed; a callable receives the ``Match``
+    object and must return the replacement string to use."""
+    compiled = compile(pattern, flags)
+    return compiled.sub(repl, string, count)
+
+
+def subn(pattern, repl, string, int count=0, int flags=0):
+    """Return the 2-tuple ``(new_string, number)``.
+
+    ``new_string`` is ``string`` with the leftmost non-overlapping
+    occurrences of ``pattern`` replaced by ``repl``; ``number`` is how many
+    substitutions were made.  ``repl`` is either a string or a callable.
+    Backslash escapes in a string replacement are processed; a callable
+    receives the ``Match`` object and must return the replacement string
+    to use."""
+    compiled = compile(pattern, flags)
+    return compiled.subn(repl, string, count)
+
+
+def escape(pattern):
+    """Escape all non-alphanumeric characters in pattern.
+
+    Only characters with an ordinal below 0x80 are touched: each one that
+    is not alphanumeric gets a backslash in front of it, and a NUL
+    character is written as ``\\000``.  The result has the same type
+    (unicode or bytes) as ``pattern``."""
+    cdef bint uni = isinstance(pattern, unicode)
+    cdef list pieces
+    if PY2 or uni:
+        pieces = list(pattern)
+    else:
+        # Iterating bytes on Python 3 yields ints; rebuild 1-byte strings.
+        pieces = [bytes([byte]) for byte in pattern]
+    for idx, ch in enumerate(pieces):
+        if ord(ch) >= 0x80 or ch.isalnum():
+            continue
+        if uni:
+            pieces[idx] = u'\\000' if ch == u'\000' else u"\\" + ch
+        else:
+            pieces[idx] = b'\\000' if ch == b'\000' else b'\\' + ch
+    return u''.join(pieces) if uni else b''.join(pieces)
+
+
+class BackreferencesException(Exception):
+    """Signals that a search pattern contains backreferences."""
+
+
+class CharClassProblemException(Exception):
+    """Signals that a search pattern contains an unsupported character
+    class."""
+
+
+def set_fallback_notification(level):
+    """Set the fallback notification to a level; one of:
+        FALLBACK_QUIETLY
+        FALLBACK_WARNING
+        FALLBACK_EXCEPTION
+
+    ``level`` is converted with ``int()``; a value outside 0..2 raises
+    ``ValueError`` and leaves the current level untouched."""
+    global current_notification
+    requested = int(level)
+    if not (0 <= requested <= 2):
+        raise ValueError("This function expects a valid notification level.")
+    current_notification = requested
+
+
+cdef bint ishex(unsigned char c):
+    """Report whether ``c`` is a hexadecimal digit, ``[0-9a-fA-F]``."""
+    if b'0' <= c <= b'9':
+        return True
+    if b'a' <= c <= b'f':
+        return True
+    return b'A' <= c <= b'F'
+
+
+cdef bint isoct(unsigned char c):
+    """Report whether ``c`` is an octal digit, ``[0-7]``."""
+    cdef bint in_range = b'0' <= c <= b'7'
+    return in_range
+
+
+cdef bint isdigit(unsigned char c):
+    """Report whether ``c`` is a decimal digit, ``[0-9]``."""
+    cdef bint in_range = b'0' <= c <= b'9'
+    return in_range
+
+
+cdef bint isident(unsigned char c):
+    """Report whether ``c`` is an identifier character, ``[a-zA-Z0-9_]``."""
+    if b'a' <= c <= b'z':
+        return True
+    if b'A' <= c <= b'Z':
+        return True
+    if b'0' <= c <= b'9':
+        return True
+    return c == b'_'
+
+
+cdef inline bytes cpp_to_bytes(cpp_string input):
+    """Convert from a std::string object to a python string.
+
+    Returns a ``bytes`` object holding exactly ``input.length()`` bytes
+    taken from ``input.data()``."""
+    # By taking the slice we go to the right size,
+    # despite spurious or missing null characters.
+    return input.data()[:input.length()]
+
+
+cdef inline unicode cpp_to_unicode(cpp_string input):
+    """Convert a std::string object to a unicode string.
+
+    The ``input.length()`` bytes at ``input.data()`` are decoded as UTF-8
+    with the 'strict' error handler."""
+    return cpython.unicode.PyUnicode_DecodeUTF8(
+            input.data(), input.length(), 'strict')
+
+
+cdef inline unicode char_to_unicode(const char * input, int length):
+    """Convert a C string to a unicode string.
+
+    ``length`` bytes starting at ``input`` are decoded as UTF-8 with the
+    'strict' error handler."""
+    return cpython.unicode.PyUnicode_DecodeUTF8(input, length, 'strict')
+
+
+cdef inline unicode_to_bytes(object pystring, int * encoded,
+        int checkotherencoding):
+    """Return ``pystring`` as UTF-8 bytes when it is a unicode string.
+
+    Bytes strings and buffers are returned unchanged.  ``encoded[0]`` is
+    set to 0 for those, to 1 for a unicode string whose UTF-8 form has the
+    same length as the original (it can be treated as ASCII), and to 2
+    when the UTF-8 form is longer (multibyte characters are present).
+
+    Outside Python 2, ``checkotherencoding`` requests a consistency check:
+    a value above 0 raises ``TypeError`` when ``pystring`` is not unicode,
+    and a value of 0 raises ``TypeError`` when it is unicode.  Negative
+    values disable the check."""
+    if not cpython.unicode.PyUnicode_Check(pystring):
+        encoded[0] = 0
+    else:
+        utf8 = pystring.encode('utf8')
+        encoded[0] = 1 if len(pystring) == len(utf8) else 2
+        pystring = utf8
+    if not PY2:
+        if checkotherencoding > 0 and not encoded[0]:
+            raise TypeError("can't use a string pattern on a bytes-like object")
+        if checkotherencoding == 0 and encoded[0]:
+            raise TypeError("can't use a bytes pattern on a string-like object")
+    return pystring
+
+
+cdef inline int pystring_to_cstring(
+        object pystring, char ** cstring, Py_ssize_t * size,
+        Py_buffer * buf):
+    """Obtain a raw pointer and length from bytes/buffer object ``pystring``.
+
+    ``cstring[0]`` and ``size[0]`` are first reset to NULL and 0.  When
+    ``pystring`` does not pass ``PyObject_CheckBuffer`` the function
+    returns -1.  Otherwise the result of ``PyObject_GetBuffer`` is
+    returned, and when that is 0 ``cstring``, ``size`` and ``buf`` describe
+    the acquired buffer."""
+    cdef int status
+    cstring[0] = NULL
+    size[0] = 0
+    if PyObject_CheckBuffer(pystring) != 1:  # no new-style Buffer interface
+        return -1
+    status = PyObject_GetBuffer(pystring, buf, PyBUF_SIMPLE)
+    if status == 0:
+        cstring[0] = <char *>buf.buf
+        size[0] = buf.len
+    return status
+
+
+cdef inline void release_cstring(Py_buffer *buf):
+    """Hand ``buf`` to ``PyBuffer_Release``; does nothing when PY2 is set."""
+    if PY2:
+        return
+    PyBuffer_Release(buf)
+
+
+cdef utf8indices(char * cstring, int size, int *pos, int *endpos):
+    """Convert unicode indices ``pos`` and ``endpos`` to UTF-8 indices.
+
+    ``cstring`` is walked one character at a time; the byte length of each
+    character is derived from its lead byte alone, with no validation of
+    the following bytes.
+
+    On return ``pos[0]`` holds the byte offset of the requested character
+    index, or its original value when that index is never reached.
+    ``endpos[0]`` holds the byte offset of its character index, or -1 when
+    it was -1 on entry or its index is never reached."""
+    cdef unsigned char * data = <unsigned char *>cstring
+    cdef int newpos = pos[0], newendpos = -1
+    # cpos counts bytes consumed, upos counts unicode code units consumed.
+    cdef int cpos = 0, upos = 0
+    while cpos < size:
+        if data[cpos] < 0x80:
+            cpos += 1
+            upos += 1
+        elif data[cpos] < 0xe0:
+            cpos += 2
+            upos += 1
+        elif data[cpos] < 0xf0:
+            cpos += 3
+            upos += 1
+        else:
+            cpos += 4
+            upos += 1
+            # wide unicode chars get 2 unichars when Python <3.3 is compiled
+            # with --enable-unicode=ucs2
+            emit_if_narrow_unicode()
+            upos += 1
+            emit_endif()
+
+        # The comparisons run after advancing, so index 0 is never matched
+        # here; pos[0] == 0 keeps its initial value through newpos.
+        if upos == pos[0]:
+            newpos = cpos
+            if endpos[0] == -1:
+                break
+        elif upos == endpos[0]:
+            newendpos = cpos
+            break
+    pos[0] = newpos
+    endpos[0] = newendpos
+
+
+cdef void unicodeindices(map[int, int] &positions,
+        char * cstring, int size, int * cpos, int * upos):
+    """Convert UTF-8 byte indices to unicode indices.
+
+    The keys of ``positions`` are byte offsets into ``cstring``; the value
+    of each key that is reached is overwritten with the corresponding
+    unicode index.  A leading key of -1 maps to -1.
+
+    ``cpos[0]`` and ``upos[0]`` give the byte and unicode offsets at which
+    the scan starts and are advanced in place as the scan proceeds.  The
+    byte length of each character is derived from its lead byte alone."""
+    cdef unsigned char * s = <unsigned char *>cstring
+    # NOTE(review): the iterator is dereferenced below without checking
+    # for an empty map -- presumably callers always pass at least one
+    # entry; confirm.
+    cdef map[int, int].iterator it = positions.begin()
+
+    if dereference(it).first == -1:
+        dereference(it).second = -1
+        postincrement(it)
+        if it == positions.end():
+            return
+    # A key equal to the starting byte offset is resolved before scanning.
+    if dereference(it).first == cpos[0]:
+        dereference(it).second = upos[0]
+        postincrement(it)
+        if it == positions.end():
+            return
+
+    while cpos[0] < size:
+        if s[cpos[0]] < 0x80:
+            cpos[0] += 1
+            upos[0] += 1
+        elif s[cpos[0]] < 0xe0:
+            cpos[0] += 2
+            upos[0] += 1
+        elif s[cpos[0]] < 0xf0:
+            cpos[0] += 3
+            upos[0] += 1
+        else:
+            cpos[0] += 4
+            upos[0] += 1
+            # wide unicode chars get 2 unichars when Python <3.3 is compiled
+            # with --enable-unicode=ucs2
+            emit_if_narrow_unicode()
+            upos[0] += 1
+            emit_endif()
+
+        # Record the unicode index once the current key's byte offset is
+        # reached, then move on to the next key; stop after the last one.
+        if dereference(it).first == cpos[0]:
+            dereference(it).second = upos[0]
+            postincrement(it)
+            if it == positions.end():
+                break
+
+
+# Names made available by ``from re2 import *``.  ``contains`` and ``ASCII``
+# are defined in this module and described in its docstring, so they are
+# listed alongside the other functions and flags.
+__all__ = [
+        # exceptions
+        'BackreferencesException', 'CharClassProblemException',
+        'RegexError', 'error',
+        # constants
+        'FALLBACK_EXCEPTION', 'FALLBACK_QUIETLY', 'FALLBACK_WARNING', 'DEBUG',
+        'S', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE',
+        'U', 'UNICODE', 'X', 'VERBOSE', 'ASCII', 'VERSION', 'VERSION_HEX',
+        # classes
+        'Match', 'Pattern', 'SREPattern',
+        # functions
+        'compile', 'contains', 'count', 'escape', 'findall', 'finditer',
+        'fullmatch', 'match', 'purge', 'search', 'split', 'sub', 'subn',
+        'set_fallback_notification',
+        ]