diff options
author | robot-piglet <robot-piglet@yandex-team.com> | 2023-12-02 01:45:21 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2023-12-02 02:42:50 +0300 |
commit | 9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch) | |
tree | 9f88a486917d371d099cd712efd91b4c122d209d /contrib/python/pyre2/py3/src | |
parent | 32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff) | |
download | ydb-9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c.tar.gz |
Intermediate changes
Diffstat (limited to 'contrib/python/pyre2/py3/src')
-rw-r--r-- | contrib/python/pyre2/py3/src/_re2macros.h | 13 | ||||
-rw-r--r-- | contrib/python/pyre2/py3/src/compile.pxi | 234 | ||||
-rw-r--r-- | contrib/python/pyre2/py3/src/includes.pxi | 109 | ||||
-rw-r--r-- | contrib/python/pyre2/py3/src/match.pxi | 280 | ||||
-rw-r--r-- | contrib/python/pyre2/py3/src/pattern.pxi | 650 | ||||
-rw-r--r-- | contrib/python/pyre2/py3/src/re2.pyx | 458 |
6 files changed, 1744 insertions, 0 deletions
diff --git a/contrib/python/pyre2/py3/src/_re2macros.h b/contrib/python/pyre2/py3/src/_re2macros.h new file mode 100644 index 0000000000..b9ac82af6b --- /dev/null +++ b/contrib/python/pyre2/py3/src/_re2macros.h @@ -0,0 +1,13 @@ +#ifndef __RE2MACROS_H +#define __RE2MACROS_H + +#include <stdio.h> +#include "re2/stringpiece.h" + +static inline re2::StringPiece * new_StringPiece_array(int n) +{ + re2::StringPiece * sp = new re2::StringPiece[n]; + return sp; +} + +#endif diff --git a/contrib/python/pyre2/py3/src/compile.pxi b/contrib/python/pyre2/py3/src/compile.pxi new file mode 100644 index 0000000000..887a2778cd --- /dev/null +++ b/contrib/python/pyre2/py3/src/compile.pxi @@ -0,0 +1,234 @@ + +def compile(pattern, int flags=0, int max_mem=8388608): + cachekey = (type(pattern), pattern, flags) + if cachekey in _cache: + return _cache[cachekey] + p = _compile(pattern, flags, max_mem) + + if len(_cache) >= _MAXCACHE: + _cache.popitem() + _cache[cachekey] = p + return p + + +def _compile(object pattern, int flags=0, int max_mem=8388608): + """Compile a regular expression pattern, returning a pattern object.""" + def fallback(pattern, flags, error_msg): + """Raise error, warn, or simply return fallback from re module.""" + if current_notification == FALLBACK_EXCEPTION: + raise RegexError(error_msg) + elif current_notification == FALLBACK_WARNING: + warnings.warn("WARNING: Using re module. 
Reason: %s" % error_msg) + try: + result = PythonRePattern(pattern, flags) + except re.error as err: + raise RegexError(*err.args) + return result + + cdef StringPiece * s + cdef Options opts + cdef int error_code + cdef int encoded = 0 + cdef object original_pattern + + if isinstance(pattern, (Pattern, SREPattern)): + if flags: + raise ValueError( + 'Cannot process flags argument with a compiled pattern') + return pattern + + original_pattern = pattern + if flags & _L: + return fallback(original_pattern, flags, "re.LOCALE not supported") + pattern = unicode_to_bytes(pattern, &encoded, -1) + newflags = flags + if not PY2: + if not encoded and flags & _U: # re.UNICODE + pass # can use UNICODE with bytes pattern, but assumes valid UTF-8 + # raise ValueError("can't use UNICODE flag with a bytes pattern") + elif encoded and not (flags & ASCII): # re.ASCII (not in Python 2) + newflags = flags | _U # re.UNICODE + elif encoded and flags & ASCII: + newflags = flags & ~_U # re.UNICODE + try: + pattern = _prepare_pattern(pattern, newflags) + except BackreferencesException: + return fallback(original_pattern, flags, "Backreferences not supported") + except CharClassProblemException: + return fallback(original_pattern, flags, + "\W and \S not supported inside character classes") + + # Set the options given the flags above. + if flags & _I: + opts.set_case_sensitive(0); + + opts.set_max_mem(max_mem) + opts.set_log_errors(0) + if flags & _U or encoded: + opts.set_encoding(EncodingUTF8) + else: # re.UNICODE flag not passed, and pattern is bytes, + # so allow matching of arbitrary byte sequences. + opts.set_encoding(EncodingLatin1) + + s = new StringPiece(<char *><bytes>pattern, len(pattern)) + + cdef RE2 *re_pattern + with nogil: + re_pattern = new RE2(s[0], opts) + + if not re_pattern.ok(): + # Something went wrong with the compilation. 
+ del s + error_msg = cpp_to_unicode(re_pattern.error()) + error_code = re_pattern.error_code() + del re_pattern + if current_notification == FALLBACK_EXCEPTION: + # Raise an exception regardless of the type of error. + raise RegexError(error_msg) + elif error_code not in (ErrorBadPerlOp, ErrorRepeatSize, + # ErrorBadEscape, + ErrorPatternTooLarge): + # Raise an error because these will not be fixed by using the + # ``re`` module. + raise RegexError(error_msg) + elif current_notification == FALLBACK_WARNING: + warnings.warn("WARNING: Using re module. Reason: %s" % error_msg) + return PythonRePattern(original_pattern, flags) + + cdef Pattern pypattern = Pattern() + cdef map[cpp_string, int] named_groups = re_pattern.NamedCapturingGroups() + pypattern.pattern = original_pattern + pypattern.re_pattern = re_pattern + pypattern.groups = re_pattern.NumberOfCapturingGroups() + pypattern.encoded = encoded + pypattern.flags = flags + pypattern.groupindex = {} + for it in named_groups: + pypattern.groupindex[cpp_to_unicode(it.first)] = it.second + + if flags & DEBUG: + print(repr(pypattern._dump_pattern())) + del s + return pypattern + + +def _prepare_pattern(bytes pattern, int flags): + """Translate pattern to RE2 syntax.""" + cdef bytearray result = bytearray() + cdef unsigned char * cstring = pattern + cdef unsigned char this, that + cdef int size = len(pattern) + cdef int n = 0 + + if flags & (_S | _M): + result.extend(b'(?') + if flags & _S: + result.extend(b's') + if flags & _M: + result.extend(b'm') + result.extend(b')') + while n < size: + this = cstring[n] + if flags & _X: + if this in b' \t\n\r\f\v': + n += 1 + continue + elif this == b'#': + while True: + n += 1 + if n >= size: + break + this = cstring[n] + if this == b'\n': + break + n += 1 + continue + + if this != b'[' and this != b'\\': + result.append(this) + n += 1 + continue + elif this == b'[': + result.append(this) + while True: + n += 1 + if n >= size: + raise RegexError("unexpected end of regular 
expression") + this = cstring[n] + if this == b']': + result.append(this) + break + elif this == b'\\': + n += 1 + that = cstring[n] + if that == b'b': + result.extend(br'\010') + elif flags & _U: + if that == b'd': + result.extend(br'\p{Nd}') + elif that == b'w': + result.extend(br'_\p{L}\p{Nd}') + elif that == b's': + result.extend(br'\s\p{Z}') + elif that == b'D': + result.extend(br'\P{Nd}') + elif that == b'W': + # Since \w and \s are made out of several character + # groups, I don't see a way to convert their + # complements into a group without rewriting the + # whole expression, which seems too complicated. + raise CharClassProblemException() + elif that == b'S': + raise CharClassProblemException() + else: + result.append(this) + result.append(that) + else: + result.append(this) + result.append(that) + else: + result.append(this) + elif this == b'\\': + n += 1 + that = cstring[n] + if b'8' <= that <= b'9': + raise BackreferencesException() + elif isoct(that): + if (n + 2 < size and isoct(cstring[n + 1]) + and isoct(cstring[n + 2])): + # all clear, this is an octal escape + result.extend(cstring[n - 1:n + 3]) + n += 2 + else: + raise BackreferencesException() + elif that == b'x': + if (n + 2 < size and ishex(cstring[n + 1]) + and ishex(cstring[n + 2])): + # hex escape + result.extend(cstring[n - 1:n + 3]) + n += 2 + else: + raise BackreferencesException() + elif that == b'Z': + result.extend(b'\\z') + elif flags & _U: + if that == b'd': + result.extend(br'\p{Nd}') + elif that == b'w': + result.extend(br'[_\p{L}\p{Nd}]') + elif that == b's': + result.extend(br'[\s\p{Z}]') + elif that == b'D': + result.extend(br'[^\p{Nd}]') + elif that == b'W': + result.extend(br'[^_\p{L}\p{Nd}]') + elif that == b'S': + result.extend(br'[^\s\p{Z}]') + else: + result.append(this) + result.append(that) + else: + result.append(this) + result.append(that) + n += 1 + return bytes(result) diff --git a/contrib/python/pyre2/py3/src/includes.pxi 
b/contrib/python/pyre2/py3/src/includes.pxi new file mode 100644 index 0000000000..8c35b6d4b2 --- /dev/null +++ b/contrib/python/pyre2/py3/src/includes.pxi @@ -0,0 +1,109 @@ +cimport cpython.unicode +from libcpp.map cimport map +from libcpp.string cimport string as cpp_string +from cython.operator cimport postincrement, dereference +from cpython.buffer cimport Py_buffer, PyBUF_SIMPLE, PyObject_CheckBuffer, \ + PyObject_GetBuffer, PyBuffer_Release +from cpython.version cimport PY_MAJOR_VERSION + + +cdef extern from *: + cdef void emit_if_narrow_unicode "#if !defined(Py_UNICODE_WIDE) && PY_VERSION_HEX < 0x03030000 //" () + cdef void emit_endif "#endif //" () + + +cdef extern from "Python.h": + int PyObject_CheckReadBuffer(object) + int PyObject_AsReadBuffer(object, const void **, Py_ssize_t *) + + +cdef extern from "re2/stringpiece.h" namespace "re2": + cdef cppclass StringPiece: + StringPiece() + StringPiece(const char *) + StringPiece(const char *, int) + const char * data() + int copy(char * buf, size_t n, size_t pos) + int length() + + +cdef extern from "re2/re2.h" namespace "re2": + cdef enum Anchor: + UNANCHORED "RE2::UNANCHORED" + ANCHOR_START "RE2::ANCHOR_START" + ANCHOR_BOTH "RE2::ANCHOR_BOTH" + + ctypedef Anchor re2_Anchor "RE2::Anchor" + + cdef enum ErrorCode: + NoError "RE2::NoError" + ErrorInternal "RE2::ErrorInternal" + # Parse errors + ErrorBadEscape "RE2::ErrorBadEscape" # bad escape sequence + ErrorBadCharClass "RE2::ErrorBadCharClass" # bad character class + ErrorBadCharRange "RE2::ErrorBadCharRange" # bad character class range + ErrorMissingBracket "RE2::ErrorMissingBracket" # missing closing ] + ErrorMissingParen "RE2::ErrorMissingParen" # missing closing ) + ErrorTrailingBackslash "RE2::ErrorTrailingBackslash" # trailing \ at end of regexp + ErrorRepeatArgument "RE2::ErrorRepeatArgument" # repeat argument missing, e.g. 
"*" + ErrorRepeatSize "RE2::ErrorRepeatSize" # bad repetition argument + ErrorRepeatOp "RE2::ErrorRepeatOp" # bad repetition operator + ErrorBadPerlOp "RE2::ErrorBadPerlOp" # bad perl operator + ErrorBadUTF8 "RE2::ErrorBadUTF8" # invalid UTF-8 in regexp + ErrorBadNamedCapture "RE2::ErrorBadNamedCapture" # bad named capture group + ErrorPatternTooLarge "RE2::ErrorPatternTooLarge" # pattern too large (compile failed) + + cdef enum Encoding: + EncodingUTF8 "RE2::Options::EncodingUTF8" + EncodingLatin1 "RE2::Options::EncodingLatin1" + + ctypedef Encoding re2_Encoding "RE2::Options::Encoding" + + cdef cppclass Options "RE2::Options": + Options() + void set_posix_syntax(int b) + void set_longest_match(int b) + void set_log_errors(int b) + void set_max_mem(int m) + void set_literal(int b) + void set_never_nl(int b) + void set_case_sensitive(int b) + void set_perl_classes(int b) + void set_word_boundary(int b) + void set_one_line(int b) + int case_sensitive() + void set_encoding(re2_Encoding encoding) + + cdef cppclass RE2: + RE2(const StringPiece pattern, Options option) nogil + RE2(const StringPiece pattern) nogil + int Match(const StringPiece text, int startpos, int endpos, + Anchor anchor, StringPiece * match, int nmatch) nogil + int Replace(cpp_string *str, const RE2 pattern, + const StringPiece rewrite) nogil + int GlobalReplace(cpp_string *str, const RE2 pattern, + const StringPiece rewrite) nogil + int NumberOfCapturingGroups() + int ok() + const cpp_string pattern() + cpp_string error() + ErrorCode error_code() + const map[cpp_string, int]& NamedCapturingGroups() + + # hack for static methods + cdef int Replace "RE2::Replace"( + cpp_string *str, const RE2 pattern, + const StringPiece rewrite) nogil + cdef int GlobalReplace "RE2::GlobalReplace"( + cpp_string *str, + const RE2 pattern, + const StringPiece rewrite) nogil + + +cdef extern from "_re2macros.h": + StringPiece * new_StringPiece_array(int) nogil + + +cdef extern from *: + # StringPiece * 
new_StringPiece_array "new re2::StringPiece[n]" (int) nogil + void delete_StringPiece_array "delete[]" (StringPiece *) nogil diff --git a/contrib/python/pyre2/py3/src/match.pxi b/contrib/python/pyre2/py3/src/match.pxi new file mode 100644 index 0000000000..3eaae74b47 --- /dev/null +++ b/contrib/python/pyre2/py3/src/match.pxi @@ -0,0 +1,280 @@ +cdef class Match: + cdef readonly Pattern re + cdef readonly object string + cdef readonly int pos + cdef readonly int endpos + cdef readonly tuple regs + + cdef StringPiece * matches + cdef int encoded + cdef int nmatches + cdef int _lastindex + cdef tuple _groups + cdef dict _named_groups + + property lastindex: + def __get__(self): + return None if self._lastindex < 1 else self._lastindex + + property lastgroup: + def __get__(self): + if self._lastindex < 1: + return None + for name, n in self.re.groupindex.items(): + if n == self._lastindex: + return name + return None + + def __init__(self, Pattern pattern_object, int num_groups): + self._lastindex = -1 + self._groups = None + self.pos = 0 + self.endpos = -1 + self.matches = new_StringPiece_array(num_groups + 1) + self.nmatches = num_groups + self.re = pattern_object + + cdef _init_groups(self): + cdef list groups = [] + cdef int i + cdef const char * last_end = NULL + cdef const char * cur_end = NULL + + for i in range(self.nmatches): + if self.matches[i].data() == NULL: + groups.append(None) + else: + if i > 0: + cur_end = self.matches[i].data() + self.matches[i].length() + + if last_end == NULL: + last_end = cur_end + self._lastindex = i + else: + # The rules for last group are a bit complicated: + # if two groups end at the same point, the earlier one + # is considered last, so we don't switch our selection + # unless the end point has moved. 
+ if cur_end > last_end: + last_end = cur_end + self._lastindex = i + groups.append( + self.matches[i].data()[:self.matches[i].length()]) + self._groups = tuple(groups) + + cdef bytes _group(self, object groupnum): + cdef int idx + if isinstance(groupnum, int): + idx = groupnum + if idx > self.nmatches - 1: + raise IndexError("no such group %d; available groups: %r" + % (idx, list(range(self.nmatches)))) + return self._groups[idx] + groupdict = self._groupdict() + if groupnum not in groupdict: + raise IndexError("no such group %r; available groups: %r" + % (groupnum, list(groupdict))) + return groupdict[groupnum] + + cdef dict _groupdict(self): + if self._named_groups is None: + self._named_groups = {name: self._groups[n] + for name, n in self.re.groupindex.items()} + return self._named_groups + + def groups(self, default=None): + if self.encoded: + return tuple([default if g is None else g.decode('utf8') + for g in self._groups[1:]]) + return tuple([default if g is None else g + for g in self._groups[1:]]) + + def group(self, *args): + if len(args) == 0: + groupnum = 0 + elif len(args) == 1: + groupnum = args[0] + else: # len(args) > 1: + return tuple([self.group(i) for i in args]) + if self.encoded: + result = self._group(groupnum) + return None if result is None else result.decode('utf8') + return self._group(groupnum) + + def groupdict(self): + result = self._groupdict() + if self.encoded: + return {a: None if b is None else b.decode('utf8') + for a, b in result.items()} + return result + + def expand(self, object template): + """Expand a template with groups.""" + cdef bytearray result = bytearray() + if isinstance(template, unicode): + if not PY2 and not self.encoded: + raise ValueError( + 'cannot expand unicode template on bytes pattern') + templ = template.encode('utf8') + else: + if not PY2 and self.encoded: + raise ValueError( + 'cannot expand bytes template on unicode pattern') + templ = bytes(template) + self._expand(templ, result) + return 
result.decode('utf8') if self.encoded else bytes(result) + + cdef _expand(self, bytes templ, bytearray result): + """Expand template by appending to an existing bytearray. + Everything remains UTF-8 encoded.""" + cdef char * cstring + cdef int n = 0, prev = 0, size + + # NB: cstring is used to get single characters, to avoid difference in + # Python 2/3 behavior of bytes objects. + cstring = templ + size = len(templ) + while True: + prev = n + n = templ.find(b'\\', prev) + if n == -1: + result.extend(templ[prev:]) + break + result.extend(templ[prev:n]) + n += 1 + if (n + 2 < size and cstring[n] == b'x' + and ishex(cstring[n + 1]) and ishex(cstring[n + 2])): + # hex char reference \x1f + result.append(int(templ[n + 1:n + 3], base=16) & 255) + n += 3 + elif (n + 2 < size and isoct(cstring[n]) and isoct(cstring[n + 1]) + and isoct(cstring[n + 2])): + # octal char reference \123 + result.append(int(templ[n:n + 3], base=8) & 255) + n += 3 + elif cstring[n] == b'0': + if n + 1 < size and isoct(cstring[n + 1]): + # 2 character octal: \01 + result.append(int(templ[n:n + 2], base=8)) + n += 2 + else: # nul-terminator literal \0 + result.append(b'\0') + n += 1 + elif b'0' <= cstring[n] <= b'9': # numeric group reference + if n + 1 < size and isdigit(cstring[n + 1]): + # 2 digit group ref \12 + groupno = int(templ[n:n + 2]) + n += 2 + else: + # 1 digit group ref \1 + groupno = int(templ[n:n + 1]) + n += 1 + if groupno <= self.re.groups: + groupval = self._group(groupno) + if groupval is not None: + result.extend(groupval) + else: + raise RegexError('invalid group reference.') + elif cstring[n] == b'g': # named group reference + n += 1 + if n >= size or cstring[n] != b'<': + raise RegexError('missing group name') + n += 1 + start = n + while cstring[n] != b'>': + if not isident(cstring[n]): + raise RegexError('bad character in group name') + n += 1 + if n >= size: + raise RegexError('unterminated group name') + if templ[start:n].isdigit(): + name = int(templ[start:n]) + elif 
isdigit(cstring[start]): + raise RegexError('bad character in group name') + else: + name = templ[start:n] + if self.encoded: + name = name.decode('utf8') + groupval = self._group(name) + if groupval is not None: + result.extend(groupval) + n += 1 + else: + if cstring[n] == b'n': + result.append(b'\n') + elif cstring[n] == b'r': + result.append(b'\r') + elif cstring[n] == b't': + result.append(b'\t') + elif cstring[n] == b'v': + result.append(b'\v') + elif cstring[n] == b'f': + result.append(b'\f') + elif cstring[n] == b'a': + result.append(b'\a') + elif cstring[n] == b'b': + result.append(b'\b') + elif cstring[n] == b'\\': + result.append(b'\\') + else: # copy verbatim + result.append(b'\\') + result.append(cstring[n]) + n += 1 + return bytes(result) + + def start(self, group=0): + return self.span(group)[0] + + def end(self, group=0): + return self.span(group)[1] + + def span(self, group=0): + if isinstance(group, int): + if group > len(self.regs): + raise IndexError("no such group %d; available groups: %r" + % (group, list(range(len(self.regs))))) + return self.regs[group] + else: + self._groupdict() + if group not in self.re.groupindex: + raise IndexError("no such group %r; available groups: %r" + % (group, list(self.re.groupindex))) + return self.regs[self.re.groupindex[group]] + + cdef _make_spans(self, char * cstring, int size, int * cpos, int * upos): + cdef int start, end + cdef StringPiece * piece + + spans = [] + for i in range(self.nmatches): + if self.matches[i].data() == NULL: + spans.append((-1, -1)) + else: + piece = &self.matches[i] + if piece.data() == NULL: + return (-1, -1) + start = piece.data() - cstring + end = start + piece.length() + spans.append((start, end)) + + if self.encoded == 2: + spans = self._convert_spans(spans, cstring, size, cpos, upos) + + self.regs = tuple(spans) + + cdef list _convert_spans(self, spans, + char * cstring, int size, int * cpos, int * upos): + cdef map[int, int] positions + cdef int x, y + for x, y in spans: + 
positions[x] = x + positions[y] = y + unicodeindices(positions, cstring, size, cpos, upos) + return [(positions[x], positions[y]) for x, y in spans] + + def __dealloc__(self): + delete_StringPiece_array(self.matches) + + def __repr__(self): + return '<re2.Match object; span=%r, match=%r>' % ( + self.span(), self.group()) diff --git a/contrib/python/pyre2/py3/src/pattern.pxi b/contrib/python/pyre2/py3/src/pattern.pxi new file mode 100644 index 0000000000..b8439d2007 --- /dev/null +++ b/contrib/python/pyre2/py3/src/pattern.pxi @@ -0,0 +1,650 @@ +cdef class Pattern: + cdef readonly object pattern # original pattern in Python format + cdef readonly int flags + cdef readonly int groups # number of groups + cdef readonly dict groupindex # name => group number + cdef object __weakref__ + + cdef bint encoded # True if this was originally a Unicode pattern + cdef RE2 * re_pattern + + def search(self, object string, int pos=0, int endpos=-1): + """Scan through string looking for a match, and return a corresponding + Match instance. Return None if no position in the string matches.""" + return self._search(string, pos, endpos, UNANCHORED) + + def match(self, object string, int pos=0, int endpos=-1): + """Matches zero or more characters at the beginning of the string.""" + return self._search(string, pos, endpos, ANCHOR_START) + + def fullmatch(self, object string, int pos=0, int endpos=-1): + """"fullmatch(string[, pos[, endpos]]) --> Match object or None." + + Matches the entire string.""" + return self._search(string, pos, endpos, ANCHOR_BOTH) + + cdef _search(self, object string, int pos, int endpos, + re2_Anchor anchoring): + """Scan through string looking for a match, and return a corresponding + Match instance. 
Return None if no position in the string matches.""" + cdef char * cstring + cdef Py_ssize_t size + cdef Py_buffer buf + cdef int retval + cdef int encoded = 0 + cdef StringPiece * sp + cdef Match m = Match(self, self.groups + 1) + cdef int cpos = 0, upos = pos + + if 0 <= endpos <= pos: + return None + + bytestr = unicode_to_bytes(string, &encoded, self.encoded) + if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1: + raise TypeError('expected string or buffer') + try: + if encoded == 2 and (pos or endpos != -1): + utf8indices(cstring, size, &pos, &endpos) + cpos = pos + if pos > size: + return None + if 0 <= endpos < size: + size = endpos + + sp = new StringPiece(cstring, size) + with nogil: + retval = self.re_pattern.Match( + sp[0], + pos, + size, + anchoring, + m.matches, + self.groups + 1) + del sp + if retval == 0: + return None + + m.encoded = encoded + m.nmatches = self.groups + 1 + m.string = string + m.pos = pos + if endpos == -1: + m.endpos = size + else: + m.endpos = endpos + m._make_spans(cstring, size, &cpos, &upos) + m._init_groups() + finally: + release_cstring(&buf) + return m + + def contains(self, object string, int pos=0, int endpos=-1): + """"contains(string[, pos[, endpos]]) --> bool." 
+ + Scan through string looking for a match, and return True or False.""" + cdef char * cstring + cdef Py_ssize_t size + cdef Py_buffer buf + cdef int retval + cdef int encoded = 0 + cdef StringPiece * sp + + if 0 <= endpos <= pos: + return False + + bytestr = unicode_to_bytes(string, &encoded, self.encoded) + if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1: + raise TypeError('expected string or buffer') + try: + if encoded == 2 and (pos or endpos != -1): + utf8indices(cstring, size, &pos, &endpos) + if pos > size: + return False + if 0 <= endpos < size: + size = endpos + + sp = new StringPiece(cstring, size) + with nogil: + retval = self.re_pattern.Match( + sp[0], + pos, + size, + UNANCHORED, + NULL, + 0) + del sp + finally: + release_cstring(&buf) + return retval != 0 + + def count(self, object string, int pos=0, int endpos=-1): + """Return number of non-overlapping matches of pattern in string.""" + cdef char * cstring + cdef Py_ssize_t size + cdef Py_buffer buf + cdef int retval + cdef int encoded = 0 + cdef int result = 0 + cdef StringPiece * sp = NULL + cdef StringPiece * matches = NULL + + bytestr = unicode_to_bytes(string, &encoded, self.encoded) + if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1: + raise TypeError('expected string or buffer') + try: + if encoded == 2 and (pos or endpos != -1): + utf8indices(cstring, size, &pos, &endpos) + if pos > size: + return 0 + if 0 <= endpos < size: + size = endpos + + sp = new StringPiece(cstring, size) + matches = new_StringPiece_array(1) + try: + while True: + with nogil: + retval = self.re_pattern.Match( + sp[0], + pos, + size, + UNANCHORED, + matches, + 1) + if retval == 0: + break + result += 1 + if pos == size: + break + # offset the pos to move to the next point + pos = matches[0].data() - cstring + ( + matches[0].length() or 1) + finally: + del sp + delete_StringPiece_array(matches) + finally: + release_cstring(&buf) + return result + + def findall(self, object string, int pos=0, int 
endpos=-1): + """Return all non-overlapping matches of pattern in string as a list + of strings.""" + cdef char * cstring + cdef Py_ssize_t size + cdef Py_buffer buf + cdef int encoded = 0 + cdef int retval + cdef list resultlist = [] + cdef StringPiece * sp = NULL + cdef StringPiece * matches = NULL + + bytestr = unicode_to_bytes(string, &encoded, self.encoded) + if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1: + raise TypeError('expected string or buffer') + try: + if encoded == 2 and (pos or endpos != -1): + utf8indices(cstring, size, &pos, &endpos) + if pos > size: + return [] + if 0 <= endpos < size: + size = endpos + + sp = new StringPiece(cstring, size) + matches = new_StringPiece_array(self.groups + 1) + + while True: + with nogil: + retval = self.re_pattern.Match( + sp[0], + pos, + size, + UNANCHORED, + matches, + self.groups + 1) + if retval == 0: + break + if self.groups > 1: + if encoded: + resultlist.append(tuple([ + '' if matches[i].data() is NULL else + matches[i].data()[:matches[i].length() + ].decode('utf8') + for i in range(1, self.groups + 1)])) + else: + resultlist.append(tuple([ + b'' if matches[i].data() is NULL + else matches[i].data()[:matches[i].length()] + for i in range(1, self.groups + 1)])) + else: # 0 or 1 group; return list of strings + if encoded: + resultlist.append(matches[self.groups].data()[ + :matches[self.groups].length()].decode('utf8')) + else: + resultlist.append(matches[self.groups].data()[ + :matches[self.groups].length()]) + if pos == size: + break + # offset the pos to move to the next point + pos = matches[0].data() - cstring + (matches[0].length() or 1) + finally: + del sp + delete_StringPiece_array(matches) + release_cstring(&buf) + return resultlist + + def finditer(self, object string, int pos=0, int endpos=-1): + """Yield all non-overlapping matches of pattern in string as Match + objects.""" + result = iter(self._finditer(string, pos, endpos)) + next(result) # dummy value to raise error before start 
of generator + return result + + def _finditer(self, object string, int pos=0, int endpos=-1): + cdef char * cstring + cdef Py_ssize_t size + cdef Py_buffer buf + cdef int retval + cdef StringPiece * sp = NULL + cdef Match m + cdef int encoded = 0 + cdef int cpos = 0, upos = pos + + bytestr = unicode_to_bytes(string, &encoded, self.encoded) + if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1: + raise TypeError('expected string or buffer') + try: + if encoded == 2 and (pos or endpos != -1): + utf8indices(cstring, size, &pos, &endpos) + cpos = pos + if pos > size: + return + if 0 <= endpos < size: + size = endpos + + sp = new StringPiece(cstring, size) + + yield + while True: + m = Match(self, self.groups + 1) + m.string = string + with nogil: + retval = self.re_pattern.Match( + sp[0], + pos, + size, + UNANCHORED, + m.matches, + self.groups + 1) + if retval == 0: + break + m.encoded = encoded + m.nmatches = self.groups + 1 + m.pos = pos + if endpos == -1: + m.endpos = size + else: + m.endpos = endpos + m._make_spans(cstring, size, &cpos, &upos) + m._init_groups() + yield m + if pos == size: + break + # offset the pos to move to the next point + pos = m.matches[0].data() - cstring + ( + m.matches[0].length() or 1) + finally: + del sp + release_cstring(&buf) + + def split(self, string, int maxsplit=0): + """split(string[, maxsplit = 0]) --> list + + Split a string by the occurrences of the pattern.""" + cdef char * cstring + cdef Py_ssize_t size + cdef int retval + cdef int pos = 0 + cdef int lookahead = 0 + cdef int num_split = 0 + cdef StringPiece * sp + cdef StringPiece * matches + cdef list resultlist = [] + cdef int encoded = 0 + cdef Py_buffer buf + + if maxsplit < 0: + maxsplit = 0 + + bytestr = unicode_to_bytes(string, &encoded, self.encoded) + if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1: + raise TypeError('expected string or buffer') + matches = new_StringPiece_array(self.groups + 1) + sp = new StringPiece(cstring, size) + try: + + 
while True: + with nogil: + retval = self.re_pattern.Match( + sp[0], + pos + lookahead, + size, + UNANCHORED, + matches, + self.groups + 1) + if retval == 0: + break + + match_start = matches[0].data() - cstring + match_end = match_start + matches[0].length() + + # If an empty match, just look ahead until you find something + if match_start == match_end: + if pos + lookahead == size: + break + lookahead += 1 + continue + + if encoded: + resultlist.append( + char_to_unicode(&sp.data()[pos], match_start - pos)) + else: + resultlist.append(sp.data()[pos:match_start]) + if self.groups > 0: + for group in range(self.groups): + if matches[group + 1].data() == NULL: + resultlist.append(None) + else: + if encoded: + resultlist.append(char_to_unicode( + matches[group + 1].data(), + matches[group + 1].length())) + else: + resultlist.append(matches[group + 1].data()[: + matches[group + 1].length()]) + + # offset the pos to move to the next point + pos = match_end + lookahead = 0 + + num_split += 1 + if maxsplit and num_split >= maxsplit: + break + + if encoded: + resultlist.append( + char_to_unicode(&sp.data()[pos], sp.length() - pos)) + else: + resultlist.append(sp.data()[pos:]) + finally: + del sp + delete_StringPiece_array(matches) + release_cstring(&buf) + return resultlist + + def sub(self, repl, string, int count=0): + """sub(repl, string[, count = 0]) --> newstring + + Return the string obtained by replacing the leftmost non-overlapping + occurrences of pattern in string by the replacement repl.""" + cdef int num_repl = 0 + return self._subn(repl, string, count, &num_repl) + + def subn(self, repl, string, int count=0): + """subn(repl, string[, count = 0]) --> (newstring, number of subs) + + Return the tuple (new_string, number_of_subs_made) found by replacing + the leftmost non-overlapping occurrences of pattern with the + replacement repl.""" + cdef int num_repl = 0 + result = self._subn(repl, string, count, &num_repl) + return result, num_repl + + cdef _subn(self, 
repl, string, int count, int *num_repl): + cdef bytes repl_b + cdef char * cstring + cdef object result + cdef Py_ssize_t size + cdef StringPiece * sp = NULL + cdef cpp_string * input_str = NULL + cdef int string_encoded = 0 + cdef int repl_encoded = 0 + + if callable(repl): + # This is a callback, so use the custom function + return self._subn_callback(repl, string, count, num_repl) + + repl_b = unicode_to_bytes(repl, &repl_encoded, self.encoded) + if not repl_encoded and not isinstance(repl, bytes): + repl_b = bytes(repl) # coerce buffer to bytes object + + if count > 1 or (b'\\' if PY2 else <char>b'\\') in repl_b: + # Limit on number of substitutions or replacement string contains + # escape sequences; handle with Match.expand() implementation. + # RE2 does support simple numeric group references \1, \2, + # but the number of differences with Python behavior is + # non-trivial. + return self._subn_expand(repl_b, string, count, num_repl) + try: + cstring = repl_b + size = len(repl_b) + sp = new StringPiece(cstring, size) + + bytestr = unicode_to_bytes(string, &string_encoded, self.encoded) + if not string_encoded and not isinstance(bytestr, bytes): + bytestr = bytes(bytestr) # coerce buffer to bytes object + input_str = new cpp_string(<char *>bytestr, len(bytestr)) + # NB: RE2 treats unmatched groups in repl as empty string; + # Python raises an error. + with nogil: + if count == 0: + num_repl[0] = GlobalReplace( + input_str, self.re_pattern[0], sp[0]) + elif count == 1: + num_repl[0] = Replace( + input_str, self.re_pattern[0], sp[0]) + + if string_encoded or (repl_encoded and num_repl[0] > 0): + result = cpp_to_unicode(input_str[0]) + else: + result = cpp_to_bytes(input_str[0]) + finally: + del input_str, sp + return result + + cdef _subn_callback(self, callback, string, int count, int * num_repl): + # This function is probably the hardest to implement correctly. + # This is my first attempt, but if anybody has a better solution, + # please help out. 
+ cdef char * cstring + cdef Py_ssize_t size + cdef Py_buffer buf + cdef int retval + cdef int prevendpos = -1 + cdef int endpos = 0 + cdef int pos = 0 + cdef int encoded = 0 + cdef StringPiece * sp + cdef Match m + cdef bytearray result = bytearray() + cdef int cpos = 0, upos = 0 + + if count < 0: + count = 0 + + bytestr = unicode_to_bytes(string, &encoded, self.encoded) + if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1: + raise TypeError('expected string or buffer') + sp = new StringPiece(cstring, size) + try: + while True: + m = Match(self, self.groups + 1) + m.string = string + with nogil: + retval = self.re_pattern.Match( + sp[0], + pos, + size, + UNANCHORED, + m.matches, + self.groups + 1) + if retval == 0: + break + + endpos = m.matches[0].data() - cstring + if endpos == prevendpos: + endpos += 1 + if endpos > size: + break + prevendpos = endpos + result.extend(sp.data()[pos:endpos]) + pos = endpos + m.matches[0].length() + + m.encoded = encoded + m.nmatches = self.groups + 1 + m._make_spans(cstring, size, &cpos, &upos) + m._init_groups() + tmp = callback(m) + if tmp: + result.extend(tmp.encode('utf8') if encoded else tmp) + else: + result.extend(b'') + + num_repl[0] += 1 + if count and num_repl[0] >= count: + break + result.extend(sp.data()[pos:]) + finally: + del sp + release_cstring(&buf) + return result.decode('utf8') if encoded else bytes(result) + + cdef _subn_expand(self, bytes repl, string, int count, int * num_repl): + """Perform ``count`` substitutions with replacement string and + Match.expand.""" + cdef char * cstring + cdef Py_ssize_t size + cdef Py_buffer buf + cdef int retval + cdef int prevendpos = -1 + cdef int endpos = 0 + cdef int pos = 0 + cdef int encoded = 0 + cdef StringPiece * sp + cdef Match m + cdef bytearray result = bytearray() + + if count < 0: + count = 0 + + bytestr = unicode_to_bytes(string, &encoded, self.encoded) + if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1: + raise TypeError('expected string 
or buffer') + sp = new StringPiece(cstring, size) + try: + while True: + m = Match(self, self.groups + 1) + m.string = string + with nogil: + retval = self.re_pattern.Match( + sp[0], + pos, + size, + UNANCHORED, + m.matches, + self.groups + 1) + if retval == 0: + break + + endpos = m.matches[0].data() - cstring + if endpos == prevendpos: + endpos += 1 + if endpos > size: + break + prevendpos = endpos + result.extend(sp.data()[pos:endpos]) + pos = endpos + m.matches[0].length() + + m.encoded = encoded + m.nmatches = self.groups + 1 + m._init_groups() + m._expand(repl, result) + + num_repl[0] += 1 + if count and num_repl[0] >= count: + break + result.extend(sp.data()[pos:]) + finally: + del sp + release_cstring(&buf) + return result.decode('utf8') if encoded else bytes(result) + + def scanner(self, arg): + return re.compile(self.pattern).scanner(arg) + # raise NotImplementedError + + def _dump_pattern(self): + cdef cpp_string s = self.re_pattern.pattern() + if self.encoded: + return cpp_to_bytes(s).decode('utf8') + return cpp_to_bytes(s) + + def __repr__(self): + if self.flags == 0: + return 're2.compile(%r)' % self.pattern + return 're2.compile(%r, %r)' % (self.pattern, self.flags) + + def __reduce__(self): + return (compile, (self.pattern, self.flags)) + + def __dealloc__(self): + del self.re_pattern + + +class PythonRePattern: + """A wrapper for re.Pattern to support the extra methods defined by re2 + (contains, count).""" + def __init__(self, pattern, flags=None): + self._pattern = re.compile(pattern, flags) + self.pattern = pattern + self.flags = flags + self.groupindex = self._pattern.groupindex + self.groups = self._pattern.groups + + def contains(self, string): + return bool(self._pattern.search(string)) + + def count(self, string, pos=0, endpos=9223372036854775807): + return len(self._pattern.findall(string, pos, endpos)) + + def findall(self, string, pos=0, endpos=9223372036854775807): + return self._pattern.findall(string, pos, endpos) + + def 
finditer(self, string, pos=0, endpos=9223372036854775807): + return self._pattern.finditer(string, pos, endpos) + + def fullmatch(self, string, pos=0, endpos=9223372036854775807): + return self._pattern.fullmatch(string, pos, endpos) + + def match(self, string, pos=0, endpos=9223372036854775807): + return self._pattern.match(string, pos, endpos) + + def scanner(self, string, pos=0, endpos=9223372036854775807): + return self._pattern.scanner(string, pos, endpos) + + def search(self, string, pos=0, endpos=9223372036854775807): + return self._pattern.search(string, pos, endpos) + + def split(self, string, maxsplit=0): + return self._pattern.split(string, maxsplit) + + def sub(self, repl, string, count=0): + return self._pattern.sub(repl, string, count) + + def subn(self, repl, string, count=0): + return self._pattern.subn(repl, string, count) + + def __repr__(self): + return repr(self._pattern) + + def __reduce__(self): + return (self, (self.pattern, self.flags)) diff --git a/contrib/python/pyre2/py3/src/re2.pyx b/contrib/python/pyre2/py3/src/re2.pyx new file mode 100644 index 0000000000..c48101426f --- /dev/null +++ b/contrib/python/pyre2/py3/src/re2.pyx @@ -0,0 +1,458 @@ +# cython: infer_types(False) +r"""Regular expressions using Google's RE2 engine. + +Compared to Python's ``re``, the RE2 engine compiles regular expressions to +deterministic finite automata, which guarantees linear-time behavior. + +Intended as a drop-in replacement for ``re``. Unicode is supported by encoding +to UTF-8, and bytes strings are treated as UTF-8 when the UNICODE flag is given. +For best performance, work with UTF-8 encoded bytes strings. + +Regular expressions that are not compatible with RE2 are processed with +fallback to ``re``. 
Examples of features not supported by RE2: + + - lookahead assertions ``(?!...)`` + - backreferences (``\\n`` in search pattern) + - \W and \S not supported inside character classes + +On the other hand, unicode character classes are supported (e.g., ``\p{Greek}``). +Syntax reference: https://github.com/google/re2/wiki/Syntax + +What follows is a reference for the regular expression syntax supported by this +module (i.e., without requiring fallback to `re`). + +Regular expressions can contain both special and ordinary characters. +Most ordinary characters, like "A", "a", or "0", are the simplest +regular expressions; they simply match themselves. + +The special characters are:: + + "." Matches any character except a newline. + "^" Matches the start of the string. + "$" Matches the end of the string or just before the newline at + the end of the string. + "*" Matches 0 or more (greedy) repetitions of the preceding RE. + Greedy means that it will match as many repetitions as possible. + "+" Matches 1 or more (greedy) repetitions of the preceding RE. + "?" Matches 0 or 1 (greedy) of the preceding RE. + *?,+?,?? Non-greedy versions of the previous three special characters. + {m,n} Matches from m to n repetitions of the preceding RE. + {m,n}? Non-greedy version of the above. + "\\" Either escapes special characters or signals a special sequence. + [] Indicates a set of characters. + A "^" as the first character indicates a complementing set. + "|" A|B, creates an RE that will match either A or B. + (...) Matches the RE inside the parentheses. + The contents can be retrieved or matched later in the string. + (?:...) Non-grouping version of regular parentheses. + (?imsux) Set the I, M, S, U, or X flag for the RE (see below). + +The special sequences consist of "\\" and a character from the list +below. If the ordinary character is not on the list, then the +resulting RE will match the second character:: + + \A Matches only at the start of the string. 
+ \Z Matches only at the end of the string. + \b Matches the empty string, but only at the start or end of a word. + \B Matches the empty string, but not at the start or end of a word. + \d Matches any decimal digit. + \D Matches any non-digit character. + \s Matches any whitespace character. + \S Matches any non-whitespace character. + \w Matches any alphanumeric character. + \W Matches the complement of \w. + \\ Matches a literal backslash. + \pN Unicode character class (one-letter name) + \p{Greek} Unicode character class + \PN negated Unicode character class (one-letter name) + \P{Greek} negated Unicode character class + +This module exports the following functions:: + + count Count all occurrences of a pattern in a string. + match Match a regular expression pattern to the beginning of a string. + fullmatch Match a regular expression pattern to all of a string. + search Search a string for a pattern and return a Match object. + contains Same as search, but only return bool. + sub Substitute occurrences of a pattern found in a string. + subn Same as sub, but also return the number of substitutions made. + split Split a string by the occurrences of a pattern. + findall Find all occurrences of a pattern in a string. + finditer Return an iterator yielding a match object for each match. + compile Compile a pattern into a RegexObject. + purge Clear the regular expression cache. + escape Backslash all non-alphanumerics in a string. + +Some of the functions in this module take flags as optional parameters:: + + A ASCII Make \w, \W, \b, \B, \d, \D match the corresponding ASCII + character categories (rather than the whole Unicode + categories, which is the default). + I IGNORECASE Perform case-insensitive matching. + M MULTILINE "^" matches the beginning of lines (after a newline) + as well as the string. + "$" matches the end of lines (before a newline) as well + as the end of the string. + S DOTALL "." matches any character at all, including the newline. 
+ X VERBOSE Ignore whitespace and comments for nicer looking RE's. + U UNICODE Enable Unicode character classes and make \w, \W, \b, \B, + Unicode-aware (default for unicode patterns). + +This module also defines an exception 'RegexError' (also available under the +alias 'error'). + +""" + +include "includes.pxi" + +import re +import sys +import warnings +from re import error as RegexError + +error = re.error + +# Import re flags to be compatible. +I, M, S, U, X, L = re.I, re.M, re.S, re.U, re.X, re.L +IGNORECASE = re.IGNORECASE +MULTILINE = re.MULTILINE +DOTALL = re.DOTALL +UNICODE = re.UNICODE +VERBOSE = re.VERBOSE +LOCALE = re.LOCALE +DEBUG = re.DEBUG +ASCII = 256 # Python 3 + +FALLBACK_QUIETLY = 0 +FALLBACK_WARNING = 1 +FALLBACK_EXCEPTION = 2 + +VERSION = (0, 2, 23) +VERSION_HEX = 0x000217 + +cdef int _I = I, _M = M, _S = S, _U = U, _X = X, _L = L +cdef int current_notification = FALLBACK_QUIETLY +cdef bint PY2 = PY_MAJOR_VERSION == 2 + +# Type of compiled re object from Python stdlib +SREPattern = type(re.compile('')) + +_cache = {} +_cache_repl = {} + +_MAXCACHE = 100 + + +include "compile.pxi" +include "pattern.pxi" +include "match.pxi" + + +def purge(): + """Clear the regular expression caches.""" + _cache.clear() + _cache_repl.clear() + + +def search(pattern, string, int flags=0): + """Scan through string looking for a match to the pattern, returning + a ``Match`` object or none if no match was found.""" + return compile(pattern, flags).search(string) + + +def match(pattern, string, int flags=0): + """Try to apply the pattern at the start of the string, returning + a ``Match`` object, or ``None`` if no match was found.""" + return compile(pattern, flags).match(string) + + +def fullmatch(pattern, string, int flags=0): + """Try to apply the pattern to the entire string, returning + a ``Match`` object, or ``None`` if no match was found.""" + return compile(pattern, flags).fullmatch(string) + + +def contains(pattern, string, int flags=0): + """Scan through 
string looking for a match to the pattern, returning + True or False.""" + return compile(pattern, flags).contains(string) + + +def finditer(pattern, string, int flags=0): + """Yield all non-overlapping matches in the string. + + For each match, the iterator returns a ``Match`` object. + Empty matches are included in the result.""" + return compile(pattern, flags).finditer(string) + + +def findall(pattern, string, int flags=0): + """Return a list of all non-overlapping matches in the string. + + Each match is represented as a string or a tuple (when there are two or + more groups). Empty matches are included in the result.""" + return compile(pattern, flags).findall(string) + + +def count(pattern, string, int flags=0): + """Return number of non-overlapping matches in the string. + + Empty matches are included in the count.""" + return compile(pattern, flags).count(string) + + +def split(pattern, string, int maxsplit=0, int flags=0): + """Split the source string by the occurrences of the pattern, + returning a list containing the resulting substrings.""" + return compile(pattern, flags).split(string, maxsplit) + + +def sub(pattern, repl, string, int count=0, int flags=0): + """Return the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in string by the + replacement ``repl``. ``repl`` can be either a string or a callable; + if a string, backslash escapes in it are processed. If it is + a callable, it's passed the ``Match`` object and must return + a replacement string to be used.""" + return compile(pattern, flags).sub(repl, string, count) + + +def subn(pattern, repl, string, int count=0, int flags=0): + """Return a 2-tuple containing ``(new_string, number)``. + new_string is the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in the source + string by the replacement ``repl``. ``number`` is the number of + substitutions that were made. 
``repl`` can be either a string or a + callable; if a string, backslash escapes in it are processed. + If it is a callable, it's passed the ``Match`` object and must + return a replacement string to be used.""" + return compile(pattern, flags).subn(repl, string, count) + + +def escape(pattern): + """Escape all non-alphanumeric characters in pattern.""" + cdef bint uni = isinstance(pattern, unicode) + cdef list s + if PY2 or uni: + s = list(pattern) + else: + s = [bytes([c]) for c in pattern] + for i in range(len(pattern)): + # c = pattern[i] + c = s[i] + if ord(c) < 0x80 and not c.isalnum(): + if uni: + if c == u'\000': + s[i] = u'\\000' + else: + s[i] = u"\\" + c + else: + if c == b'\000': + s[i] = b'\\000' + else: + s[i] = b'\\' + c + return u''.join(s) if uni else b''.join(s) + + +class BackreferencesException(Exception): + """Search pattern contains backreferences.""" + pass + + +class CharClassProblemException(Exception): + """Search pattern contains unsupported character class.""" + pass + + +def set_fallback_notification(level): + """Set the fallback notification to a level; one of: + FALLBACK_QUIETLY + FALLBACK_WARNING + FALLBACK_EXCEPTION + """ + global current_notification + level = int(level) + if level < 0 or level > 2: + raise ValueError("This function expects a valid notification level.") + current_notification = level + + +cdef bint ishex(unsigned char c): + """Test whether ``c`` is in ``[0-9a-fA-F]``""" + return (b'0' <= c <= b'9' or b'a' <= c <= b'f' or b'A' <= c <= b'F') + + +cdef bint isoct(unsigned char c): + """Test whether ``c`` is in ``[0-7]``""" + return b'0' <= c <= b'7' + + +cdef bint isdigit(unsigned char c): + """Test whether ``c`` is in ``[0-9]``""" + return b'0' <= c <= b'9' + + +cdef bint isident(unsigned char c): + """Test whether ``c`` is in ``[a-zA-Z0-9_]``""" + return (b'a' <= c <= b'z' or b'A' <= c <= b'Z' + or b'0' <= c <= b'9' or c == b'_') + + +cdef inline bytes cpp_to_bytes(cpp_string input): + """Convert from a std::string 
object to a python string.""" + # By taking the slice we go to the right size, + # despite spurious or missing null characters. + return input.data()[:input.length()] + + +cdef inline unicode cpp_to_unicode(cpp_string input): + """Convert a std::string object to a unicode string.""" + return cpython.unicode.PyUnicode_DecodeUTF8( + input.data(), input.length(), 'strict') + + +cdef inline unicode char_to_unicode(const char * input, int length): + """Convert a C string to a unicode string.""" + return cpython.unicode.PyUnicode_DecodeUTF8(input, length, 'strict') + + +cdef inline unicode_to_bytes(object pystring, int * encoded, + int checkotherencoding): + """Convert a unicode string to a utf8 bytes object, if necessary. + + If pystring is a bytes string or a buffer, return unchanged. + If checkotherencoding is 0 or 1 and using Python 3, raise an error + if its truth value is not equal to that of encoded. + encoded is set to 1 if encoded string can be treated as ASCII, + and 2 if it contains multibyte unicode characters.""" + if cpython.unicode.PyUnicode_Check(pystring): + origlen = len(pystring) + pystring = pystring.encode('utf8') + encoded[0] = 1 if origlen == len(pystring) else 2 + else: + encoded[0] = 0 + if not PY2 and checkotherencoding > 0 and not encoded[0]: + raise TypeError("can't use a string pattern on a bytes-like object") + elif not PY2 and checkotherencoding == 0 and encoded[0]: + raise TypeError("can't use a bytes pattern on a string-like object") + return pystring + + +cdef inline int pystring_to_cstring( + object pystring, char ** cstring, Py_ssize_t * size, + Py_buffer * buf): + """Get a pointer from bytes/buffer object ``pystring``. 
+ + On success, return 0, and set ``cstring``, ``size``, and ``buf``.""" + cdef int result = -1 + cstring[0] = NULL + size[0] = 0 + if PyObject_CheckBuffer(pystring) == 1: # new-style Buffer interface + result = PyObject_GetBuffer(pystring, buf, PyBUF_SIMPLE) + if result == 0: + cstring[0] = <char *>buf.buf + size[0] = buf.len + return result + + +cdef inline void release_cstring(Py_buffer *buf): + """Release buffer if necessary.""" + if not PY2: + PyBuffer_Release(buf) + + +cdef utf8indices(char * cstring, int size, int *pos, int *endpos): + """Convert unicode indices ``pos`` and ``endpos`` to UTF-8 indices. + + If the indices are out of range, leave them unchanged.""" + cdef unsigned char * data = <unsigned char *>cstring + cdef int newpos = pos[0], newendpos = -1 + cdef int cpos = 0, upos = 0 + while cpos < size: + if data[cpos] < 0x80: + cpos += 1 + upos += 1 + elif data[cpos] < 0xe0: + cpos += 2 + upos += 1 + elif data[cpos] < 0xf0: + cpos += 3 + upos += 1 + else: + cpos += 4 + upos += 1 + # wide unicode chars get 2 unichars when Python <3.3 is compiled + # with --enable-unicode=ucs2 + emit_if_narrow_unicode() + upos += 1 + emit_endif() + + if upos == pos[0]: + newpos = cpos + if endpos[0] == -1: + break + elif upos == endpos[0]: + newendpos = cpos + break + pos[0] = newpos + endpos[0] = newendpos + + +cdef void unicodeindices(map[int, int] &positions, + char * cstring, int size, int * cpos, int * upos): + """Convert UTF-8 byte indices to unicode indices.""" + cdef unsigned char * s = <unsigned char *>cstring + cdef map[int, int].iterator it = positions.begin() + + if dereference(it).first == -1: + dereference(it).second = -1 + postincrement(it) + if it == positions.end(): + return + if dereference(it).first == cpos[0]: + dereference(it).second = upos[0] + postincrement(it) + if it == positions.end(): + return + + while cpos[0] < size: + if s[cpos[0]] < 0x80: + cpos[0] += 1 + upos[0] += 1 + elif s[cpos[0]] < 0xe0: + cpos[0] += 2 + upos[0] += 1 + elif 
s[cpos[0]] < 0xf0: + cpos[0] += 3 + upos[0] += 1 + else: + cpos[0] += 4 + upos[0] += 1 + # wide unicode chars get 2 unichars when Python <3.3 is compiled + # with --enable-unicode=ucs2 + emit_if_narrow_unicode() + upos[0] += 1 + emit_endif() + + if dereference(it).first == cpos[0]: + dereference(it).second = upos[0] + postincrement(it) + if it == positions.end(): + break + + +__all__ = [ + # exceptions + 'BackreferencesException', 'CharClassProblemException', + 'RegexError', 'error', + # constants + 'FALLBACK_EXCEPTION', 'FALLBACK_QUIETLY', 'FALLBACK_WARNING', 'DEBUG', + 'S', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', + 'U', 'UNICODE', 'X', 'VERBOSE', 'VERSION', 'VERSION_HEX', + # classes + 'Match', 'Pattern', 'SREPattern', + # functions + 'compile', 'count', 'escape', 'findall', 'finditer', 'fullmatch', + 'match', 'purge', 'search', 'split', 'sub', 'subn', + 'set_fallback_notification', + ] |