diff options
author | vitalyisaev <vitalyisaev@ydb.tech> | 2023-11-30 13:26:22 +0300 |
---|---|---|
committer | vitalyisaev <vitalyisaev@ydb.tech> | 2023-11-30 15:44:45 +0300 |
commit | 0a98fece5a9b54f16afeb3a94b3eb3105e9c3962 (patch) | |
tree | 291d72dbd7e9865399f668c84d11ed86fb190bbf /contrib/python/pyre2/py3/src/re2.pyx | |
parent | cb2c8d75065e5b3c47094067cb4aa407d4813298 (diff) | |
download | ydb-0a98fece5a9b54f16afeb3a94b3eb3105e9c3962.tar.gz |
YQ Connector:Use docker-compose in integrational tests
Diffstat (limited to 'contrib/python/pyre2/py3/src/re2.pyx')
-rw-r--r-- | contrib/python/pyre2/py3/src/re2.pyx | 458 |
1 files changed, 458 insertions, 0 deletions
diff --git a/contrib/python/pyre2/py3/src/re2.pyx b/contrib/python/pyre2/py3/src/re2.pyx new file mode 100644 index 0000000000..c48101426f --- /dev/null +++ b/contrib/python/pyre2/py3/src/re2.pyx @@ -0,0 +1,458 @@ +# cython: infer_types(False) +r"""Regular expressions using Google's RE2 engine. + +Compared to Python's ``re``, the RE2 engine compiles regular expressions to +deterministic finite automata, which guarantees linear-time behavior. + +Intended as a drop-in replacement for ``re``. Unicode is supported by encoding +to UTF-8, and bytes strings are treated as UTF-8 when the UNICODE flag is given. +For best performance, work with UTF-8 encoded bytes strings. + +Regular expressions that are not compatible with RE2 are processed with +fallback to ``re``. Examples of features not supported by RE2: + + - lookahead assertions ``(?!...)`` + - backreferences (``\\n`` in search pattern) + - \W and \S not supported inside character classes + +On the other hand, unicode character classes are supported (e.g., ``\p{Greek}``). +Syntax reference: https://github.com/google/re2/wiki/Syntax + +What follows is a reference for the regular expression syntax supported by this +module (i.e., without requiring fallback to `re`). + +Regular expressions can contain both special and ordinary characters. +Most ordinary characters, like "A", "a", or "0", are the simplest +regular expressions; they simply match themselves. + +The special characters are:: + + "." Matches any character except a newline. + "^" Matches the start of the string. + "$" Matches the end of the string or just before the newline at + the end of the string. + "*" Matches 0 or more (greedy) repetitions of the preceding RE. + Greedy means that it will match as many repetitions as possible. + "+" Matches 1 or more (greedy) repetitions of the preceding RE. + "?" Matches 0 or 1 (greedy) of the preceding RE. + *?,+?,?? Non-greedy versions of the previous three special characters. + {m,n} Matches from m to n repetitions of the preceding RE. + {m,n}? Non-greedy version of the above. + "\\" Either escapes special characters or signals a special sequence. + [] Indicates a set of characters. + A "^" as the first character indicates a complementing set. + "|" A|B, creates an RE that will match either A or B. + (...) Matches the RE inside the parentheses. + The contents can be retrieved or matched later in the string. + (?:...) Non-grouping version of regular parentheses. + (?imsux) Set the I, M, S, U, or X flag for the RE (see below). + +The special sequences consist of "\\" and a character from the list +below. If the ordinary character is not on the list, then the +resulting RE will match the second character:: + + \A Matches only at the start of the string. + \Z Matches only at the end of the string. + \b Matches the empty string, but only at the start or end of a word. + \B Matches the empty string, but not at the start or end of a word. + \d Matches any decimal digit. + \D Matches any non-digit character. + \s Matches any whitespace character. + \S Matches any non-whitespace character. + \w Matches any alphanumeric character. + \W Matches the complement of \w. + \\ Matches a literal backslash. + \pN Unicode character class (one-letter name) + \p{Greek} Unicode character class + \PN negated Unicode character class (one-letter name) + \P{Greek} negated Unicode character class + +This module exports the following functions:: + + count Count all occurrences of a pattern in a string. + match Match a regular expression pattern to the beginning of a string. + fullmatch Match a regular expression pattern to all of a string. + search Search a string for a pattern and return Match object. + contains Same as search, but only return bool. + sub Substitute occurrences of a pattern found in a string. + subn Same as sub, but also return the number of substitutions made. + split Split a string by the occurrences of a pattern. + findall Find all occurrences of a pattern in a string. + finditer Return an iterator yielding a match object for each match. + compile Compile a pattern into a RegexObject. + purge Clear the regular expression cache. + escape Backslash all non-alphanumerics in a string. + +Some of the functions in this module takes flags as optional parameters:: + + A ASCII Make \w, \W, \b, \B, \d, \D match the corresponding ASCII + character categories (rather than the whole Unicode + categories, which is the default). + I IGNORECASE Perform case-insensitive matching. + M MULTILINE "^" matches the beginning of lines (after a newline) + as well as the string. + "$" matches the end of lines (before a newline) as well + as the end of the string. + S DOTALL "." matches any character at all, including the newline. + X VERBOSE Ignore whitespace and comments for nicer looking RE's. + U UNICODE Enable Unicode character classes and make \w, \W, \b, \B, + Unicode-aware (default for unicode patterns). + +This module also defines an exception 'RegexError' (also available under the +alias 'error'). + +""" + +include "includes.pxi" + +import re +import sys +import warnings +from re import error as RegexError + +error = re.error + +# Import re flags to be compatible. +I, M, S, U, X, L = re.I, re.M, re.S, re.U, re.X, re.L +IGNORECASE = re.IGNORECASE +MULTILINE = re.MULTILINE +DOTALL = re.DOTALL +UNICODE = re.UNICODE +VERBOSE = re.VERBOSE +LOCALE = re.LOCALE +DEBUG = re.DEBUG +ASCII = 256 # Python 3 + +FALLBACK_QUIETLY = 0 +FALLBACK_WARNING = 1 +FALLBACK_EXCEPTION = 2 + +VERSION = (0, 2, 23) +VERSION_HEX = 0x000217 + +cdef int _I = I, _M = M, _S = S, _U = U, _X = X, _L = L +cdef int current_notification = FALLBACK_QUIETLY +cdef bint PY2 = PY_MAJOR_VERSION == 2 + +# Type of compiled re object from Python stdlib +SREPattern = type(re.compile('')) + +_cache = {} +_cache_repl = {} + +_MAXCACHE = 100 + + +include "compile.pxi" +include "pattern.pxi" +include "match.pxi" + + +def purge(): + """Clear the regular expression caches.""" + _cache.clear() + _cache_repl.clear() + + +def search(pattern, string, int flags=0): + """Scan through string looking for a match to the pattern, returning + a ``Match`` object or none if no match was found.""" + return compile(pattern, flags).search(string) + + +def match(pattern, string, int flags=0): + """Try to apply the pattern at the start of the string, returning + a ``Match`` object, or ``None`` if no match was found.""" + return compile(pattern, flags).match(string) + + +def fullmatch(pattern, string, int flags=0): + """Try to apply the pattern to the entire string, returning + a ``Match`` object, or ``None`` if no match was found.""" + return compile(pattern, flags).fullmatch(string) + + +def contains(pattern, string, int flags=0): + """Scan through string looking for a match to the pattern, returning + True or False.""" + return compile(pattern, flags).contains(string) + + +def finditer(pattern, string, int flags=0): + """Yield all non-overlapping matches in the string. + + For each match, the iterator returns a ``Match`` object. + Empty matches are included in the result.""" + return compile(pattern, flags).finditer(string) + + +def findall(pattern, string, int flags=0): + """Return a list of all non-overlapping matches in the string. + + Each match is represented as a string or a tuple (when there are two ore + more groups). Empty matches are included in the result.""" + return compile(pattern, flags).findall(string) + + +def count(pattern, string, int flags=0): + """Return number of non-overlapping matches in the string. + + Empty matches are included in the count.""" + return compile(pattern, flags).count(string) + + +def split(pattern, string, int maxsplit=0, int flags=0): + """Split the source string by the occurrences of the pattern, + returning a list containing the resulting substrings.""" + return compile(pattern, flags).split(string, maxsplit) + + +def sub(pattern, repl, string, int count=0, int flags=0): + """Return the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in string by the + replacement ``repl``. ``repl`` can be either a string or a callable; + if a string, backslash escapes in it are processed. If it is + a callable, it's passed the ``Match`` object and must return + a replacement string to be used.""" + return compile(pattern, flags).sub(repl, string, count) + + +def subn(pattern, repl, string, int count=0, int flags=0): + """Return a 2-tuple containing ``(new_string, number)``. + new_string is the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in the source + string by the replacement ``repl``. ``number`` is the number of + substitutions that were made. ``repl`` can be either a string or a + callable; if a string, backslash escapes in it are processed. + If it is a callable, it's passed the ``Match`` object and must + return a replacement string to be used.""" + return compile(pattern, flags).subn(repl, string, count) + + +def escape(pattern): + """Escape all non-alphanumeric characters in pattern.""" + cdef bint uni = isinstance(pattern, unicode) + cdef list s + if PY2 or uni: + s = list(pattern) + else: + s = [bytes([c]) for c in pattern] + for i in range(len(pattern)): + # c = pattern[i] + c = s[i] + if ord(c) < 0x80 and not c.isalnum(): + if uni: + if c == u'\000': + s[i] = u'\\000' + else: + s[i] = u"\\" + c + else: + if c == b'\000': + s[i] = b'\\000' + else: + s[i] = b'\\' + c + return u''.join(s) if uni else b''.join(s) + + +class BackreferencesException(Exception): + """Search pattern contains backreferences.""" + pass + + +class CharClassProblemException(Exception): + """Search pattern contains unsupported character class.""" + pass + + +def set_fallback_notification(level): + """Set the fallback notification to a level; one of: + FALLBACK_QUIETLY + FALLBACK_WARNING + FALLBACK_EXCEPTION + """ + global current_notification + level = int(level) + if level < 0 or level > 2: + raise ValueError("This function expects a valid notification level.") + current_notification = level + + +cdef bint ishex(unsigned char c): + """Test whether ``c`` is in ``[0-9a-fA-F]``""" + return (b'0' <= c <= b'9' or b'a' <= c <= b'f' or b'A' <= c <= b'F') + + +cdef bint isoct(unsigned char c): + """Test whether ``c`` is in ``[0-7]``""" + return b'0' <= c <= b'7' + + +cdef bint isdigit(unsigned char c): + """Test whether ``c`` is in ``[0-9]``""" + return b'0' <= c <= b'9' + + +cdef bint isident(unsigned char c): + """Test whether ``c`` is in ``[a-zA-Z0-9_]``""" + return (b'a' <= c <= b'z' or b'A' <= c <= b'Z' + or b'0' <= c <= b'9' or c == b'_') + + +cdef inline bytes cpp_to_bytes(cpp_string input): + """Convert from a std::string object to a python string.""" + # By taking the slice we go to the right size, + # despite spurious or missing null characters. + return input.data()[:input.length()] + + +cdef inline unicode cpp_to_unicode(cpp_string input): + """Convert a std::string object to a unicode string.""" + return cpython.unicode.PyUnicode_DecodeUTF8( + input.data(), input.length(), 'strict') + + +cdef inline unicode char_to_unicode(const char * input, int length): + """Convert a C string to a unicode string.""" + return cpython.unicode.PyUnicode_DecodeUTF8(input, length, 'strict') + + +cdef inline unicode_to_bytes(object pystring, int * encoded, + int checkotherencoding): + """Convert a unicode string to a utf8 bytes object, if necessary. + + If pystring is a bytes string or a buffer, return unchanged. + If checkotherencoding is 0 or 1 and using Python 3, raise an error + if its truth value is not equal to that of encoded. + encoded is set to 1 if encoded string can be treated as ASCII, + and 2 if it contains multibyte unicode characters.""" + if cpython.unicode.PyUnicode_Check(pystring): + origlen = len(pystring) + pystring = pystring.encode('utf8') + encoded[0] = 1 if origlen == len(pystring) else 2 + else: + encoded[0] = 0 + if not PY2 and checkotherencoding > 0 and not encoded[0]: + raise TypeError("can't use a string pattern on a bytes-like object") + elif not PY2 and checkotherencoding == 0 and encoded[0]: + raise TypeError("can't use a bytes pattern on a string-like object") + return pystring + + +cdef inline int pystring_to_cstring( + object pystring, char ** cstring, Py_ssize_t * size, + Py_buffer * buf): + """Get a pointer from bytes/buffer object ``pystring``. + + On success, return 0, and set ``cstring``, ``size``, and ``buf``.""" + cdef int result = -1 + cstring[0] = NULL + size[0] = 0 + if PyObject_CheckBuffer(pystring) == 1: # new-style Buffer interface + result = PyObject_GetBuffer(pystring, buf, PyBUF_SIMPLE) + if result == 0: + cstring[0] = <char *>buf.buf + size[0] = buf.len + return result + + +cdef inline void release_cstring(Py_buffer *buf): + """Release buffer if necessary.""" + if not PY2: + PyBuffer_Release(buf) + + +cdef utf8indices(char * cstring, int size, int *pos, int *endpos): + """Convert unicode indices ``pos`` and ``endpos`` to UTF-8 indices. + + If the indices are out of range, leave them unchanged.""" + cdef unsigned char * data = <unsigned char *>cstring + cdef int newpos = pos[0], newendpos = -1 + cdef int cpos = 0, upos = 0 + while cpos < size: + if data[cpos] < 0x80: + cpos += 1 + upos += 1 + elif data[cpos] < 0xe0: + cpos += 2 + upos += 1 + elif data[cpos] < 0xf0: + cpos += 3 + upos += 1 + else: + cpos += 4 + upos += 1 + # wide unicode chars get 2 unichars when Python <3.3 is compiled + # with --enable-unicode=ucs2 + emit_if_narrow_unicode() + upos += 1 + emit_endif() + + if upos == pos[0]: + newpos = cpos + if endpos[0] == -1: + break + elif upos == endpos[0]: + newendpos = cpos + break + pos[0] = newpos + endpos[0] = newendpos + + +cdef void unicodeindices(map[int, int] &positions, + char * cstring, int size, int * cpos, int * upos): + """Convert UTF-8 byte indices to unicode indices.""" + cdef unsigned char * s = <unsigned char *>cstring + cdef map[int, int].iterator it = positions.begin() + + if dereference(it).first == -1: + dereference(it).second = -1 + postincrement(it) + if it == positions.end(): + return + if dereference(it).first == cpos[0]: + dereference(it).second = upos[0] + postincrement(it) + if it == positions.end(): + return + + while cpos[0] < size: + if s[cpos[0]] < 0x80: + cpos[0] += 1 + upos[0] += 1 + elif s[cpos[0]] < 0xe0: + cpos[0] += 2 + upos[0] += 1 + elif s[cpos[0]] < 0xf0: + cpos[0] += 3 + upos[0] += 1 + else: + cpos[0] += 4 + upos[0] += 1 + # wide unicode chars get 2 unichars when Python <3.3 is compiled + # with --enable-unicode=ucs2 + emit_if_narrow_unicode() + upos[0] += 1 + emit_endif() + + if dereference(it).first == cpos[0]: + dereference(it).second = upos[0] + postincrement(it) + if it == positions.end(): + break + + +__all__ = [ + # exceptions + 'BackreferencesException', 'CharClassProblemException', + 'RegexError', 'error', + # constants + 'FALLBACK_EXCEPTION', 'FALLBACK_QUIETLY', 'FALLBACK_WARNING', 'DEBUG', + 'S', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', + 'U', 'UNICODE', 'X', 'VERBOSE', 'VERSION', 'VERSION_HEX', + # classes + 'Match', 'Pattern', 'SREPattern', + # functions + 'compile', 'count', 'escape', 'findall', 'finditer', 'fullmatch', + 'match', 'purge', 'search', 'split', 'sub', 'subn', + 'set_fallback_notification', + ] |