diff options
author | vitalyisaev <vitalyisaev@ydb.tech> | 2023-11-30 13:26:22 +0300 |
---|---|---|
committer | vitalyisaev <vitalyisaev@ydb.tech> | 2023-11-30 15:44:45 +0300 |
commit | 0a98fece5a9b54f16afeb3a94b3eb3105e9c3962 (patch) | |
tree | 291d72dbd7e9865399f668c84d11ed86fb190bbf /contrib/python/pyre2/py3/src/pattern.pxi | |
parent | cb2c8d75065e5b3c47094067cb4aa407d4813298 (diff) | |
download | ydb-0a98fece5a9b54f16afeb3a94b3eb3105e9c3962.tar.gz |
YQ Connector:Use docker-compose in integrational tests
Diffstat (limited to 'contrib/python/pyre2/py3/src/pattern.pxi')
-rw-r--r-- | contrib/python/pyre2/py3/src/pattern.pxi | 650 |
1 files changed, 650 insertions, 0 deletions
# Contents of contrib/python/pyre2/py3/src/pattern.pxi (Cython include file).

cdef class Pattern:
    """A compiled RE2 regular expression exposing an ``re.Pattern``-like API.

    Besides the usual ``search``/``match``/``findall``/... methods, this
    class offers ``contains`` and ``count``.
    """
    cdef readonly object pattern  # original pattern in Python format
    cdef readonly int flags
    cdef readonly int groups  # number of groups
    cdef readonly dict groupindex  # name => group number
    cdef object __weakref__

    cdef bint encoded  # True if this was originally a Unicode pattern
    cdef RE2 * re_pattern

    def search(self, object string, int pos=0, int endpos=-1):
        """Scan through string looking for a match, and return a corresponding
        Match instance. Return None if no position in the string matches."""
        return self._search(string, pos, endpos, UNANCHORED)

    def match(self, object string, int pos=0, int endpos=-1):
        """Matches zero or more characters at the beginning of the string."""
        return self._search(string, pos, endpos, ANCHOR_START)

    def fullmatch(self, object string, int pos=0, int endpos=-1):
        """fullmatch(string[, pos[, endpos]]) --> Match object or None.

        Matches the entire string."""
        return self._search(string, pos, endpos, ANCHOR_BOTH)

    cdef _search(self, object string, int pos, int endpos,
            re2_Anchor anchoring):
        """Run a single match attempt with the given anchoring.

        Shared implementation of ``search``, ``match`` and ``fullmatch``.
        Returns a populated Match instance, or None if there is no match.
        Raises TypeError if ``string`` cannot be viewed as a C buffer."""
        cdef char * cstring
        cdef Py_ssize_t size
        cdef Py_buffer buf
        cdef int retval
        cdef int encoded = 0
        cdef StringPiece * sp
        cdef Match m = Match(self, self.groups + 1)
        cdef int cpos = 0, upos = pos

        # An explicit endpos at or before pos is treated as "no match".
        if 0 <= endpos <= pos:
            return None

        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
            raise TypeError('expected string or buffer')
        try:
            # NOTE(review): encoded == 2 presumably marks text whose
            # character offsets differ from byte offsets, so pos/endpos are
            # translated by utf8indices -- confirm in unicode_to_bytes.
            if encoded == 2 and (pos or endpos != -1):
                utf8indices(cstring, size, &pos, &endpos)
                cpos = pos
            if pos > size:
                return None
            if 0 <= endpos < size:
                size = endpos

            sp = new StringPiece(cstring, size)
            with nogil:
                retval = self.re_pattern.Match(
                        sp[0],
                        pos,
                        size,
                        anchoring,
                        m.matches,
                        self.groups + 1)
            del sp
            if retval == 0:
                return None

            m.encoded = encoded
            m.nmatches = self.groups + 1
            m.string = string
            m.pos = pos
            if endpos == -1:
                m.endpos = size
            else:
                m.endpos = endpos
            m._make_spans(cstring, size, &cpos, &upos)
            m._init_groups()
        finally:
            # Always release the buffer obtained by pystring_to_cstring.
            release_cstring(&buf)
        return m

    def contains(self, object string, int pos=0, int endpos=-1):
        """contains(string[, pos[, endpos]]) --> bool.

        Scan through string looking for a match, and return True or False."""
        cdef char * cstring
        cdef Py_ssize_t size
        cdef Py_buffer buf
        cdef int retval
        cdef int encoded = 0
        cdef StringPiece * sp

        if 0 <= endpos <= pos:
            return False

        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
            raise TypeError('expected string or buffer')
        try:
            if encoded == 2 and (pos or endpos != -1):
                utf8indices(cstring, size, &pos, &endpos)
            if pos > size:
                return False
            if 0 <= endpos < size:
                size = endpos

            sp = new StringPiece(cstring, size)
            with nogil:
                # No submatches are requested (NULL, 0): only the boolean
                # outcome is needed.
                retval = self.re_pattern.Match(
                        sp[0],
                        pos,
                        size,
                        UNANCHORED,
                        NULL,
                        0)
            del sp
        finally:
            release_cstring(&buf)
        return retval != 0

    def count(self, object string, int pos=0, int endpos=-1):
        """Return number of non-overlapping matches of pattern in string."""
        cdef char * cstring
        cdef Py_ssize_t size
        cdef Py_buffer buf
        cdef int retval
        cdef int encoded = 0
        cdef int result = 0
        cdef StringPiece * sp = NULL
        cdef StringPiece * matches = NULL

        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
            raise TypeError('expected string or buffer')
        try:
            if encoded == 2 and (pos or endpos != -1):
                utf8indices(cstring, size, &pos, &endpos)
            if pos > size:
                return 0
            if 0 <= endpos < size:
                size = endpos

            sp = new StringPiece(cstring, size)
            matches = new_StringPiece_array(1)
            try:
                while True:
                    with nogil:
                        retval = self.re_pattern.Match(
                                sp[0],
                                pos,
                                size,
                                UNANCHORED,
                                matches,
                                1)
                    if retval == 0:
                        break
                    result += 1
                    if pos == size:
                        break
                    # offset the pos to move to the next point
                    # (an empty match still advances by one byte)
                    pos = matches[0].data() - cstring + (
                            matches[0].length() or 1)
            finally:
                del sp
                delete_StringPiece_array(matches)
        finally:
            release_cstring(&buf)
        return result

    def findall(self, object string, int pos=0, int endpos=-1):
        """Return all non-overlapping matches of pattern in string as a list
        of strings.

        With more than one group, each list item is a tuple of the groups;
        otherwise each item is the single group (or the whole match when the
        pattern has no groups)."""
        cdef char * cstring
        cdef Py_ssize_t size
        cdef Py_buffer buf
        cdef int encoded = 0
        cdef int retval
        cdef list resultlist = []
        cdef StringPiece * sp = NULL
        cdef StringPiece * matches = NULL

        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
            raise TypeError('expected string or buffer')
        try:
            if encoded == 2 and (pos or endpos != -1):
                utf8indices(cstring, size, &pos, &endpos)
            if pos > size:
                return []
            if 0 <= endpos < size:
                size = endpos

            sp = new StringPiece(cstring, size)
            matches = new_StringPiece_array(self.groups + 1)

            while True:
                with nogil:
                    retval = self.re_pattern.Match(
                            sp[0],
                            pos,
                            size,
                            UNANCHORED,
                            matches,
                            self.groups + 1)
                if retval == 0:
                    break
                if self.groups > 1:
                    # Unmatched groups (NULL data) become empty strings.
                    if encoded:
                        resultlist.append(tuple([
                            '' if matches[i].data() is NULL else
                            matches[i].data()[:matches[i].length()
                                ].decode('utf8')
                            for i in range(1, self.groups + 1)]))
                    else:
                        resultlist.append(tuple([
                            b'' if matches[i].data() is NULL
                            else matches[i].data()[:matches[i].length()]
                            for i in range(1, self.groups + 1)]))
                else:  # 0 or 1 group; return list of strings
                    if encoded:
                        resultlist.append(matches[self.groups].data()[
                            :matches[self.groups].length()].decode('utf8'))
                    else:
                        resultlist.append(matches[self.groups].data()[
                            :matches[self.groups].length()])
                if pos == size:
                    break
                # offset the pos to move to the next point
                pos = matches[0].data() - cstring + (matches[0].length() or 1)
        finally:
            del sp
            delete_StringPiece_array(matches)
            release_cstring(&buf)
        return resultlist

    def finditer(self, object string, int pos=0, int endpos=-1):
        """Yield all non-overlapping matches of pattern in string as Match
        objects."""
        result = iter(self._finditer(string, pos, endpos))
        # Advance to the generator's dummy first yield so that argument
        # errors (e.g. TypeError) are raised here rather than on first
        # iteration. The default is required: when pos lies beyond the end
        # of the string the generator finishes before yielding, and a bare
        # next() would leak StopIteration to the caller instead of
        # returning an empty iterator.
        next(result, None)
        return result

    def _finditer(self, object string, int pos=0, int endpos=-1):
        """Generator behind ``finditer``.

        Yields one dummy value (None) after validating its input and then
        one Match object per non-overlapping match."""
        cdef char * cstring
        cdef Py_ssize_t size
        cdef Py_buffer buf
        cdef int retval
        cdef StringPiece * sp = NULL
        cdef Match m
        cdef int encoded = 0
        cdef int cpos = 0, upos = pos

        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
            raise TypeError('expected string or buffer')
        try:
            if encoded == 2 and (pos or endpos != -1):
                utf8indices(cstring, size, &pos, &endpos)
                cpos = pos
            if pos > size:
                return
            if 0 <= endpos < size:
                size = endpos

            sp = new StringPiece(cstring, size)

            yield  # dummy value consumed by finditer()
            while True:
                m = Match(self, self.groups + 1)
                m.string = string
                with nogil:
                    retval = self.re_pattern.Match(
                            sp[0],
                            pos,
                            size,
                            UNANCHORED,
                            m.matches,
                            self.groups + 1)
                if retval == 0:
                    break
                m.encoded = encoded
                m.nmatches = self.groups + 1
                m.pos = pos
                if endpos == -1:
                    m.endpos = size
                else:
                    m.endpos = endpos
                m._make_spans(cstring, size, &cpos, &upos)
                m._init_groups()
                yield m
                if pos == size:
                    break
                # offset the pos to move to the next point
                pos = m.matches[0].data() - cstring + (
                        m.matches[0].length() or 1)
        finally:
            del sp
            release_cstring(&buf)

    def split(self, string, int maxsplit=0):
        """split(string[, maxsplit = 0]) --> list

        Split a string by the occurrences of the pattern. Captured groups
        are included in the result (None for groups that did not match).
        A negative ``maxsplit`` is treated as 0 (no limit)."""
        cdef char * cstring
        cdef Py_ssize_t size
        cdef int retval
        cdef int pos = 0
        cdef int lookahead = 0
        cdef int num_split = 0
        cdef StringPiece * sp
        cdef StringPiece * matches
        cdef list resultlist = []
        cdef int encoded = 0
        cdef Py_buffer buf

        if maxsplit < 0:
            maxsplit = 0

        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
            raise TypeError('expected string or buffer')
        matches = new_StringPiece_array(self.groups + 1)
        sp = new StringPiece(cstring, size)
        try:

            while True:
                with nogil:
                    retval = self.re_pattern.Match(
                            sp[0],
                            pos + lookahead,
                            size,
                            UNANCHORED,
                            matches,
                            self.groups + 1)
                if retval == 0:
                    break

                match_start = matches[0].data() - cstring
                match_end = match_start + matches[0].length()

                # If an empty match, just look ahead until you find something
                if match_start == match_end:
                    if pos + lookahead == size:
                        break
                    lookahead += 1
                    continue

                if encoded:
                    resultlist.append(
                            char_to_unicode(&sp.data()[pos], match_start - pos))
                else:
                    resultlist.append(sp.data()[pos:match_start])
                if self.groups > 0:
                    for group in range(self.groups):
                        if matches[group + 1].data() == NULL:
                            resultlist.append(None)
                        else:
                            if encoded:
                                resultlist.append(char_to_unicode(
                                        matches[group + 1].data(),
                                        matches[group + 1].length()))
                            else:
                                resultlist.append(matches[group + 1].data()[:
                                        matches[group + 1].length()])

                # offset the pos to move to the next point
                pos = match_end
                lookahead = 0

                num_split += 1
                if maxsplit and num_split >= maxsplit:
                    break

            # Remainder of the string after the last split point.
            if encoded:
                resultlist.append(
                        char_to_unicode(&sp.data()[pos], sp.length() - pos))
            else:
                resultlist.append(sp.data()[pos:])
        finally:
            del sp
            delete_StringPiece_array(matches)
            release_cstring(&buf)
        return resultlist

    def sub(self, repl, string, int count=0):
        """sub(repl, string[, count = 0]) --> newstring

        Return the string obtained by replacing the leftmost non-overlapping
        occurrences of pattern in string by the replacement repl."""
        cdef int num_repl = 0
        return self._subn(repl, string, count, &num_repl)

    def subn(self, repl, string, int count=0):
        """subn(repl, string[, count = 0]) --> (newstring, number of subs)

        Return the tuple (new_string, number_of_subs_made) found by replacing
        the leftmost non-overlapping occurrences of pattern with the
        replacement repl."""
        cdef int num_repl = 0
        result = self._subn(repl, string, count, &num_repl)
        return result, num_repl

    cdef _subn(self, repl, string, int count, int *num_repl):
        """Shared implementation of ``sub`` and ``subn``.

        Dispatches to ``_subn_callback`` for callable replacements, to
        ``_subn_expand`` when ``count > 1`` or the replacement contains a
        backslash, and otherwise uses RE2's Replace/GlobalReplace directly.
        The number of substitutions made is written to ``num_repl[0]``."""
        cdef bytes repl_b
        cdef char * cstring
        cdef object result
        cdef Py_ssize_t size
        cdef StringPiece * sp = NULL
        cdef cpp_string * input_str = NULL
        cdef int string_encoded = 0
        cdef int repl_encoded = 0

        if callable(repl):
            # This is a callback, so use the custom function
            return self._subn_callback(repl, string, count, num_repl)

        repl_b = unicode_to_bytes(repl, &repl_encoded, self.encoded)
        if not repl_encoded and not isinstance(repl, bytes):
            repl_b = bytes(repl)  # coerce buffer to bytes object

        if count > 1 or (b'\\' if PY2 else <char>b'\\') in repl_b:
            # Limit on number of substitutions or replacement string contains
            # escape sequences; handle with Match.expand() implementation.
            # RE2 does support simple numeric group references \1, \2,
            # but the number of differences with Python behavior is
            # non-trivial.
            return self._subn_expand(repl_b, string, count, num_repl)
        try:
            cstring = repl_b
            size = len(repl_b)
            sp = new StringPiece(cstring, size)

            bytestr = unicode_to_bytes(string, &string_encoded, self.encoded)
            if not string_encoded and not isinstance(bytestr, bytes):
                bytestr = bytes(bytestr)  # coerce buffer to bytes object
            input_str = new cpp_string(<char *>bytestr, len(bytestr))
            # NB: RE2 treats unmatched groups in repl as empty string;
            # Python raises an error.
            with nogil:
                if count == 0:
                    num_repl[0] = GlobalReplace(
                            input_str, self.re_pattern[0], sp[0])
                elif count == 1:
                    num_repl[0] = Replace(
                            input_str, self.re_pattern[0], sp[0])

            if string_encoded or (repl_encoded and num_repl[0] > 0):
                result = cpp_to_unicode(input_str[0])
            else:
                result = cpp_to_bytes(input_str[0])
        finally:
            # Both pointers start out NULL, so this is safe on early failure.
            del input_str, sp
        return result

    cdef _subn_callback(self, callback, string, int count, int * num_repl):
        """Perform substitutions where the replacement is computed by
        calling ``callback`` with each Match object.

        A negative ``count`` is treated as 0 (replace all)."""
        # This function is probably the hardest to implement correctly.
        # This is my first attempt, but if anybody has a better solution,
        # please help out.
        cdef char * cstring
        cdef Py_ssize_t size
        cdef Py_buffer buf
        cdef int retval
        cdef int prevendpos = -1
        cdef int endpos = 0
        cdef int pos = 0
        cdef int encoded = 0
        cdef StringPiece * sp
        cdef Match m
        cdef bytearray result = bytearray()
        cdef int cpos = 0, upos = 0

        if count < 0:
            count = 0

        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
            raise TypeError('expected string or buffer')
        sp = new StringPiece(cstring, size)
        try:
            while True:
                m = Match(self, self.groups + 1)
                m.string = string
                with nogil:
                    retval = self.re_pattern.Match(
                            sp[0],
                            pos,
                            size,
                            UNANCHORED,
                            m.matches,
                            self.groups + 1)
                if retval == 0:
                    break

                endpos = m.matches[0].data() - cstring
                # Step forward by one when the match starts where the
                # previous one did, so the loop always makes progress.
                if endpos == prevendpos:
                    endpos += 1
                    if endpos > size:
                        break
                prevendpos = endpos
                # Copy the unmatched text preceding this match.
                result.extend(sp.data()[pos:endpos])
                pos = endpos + m.matches[0].length()

                m.encoded = encoded
                m.nmatches = self.groups + 1
                m._make_spans(cstring, size, &cpos, &upos)
                m._init_groups()
                tmp = callback(m)
                # A falsy return value (None, '' or b'') contributes nothing.
                if tmp:
                    result.extend(tmp.encode('utf8') if encoded else tmp)

                num_repl[0] += 1
                if count and num_repl[0] >= count:
                    break
            result.extend(sp.data()[pos:])
        finally:
            del sp
            release_cstring(&buf)
        return result.decode('utf8') if encoded else bytes(result)

    cdef _subn_expand(self, bytes repl, string, int count, int * num_repl):
        """Perform ``count`` substitutions with replacement string and
        Match.expand.

        A negative ``count`` is treated as 0 (replace all)."""
        cdef char * cstring
        cdef Py_ssize_t size
        cdef Py_buffer buf
        cdef int retval
        cdef int prevendpos = -1
        cdef int endpos = 0
        cdef int pos = 0
        cdef int encoded = 0
        cdef StringPiece * sp
        cdef Match m
        cdef bytearray result = bytearray()

        if count < 0:
            count = 0

        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
            raise TypeError('expected string or buffer')
        sp = new StringPiece(cstring, size)
        try:
            while True:
                m = Match(self, self.groups + 1)
                m.string = string
                with nogil:
                    retval = self.re_pattern.Match(
                            sp[0],
                            pos,
                            size,
                            UNANCHORED,
                            m.matches,
                            self.groups + 1)
                if retval == 0:
                    break

                endpos = m.matches[0].data() - cstring
                # Step forward by one when the match starts where the
                # previous one did, so the loop always makes progress.
                if endpos == prevendpos:
                    endpos += 1
                    if endpos > size:
                        break
                prevendpos = endpos
                result.extend(sp.data()[pos:endpos])
                pos = endpos + m.matches[0].length()

                m.encoded = encoded
                m.nmatches = self.groups + 1
                m._init_groups()
                # Appends the expanded replacement to ``result`` in place.
                m._expand(repl, result)

                num_repl[0] += 1
                if count and num_repl[0] >= count:
                    break
            result.extend(sp.data()[pos:])
        finally:
            del sp
            release_cstring(&buf)
        return result.decode('utf8') if encoded else bytes(result)

    def scanner(self, arg):
        """Return a scanner from the standard ``re`` module for ``arg``.

        RE2 is not used here; the original pattern is recompiled with
        ``re``."""
        # NOTE(review): self.flags is not forwarded to re.compile, so a
        # pattern compiled with flags scans as if it had none -- confirm
        # whether these flags are compatible with ``re`` before passing them.
        return re.compile(self.pattern).scanner(arg)
        # raise NotImplementedError

    def _dump_pattern(self):
        """Return the pattern string as stored by the underlying RE2 object
        (str for Unicode patterns, bytes otherwise)."""
        cdef cpp_string s = self.re_pattern.pattern()
        if self.encoded:
            return cpp_to_bytes(s).decode('utf8')
        return cpp_to_bytes(s)

    def __repr__(self):
        if self.flags == 0:
            return 're2.compile(%r)' % self.pattern
        return 're2.compile(%r, %r)' % (self.pattern, self.flags)

    def __reduce__(self):
        # Pickle by recompiling from the original pattern and flags.
        return (compile, (self.pattern, self.flags))

    def __dealloc__(self):
        del self.re_pattern


class PythonRePattern:
    """A wrapper for re.Pattern to support the extra methods defined by re2
    (contains, count)."""
    def __init__(self, pattern, flags=None):
        # re.compile() rejects None as flags; treat it as "no flags".
        if flags is None:
            flags = 0
        self._pattern = re.compile(pattern, flags)
        self.pattern = pattern
        self.flags = flags
        self.groupindex = self._pattern.groupindex
        self.groups = self._pattern.groups

    def contains(self, string, pos=0, endpos=9223372036854775807):
        """Return True if the pattern matches anywhere in
        ``string[pos:endpos]``, mirroring ``Pattern.contains``."""
        return bool(self._pattern.search(string, pos, endpos))

    def count(self, string, pos=0, endpos=9223372036854775807):
        """Return number of non-overlapping matches of pattern in string."""
        return len(self._pattern.findall(string, pos, endpos))

    def findall(self, string, pos=0, endpos=9223372036854775807):
        return self._pattern.findall(string, pos, endpos)

    def finditer(self, string, pos=0, endpos=9223372036854775807):
        return self._pattern.finditer(string, pos, endpos)

    def fullmatch(self, string, pos=0, endpos=9223372036854775807):
        return self._pattern.fullmatch(string, pos, endpos)

    def match(self, string, pos=0, endpos=9223372036854775807):
        return self._pattern.match(string, pos, endpos)

    def scanner(self, string, pos=0, endpos=9223372036854775807):
        return self._pattern.scanner(string, pos, endpos)

    def search(self, string, pos=0, endpos=9223372036854775807):
        return self._pattern.search(string, pos, endpos)

    def split(self, string, maxsplit=0):
        return self._pattern.split(string, maxsplit)

    def sub(self, repl, string, count=0):
        return self._pattern.sub(repl, string, count)

    def subn(self, repl, string, count=0):
        return self._pattern.subn(repl, string, count)

    def __repr__(self):
        return repr(self._pattern)

    def __reduce__(self):
        # The first item must be a callable that rebuilds the object; an
        # instance is not callable, so return the class instead.
        return (self.__class__, (self.pattern, self.flags))