diff options
author | alexv-smirnov <alex@ydb.tech> | 2023-03-15 19:59:12 +0300 |
---|---|---|
committer | alexv-smirnov <alex@ydb.tech> | 2023-03-15 19:59:12 +0300 |
commit | 056bb284ccf8dd6793ec3a54ffa36c4fb2b9ad11 (patch) | |
tree | 4740980126f32e3af7937ba0ca5f83e59baa4ab0 /contrib/tools/cython/Cython/Compiler/StringEncoding.py | |
parent | 269126dcced1cc8b53eb4398b4a33e5142f10290 (diff) | |
download | ydb-056bb284ccf8dd6793ec3a54ffa36c4fb2b9ad11.tar.gz |
add library/cpp/actors, ymake build to ydb oss export
Diffstat (limited to 'contrib/tools/cython/Cython/Compiler/StringEncoding.py')
-rw-r--r-- | contrib/tools/cython/Cython/Compiler/StringEncoding.py | 363 |
1 files changed, 363 insertions, 0 deletions
#
#   Cython -- encoding related tools
#

from __future__ import absolute_import

import re
import sys

# Version-neutral aliases for the text/bytes types and the "code point ->
# one-character text string" constructor.
if sys.version_info[0] >= 3:
    _unicode, _str, _bytes, _unichr = str, str, bytes, chr
    IS_PYTHON3 = True
else:
    _unicode, _str, _bytes, _unichr = unicode, str, str, unichr
    IS_PYTHON3 = False

empty_bytes = _bytes()
empty_unicode = _unicode()

join_bytes = empty_bytes.join


class UnicodeLiteralBuilder(object):
    """Assemble a unicode string.
    """
    def __init__(self):
        # list of unicode chunks, joined by getstring()
        self.chars = []

    def append(self, characters):
        """Append a chunk of text.

        Byte strings are decoded as ASCII first; after that the value
        must be a unicode string.
        """
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    # The implementation of append_charval() is picked once, at class
    # creation time, depending on the width of the running interpreter's
    # Unicode type.
    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Append the character with code point 'char_number'."""
            if char_number > 65535:
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                char_number -= 0x10000
                self.chars.append(_unichr((char_number // 1024) + 0xD800))
                self.chars.append(_unichr((char_number % 1024) + 0xDC00))
            else:
                self.chars.append(_unichr(char_number))
    else:
        def append_charval(self, char_number):
            """Append the character with code point 'char_number'."""
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Append the character of a unicode escape.

        Only 'char_number' is used here; 'escape_string' is ignored.
        """
        self.append_charval(char_number)

    def getstring(self):
        """Return everything appended so far as an EncodedString."""
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        """Return a (bytes, unicode) pair; the bytes part is always None."""
        return (None, self.getstring())


class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.
    """
    def __init__(self, target_encoding):
        # list of byte string chunks, joined by getstring()
        self.chars = []
        # used to encode unicode input and stored on the resulting literal
        self.target_encoding = target_encoding

    def append(self, characters):
        """Append a chunk of data.

        Unicode strings are encoded with the target encoding first;
        after that the value must be a byte string.
        """
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        """Append the single byte with value 'char_number'.

        The value goes through an ISO-8859-1 encoding step, so values
        above 255 fail with a UnicodeEncodeError.
        """
        self.chars.append(_unichr(char_number).encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        """Append the text of a unicode escape.

        The escape sequence itself ('escape_string') is appended, not
        the character it denotes; 'char_number' is ignored.
        """
        self.append(escape_string)

    def getstring(self):
        """Return everything appended so far as a BytesLiteral."""
        # this *must* return a byte string!
        return bytes_literal(join_bytes(self.chars), self.target_encoding)

    def getchar(self):
        """Return the assembled value; same result as getstring()."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a (bytes, unicode) pair; the unicode part is always None."""
        return (self.getstring(), None)


class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append 'characters' to both representations."""
        self._bytes.append(characters)
        self._unicode.append(characters)

    def append_charval(self, char_number):
        """Append the character value to both representations."""
        self._bytes.append_charval(char_number)
        self._unicode.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape.

        The bytes representation receives the escape text, the unicode
        representation receives the character itself.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the (bytes, unicode) pair of assembled strings."""
        return (self._bytes.getstring(), self._unicode.getstring())


class EncodedString(_unicode):
    # unicode string subclass to keep track of the original encoding.
    # 'encoding' is None for unicode strings and the source encoding
    # otherwise
    encoding = None

    def __deepcopy__(self, memo):
        """Return this very object instead of a copy."""
        return self

    def byteencode(self):
        """Encode the string with its own (non-None) encoding."""
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        """Encode the string as UTF-8; only valid when 'encoding' is None."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is attached to the string."""
        return self.encoding is None

    def contains_surrogates(self):
        """See string_contains_surrogates()."""
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        """Return the UTF-8 encoded form as a BytesLiteral tagged 'utf8'."""
        return bytes_literal(self.utf8encode(), 'utf8')


def string_contains_surrogates(ustring):
    """
    Check if the unicode string contains surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform.
    """
    for c in map(ord, ustring):
        if c > 65535:  # can only happen on wide platforms
            return True
        if 0xD800 <= c <= 0xDFFF:
            return True
    return False


def string_contains_lone_surrogates(ustring):
    """
    Check if the unicode string contains lone surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform, but that do not form a pair.
    """
    # True while the previous code unit was a high ("start") surrogate
    # that is still waiting for its low ("end") partner.
    last_was_start = False
    unicode_uses_surrogate_encoding = sys.maxunicode == 65535
    for c in map(ord, ustring):
        # surrogates tend to be rare
        if c < 0xD800 or c > 0xDFFF:
            if last_was_start:
                return True  # start surrogate followed by a normal character
        elif not unicode_uses_surrogate_encoding:
            # on 32bit Unicode platforms, there is never a pair
            return True
        elif c <= 0xDBFF:
            if last_was_start:
                return True  # lone start
            last_was_start = True
        else:
            if not last_was_start:
                return True  # lone end
            last_was_start = False
    # a start surrogate at the very end of the string is lone as well
    return last_was_start


class BytesLiteral(_bytes):
    # bytes subclass that is compatible with EncodedString
    encoding = None

    def __deepcopy__(self, memo):
        """Return this very object instead of a copy."""
        return self

    def byteencode(self):
        """Return the content as a plain byte string object."""
        if IS_PYTHON3:
            return _bytes(self)
        else:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')

    def utf8encode(self):
        """Always fail: a byte string cannot be encoded as UTF-8 text."""
        # Raised explicitly (rather than via "assert False") so that the
        # failure is not optimised away under "python -O".
        raise AssertionError("this is not a unicode string: %r" % self)

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    is_unicode = False

    def as_c_string_literal(self):
        """Return the escaped content wrapped in double quotes."""
        value = split_string_literal(escape_byte_string(self))
        return '"%s"' % value


def bytes_literal(s, encoding):
    """Wrap the byte string 's' in a BytesLiteral tagged with 'encoding'."""
    assert isinstance(s, _bytes)
    s = BytesLiteral(s)
    s.encoding = encoding
    return s


def encoded_string(s, encoding):
    """Wrap 's' in an EncodedString.

    'encoding' is stored on the result unless it is None, in which case
    the class default (None) stays in place.
    """
    assert isinstance(s, (_unicode, _bytes))
    s = EncodedString(s)
    if encoding is not None:
        s.encoding = encoding
    return s


# Lookup function: two-character escape sequence -> the character it
# denotes, or None for sequences that are not in the table.
char_from_escape_sequence = {
    r'\a' : u'\a',
    r'\b' : u'\b',
    r'\f' : u'\f',
    r'\n' : u'\n',
    r'\r' : u'\r',
    r'\t' : u'\t',
    r'\v' : u'\v',
    }.get

# Character sequences that get replaced by an escape sequence in
# escape_byte_string(): backslash, "??", the double quote and all
# characters below 32.
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))


def _to_escape_sequence(s):
    """Return the escape sequence text for one entry of '_c_special'."""
    if s in '\n\r\t':
        return repr(s)[1:-1]
    elif s == '"':
        return r'\"'
    elif s == '\\':
        return r'\\'
    else:
        # within a character sequence, oct passes much better than hex
        return ''.join(['\\%03o' % ord(c) for c in s])


def _build_specials_replacer():
    """Build the function that escapes all '_c_special' sequences.

    The returned function takes a byte string and returns a byte string
    in which every special sequence is replaced by its escape sequence.
    """
    subexps = []
    replacements = {}
    for special in _c_special:
        # one single-character class per character, with backslashes doubled
        regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
        subexps.append(regexp)
        replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
    sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub

    def replace_specials(m):
        return replacements[m.group(1)]

    def replace(s):
        return sub(replace_specials, s)

    return replace

_replace_specials = _build_specials_replacer()


def escape_char(c):
    """Return the escaped text form of the single character 'c'.

    On Python 3, 'c' must be a byte string; it is decoded as ISO-8859-1
    before being processed.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    elif c == "'":
        return "\\'"
    n = ord(c)
    if n < 32 or n > 127:
        # hex works well for characters
        return "\\x%02X" % n
    else:
        return c


def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
    s = _replace_specials(s)
    try:
        return s.decode("ASCII")  # trial decoding: plain ASCII => done
    except UnicodeDecodeError:
        pass
    # Non-ASCII bytes are left: spell each of them as an octal escape.
    # Values >= 128 always need three octal digits; '%03o' states the
    # zero padding explicitly, as in _to_escape_sequence().
    if IS_PYTHON3:
        s_new = bytearray()
        append, extend = s_new.append, s_new.extend
        for b in s:
            if b >= 128:
                extend(('\\%03o' % b).encode('ASCII'))
            else:
                append(b)
        return s_new.decode('ISO-8859-1')
    else:
        l = []
        append = l.append
        for c in s:
            o = ord(c)
            if o >= 128:
                append('\\%03o' % o)
            else:
                append(c)
        return join_bytes(l).decode('ISO-8859-1')


def split_string_literal(s, limit=2000):
    """Split a long escaped string into chunks joined by '""'.

    Strings shorter than 'limit' are returned unchanged.  Otherwise the
    string is cut into pieces of at most 'limit' characters; a cut is
    moved to the left when a backslash sits within the last four
    characters of a piece, so that it lands in front of the backslash.
    """
    # MSVC can't handle long string literals.
    if len(s) < limit:
        return s
    else:
        start = 0
        chunks = []
        while start < len(s):
            end = start + limit
            if len(s) > end-4 and '\\' in s[end-4:end]:
                end -= 4 - s[end-4:end].find('\\')  # just before the backslash
                while s[end-1] == '\\':
                    end -= 1
                    if end == start:
                        # must have been a long line of backslashes
                        end = start + limit - (limit % 2) - 4
                        break
            chunks.append(s[start:end])
            start = end
        return '""'.join(chunks)


def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a pair of strings (utf16, utf32), each a comma separated
    list of decimal code unit values with a terminating 0.  The first
    item is empty when both representations are identical.
    """
    s = list(map(ord, s)) + [0]

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        utf16, utf32 = [], s
        for code_point in s:
            if code_point >= 0x10000:  # outside of BMP
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        utf16, utf32 = s, []
        for code_unit in s:
            if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                # low surrogate following a high one => merge the pair
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))