author | Anton Samokhvalov <[email protected]> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /contrib/tools/cython/Cython/Compiler/StringEncoding.py | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
Restoring authorship annotation for Anton Samokhvalov <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'contrib/tools/cython/Cython/Compiler/StringEncoding.py')
-rw-r--r-- | contrib/tools/cython/Cython/Compiler/StringEncoding.py | 608 |
1 file changed, 304 insertions, 304 deletions
```
diff --git a/contrib/tools/cython/Cython/Compiler/StringEncoding.py b/contrib/tools/cython/Cython/Compiler/StringEncoding.py
index c37e8aab799..d9993c6615a 100644
--- a/contrib/tools/cython/Cython/Compiler/StringEncoding.py
+++ b/contrib/tools/cython/Cython/Compiler/StringEncoding.py
@@ -1,159 +1,159 @@
#
# Cython -- encoding related tools
#

from __future__ import absolute_import

import re
import sys

if sys.version_info[0] >= 3:
    _unicode, _str, _bytes, _unichr = str, str, bytes, chr
    IS_PYTHON3 = True
else:
    _unicode, _str, _bytes, _unichr = unicode, str, str, unichr
    IS_PYTHON3 = False

empty_bytes = _bytes()
empty_unicode = _unicode()

join_bytes = empty_bytes.join


class UnicodeLiteralBuilder(object):
    """Assemble a unicode string.
    """
    def __init__(self):
        self.chars = []

    def append(self, characters):
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            if char_number > 65535:
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                char_number -= 0x10000
                self.chars.append( _unichr((char_number // 1024) + 0xD800) )
                self.chars.append( _unichr((char_number % 1024) + 0xDC00) )
            else:
                self.chars.append( _unichr(char_number) )
    else:
        def append_charval(self, char_number):
            self.chars.append( _unichr(char_number) )

    def append_uescape(self, char_number, escape_string):
        self.append_charval(char_number)

    def getstring(self):
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        return (None, self.getstring())


class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.
    """
    def __init__(self, target_encoding):
        self.chars = []
        self.target_encoding = target_encoding

    def append(self, characters):
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        self.chars.append( _unichr(char_number).encode('ISO-8859-1') )

    def append_uescape(self, char_number, escape_string):
        self.append(escape_string)

    def getstring(self):
        # this *must* return a byte string!
        return bytes_literal(join_bytes(self.chars), self.target_encoding)

    def getchar(self):
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        return (self.getstring(), None)


class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        self._bytes.append(characters)
        self._unicode.append(characters)

    def append_charval(self, char_number):
        self._bytes.append_charval(char_number)
        self._unicode.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        return (self._bytes.getstring(), self._unicode.getstring())


class EncodedString(_unicode):
    # unicode string subclass to keep track of the original encoding.
    # 'encoding' is None for unicode strings and the source encoding
    # otherwise
    encoding = None

    def __deepcopy__(self, memo):
        return self

    def byteencode(self):
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        return self.encoding is None

    def contains_surrogates(self):
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        return bytes_literal(self.utf8encode(), 'utf8')


def string_contains_surrogates(ustring):
    """
    Check if the unicode string contains surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform.
    """
    for c in map(ord, ustring):
        if c > 65535:  # can only happen on wide platforms
            return True
        if 0xD800 <= c <= 0xDFFF:
            return True
    return False


def string_contains_lone_surrogates(ustring):
    """
    Check if the unicode string contains lone surrogate code points
@@ -182,35 +182,35 @@ def string_contains_lone_surrogates(ustring):
    return last_was_start


class BytesLiteral(_bytes):
    # bytes subclass that is compatible with EncodedString
    encoding = None

    def __deepcopy__(self, memo):
        return self

    def byteencode(self):
        if IS_PYTHON3:
            return _bytes(self)
        else:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')

    def utf8encode(self):
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    is_unicode = False

    def as_c_string_literal(self):
        value = split_string_literal(escape_byte_string(self))
        return '"%s"' % value


def bytes_literal(s, encoding):
    assert isinstance(s, bytes)
@@ -227,137 +227,137 @@ def encoded_string(s, encoding):
    return s


char_from_escape_sequence = {
    r'\a' : u'\a',
    r'\b' : u'\b',
    r'\f' : u'\f',
    r'\n' : u'\n',
    r'\r' : u'\r',
    r'\t' : u'\t',
    r'\v' : u'\v',
    }.get

_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))


def _to_escape_sequence(s):
    if s in '\n\r\t':
        return repr(s)[1:-1]
    elif s == '"':
        return r'\"'
    elif s == '\\':
        return r'\\'
    else:
        # within a character sequence, oct passes much better than hex
        return ''.join(['\\%03o' % ord(c) for c in s])


def _build_specials_replacer():
    subexps = []
    replacements = {}
    for special in _c_special:
        regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
        subexps.append(regexp)
        replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
    sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
    def replace_specials(m):
        return replacements[m.group(1)]
    def replace(s):
        return sub(replace_specials, s)
    return replace

_replace_specials = _build_specials_replacer()


def escape_char(c):
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    elif c == "'":
        return "\\'"
    n = ord(c)
    if n < 32 or n > 127:
        # hex works well for characters
        return "\\x%02X" % n
    else:
        return c

def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
    s = _replace_specials(s)
    try:
        return s.decode("ASCII")  # trial decoding: plain ASCII => done
    except UnicodeDecodeError:
        pass
    if IS_PYTHON3:
        s_new = bytearray()
        append, extend = s_new.append, s_new.extend
        for b in s:
            if b >= 128:
                extend(('\\%3o' % b).encode('ASCII'))
            else:
                append(b)
        return s_new.decode('ISO-8859-1')
    else:
        l = []
        append = l.append
        for c in s:
            o = ord(c)
            if o >= 128:
                append('\\%3o' % o)
            else:
                append(c)
        return join_bytes(l).decode('ISO-8859-1')

def split_string_literal(s, limit=2000):
    # MSVC can't handle long string literals.
    if len(s) < limit:
        return s
    else:
        start = 0
        chunks = []
        while start < len(s):
            end = start + limit
            if len(s) > end-4 and '\\' in s[end-4:end]:
                end -= 4 - s[end-4:end].find('\\')  # just before the backslash
                while s[end-1] == '\\':
                    end -= 1
                    if end == start:
                        # must have been a long line of backslashes
                        end = start + limit - (limit % 2) - 4
                        break
            chunks.append(s[start:end])
            start = end
        return '""'.join(chunks)

def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.
    """
    s = list(map(ord, s)) + [0]

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        utf16, utf32 = [], s
        for code_point in s:
            if code_point >= 0x10000:  # outside of BMP
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        utf16, utf32 = s, []
        for code_unit in s:
            if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))
```