summary | refs | log | tree | commit | diff | stats
path: root/contrib/tools/cython/Cython/Compiler/StringEncoding.py
diff options
context:
space:
mode:
author: Anton Samokhvalov <[email protected]> 2022-02-10 16:45:15 +0300
committer: Daniil Cherednik <[email protected]> 2022-02-10 16:45:15 +0300
commit: 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch)
tree: da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /contrib/tools/cython/Cython/Compiler/StringEncoding.py
parent: 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff)
Restoring authorship annotation for Anton Samokhvalov <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'contrib/tools/cython/Cython/Compiler/StringEncoding.py')
-rw-r--r-- contrib/tools/cython/Cython/Compiler/StringEncoding.py 608
1 files changed, 304 insertions, 304 deletions
diff --git a/contrib/tools/cython/Cython/Compiler/StringEncoding.py b/contrib/tools/cython/Cython/Compiler/StringEncoding.py
index c37e8aab799..d9993c6615a 100644
--- a/contrib/tools/cython/Cython/Compiler/StringEncoding.py
+++ b/contrib/tools/cython/Cython/Compiler/StringEncoding.py
@@ -1,159 +1,159 @@
-#
-# Cython -- encoding related tools
-#
-
-from __future__ import absolute_import
-
-import re
-import sys
-
-if sys.version_info[0] >= 3:
+#
+# Cython -- encoding related tools
+#
+
+from __future__ import absolute_import
+
+import re
+import sys
+
+if sys.version_info[0] >= 3:
_unicode, _str, _bytes, _unichr = str, str, bytes, chr
- IS_PYTHON3 = True
-else:
+ IS_PYTHON3 = True
+else:
_unicode, _str, _bytes, _unichr = unicode, str, str, unichr
- IS_PYTHON3 = False
-
-empty_bytes = _bytes()
-empty_unicode = _unicode()
-
-join_bytes = empty_bytes.join
-
-
-class UnicodeLiteralBuilder(object):
- """Assemble a unicode string.
- """
- def __init__(self):
- self.chars = []
-
- def append(self, characters):
- if isinstance(characters, _bytes):
- # this came from a Py2 string literal in the parser code
- characters = characters.decode("ASCII")
- assert isinstance(characters, _unicode), str(type(characters))
- self.chars.append(characters)
-
- if sys.maxunicode == 65535:
- def append_charval(self, char_number):
- if char_number > 65535:
- # wide Unicode character on narrow platform => replace
- # by surrogate pair
- char_number -= 0x10000
+ IS_PYTHON3 = False
+
+empty_bytes = _bytes()
+empty_unicode = _unicode()
+
+join_bytes = empty_bytes.join
+
+
+class UnicodeLiteralBuilder(object):
+ """Assemble a unicode string.
+ """
+ def __init__(self):
+ self.chars = []
+
+ def append(self, characters):
+ if isinstance(characters, _bytes):
+ # this came from a Py2 string literal in the parser code
+ characters = characters.decode("ASCII")
+ assert isinstance(characters, _unicode), str(type(characters))
+ self.chars.append(characters)
+
+ if sys.maxunicode == 65535:
+ def append_charval(self, char_number):
+ if char_number > 65535:
+ # wide Unicode character on narrow platform => replace
+ # by surrogate pair
+ char_number -= 0x10000
self.chars.append( _unichr((char_number // 1024) + 0xD800) )
self.chars.append( _unichr((char_number % 1024) + 0xDC00) )
- else:
+ else:
self.chars.append( _unichr(char_number) )
- else:
- def append_charval(self, char_number):
+ else:
+ def append_charval(self, char_number):
self.chars.append( _unichr(char_number) )
-
- def append_uescape(self, char_number, escape_string):
- self.append_charval(char_number)
-
- def getstring(self):
- return EncodedString(u''.join(self.chars))
-
- def getstrings(self):
- return (None, self.getstring())
-
-
-class BytesLiteralBuilder(object):
- """Assemble a byte string or char value.
- """
- def __init__(self, target_encoding):
- self.chars = []
- self.target_encoding = target_encoding
-
- def append(self, characters):
- if isinstance(characters, _unicode):
- characters = characters.encode(self.target_encoding)
- assert isinstance(characters, _bytes), str(type(characters))
- self.chars.append(characters)
-
- def append_charval(self, char_number):
+
+ def append_uescape(self, char_number, escape_string):
+ self.append_charval(char_number)
+
+ def getstring(self):
+ return EncodedString(u''.join(self.chars))
+
+ def getstrings(self):
+ return (None, self.getstring())
+
+
+class BytesLiteralBuilder(object):
+ """Assemble a byte string or char value.
+ """
+ def __init__(self, target_encoding):
+ self.chars = []
+ self.target_encoding = target_encoding
+
+ def append(self, characters):
+ if isinstance(characters, _unicode):
+ characters = characters.encode(self.target_encoding)
+ assert isinstance(characters, _bytes), str(type(characters))
+ self.chars.append(characters)
+
+ def append_charval(self, char_number):
self.chars.append( _unichr(char_number).encode('ISO-8859-1') )
-
- def append_uescape(self, char_number, escape_string):
- self.append(escape_string)
-
- def getstring(self):
- # this *must* return a byte string!
+
+ def append_uescape(self, char_number, escape_string):
+ self.append(escape_string)
+
+ def getstring(self):
+ # this *must* return a byte string!
return bytes_literal(join_bytes(self.chars), self.target_encoding)
-
- def getchar(self):
- # this *must* return a byte string!
- return self.getstring()
-
- def getstrings(self):
- return (self.getstring(), None)
-
-
-class StrLiteralBuilder(object):
- """Assemble both a bytes and a unicode representation of a string.
- """
- def __init__(self, target_encoding):
- self._bytes = BytesLiteralBuilder(target_encoding)
- self._unicode = UnicodeLiteralBuilder()
-
- def append(self, characters):
- self._bytes.append(characters)
- self._unicode.append(characters)
-
- def append_charval(self, char_number):
- self._bytes.append_charval(char_number)
- self._unicode.append_charval(char_number)
-
- def append_uescape(self, char_number, escape_string):
- self._bytes.append(escape_string)
- self._unicode.append_charval(char_number)
-
- def getstrings(self):
- return (self._bytes.getstring(), self._unicode.getstring())
-
-
-class EncodedString(_unicode):
- # unicode string subclass to keep track of the original encoding.
- # 'encoding' is None for unicode strings and the source encoding
- # otherwise
- encoding = None
-
- def __deepcopy__(self, memo):
- return self
-
- def byteencode(self):
- assert self.encoding is not None
- return self.encode(self.encoding)
-
- def utf8encode(self):
- assert self.encoding is None
- return self.encode("UTF-8")
-
- @property
- def is_unicode(self):
- return self.encoding is None
-
- def contains_surrogates(self):
- return string_contains_surrogates(self)
-
+
+ def getchar(self):
+ # this *must* return a byte string!
+ return self.getstring()
+
+ def getstrings(self):
+ return (self.getstring(), None)
+
+
+class StrLiteralBuilder(object):
+ """Assemble both a bytes and a unicode representation of a string.
+ """
+ def __init__(self, target_encoding):
+ self._bytes = BytesLiteralBuilder(target_encoding)
+ self._unicode = UnicodeLiteralBuilder()
+
+ def append(self, characters):
+ self._bytes.append(characters)
+ self._unicode.append(characters)
+
+ def append_charval(self, char_number):
+ self._bytes.append_charval(char_number)
+ self._unicode.append_charval(char_number)
+
+ def append_uescape(self, char_number, escape_string):
+ self._bytes.append(escape_string)
+ self._unicode.append_charval(char_number)
+
+ def getstrings(self):
+ return (self._bytes.getstring(), self._unicode.getstring())
+
+
+class EncodedString(_unicode):
+ # unicode string subclass to keep track of the original encoding.
+ # 'encoding' is None for unicode strings and the source encoding
+ # otherwise
+ encoding = None
+
+ def __deepcopy__(self, memo):
+ return self
+
+ def byteencode(self):
+ assert self.encoding is not None
+ return self.encode(self.encoding)
+
+ def utf8encode(self):
+ assert self.encoding is None
+ return self.encode("UTF-8")
+
+ @property
+ def is_unicode(self):
+ return self.encoding is None
+
+ def contains_surrogates(self):
+ return string_contains_surrogates(self)
+
def as_utf8_string(self):
return bytes_literal(self.utf8encode(), 'utf8')
-
-
-def string_contains_surrogates(ustring):
- """
- Check if the unicode string contains surrogate code points
- on a CPython platform with wide (UCS-4) or narrow (UTF-16)
- Unicode, i.e. characters that would be spelled as two
- separate code units on a narrow platform.
- """
- for c in map(ord, ustring):
- if c > 65535: # can only happen on wide platforms
- return True
- if 0xD800 <= c <= 0xDFFF:
- return True
- return False
-
-
+
+
+def string_contains_surrogates(ustring):
+ """
+ Check if the unicode string contains surrogate code points
+ on a CPython platform with wide (UCS-4) or narrow (UTF-16)
+ Unicode, i.e. characters that would be spelled as two
+ separate code units on a narrow platform.
+ """
+ for c in map(ord, ustring):
+ if c > 65535: # can only happen on wide platforms
+ return True
+ if 0xD800 <= c <= 0xDFFF:
+ return True
+ return False
+
+
def string_contains_lone_surrogates(ustring):
"""
Check if the unicode string contains lone surrogate code points
@@ -182,35 +182,35 @@ def string_contains_lone_surrogates(ustring):
return last_was_start
-class BytesLiteral(_bytes):
- # bytes subclass that is compatible with EncodedString
- encoding = None
-
- def __deepcopy__(self, memo):
- return self
-
- def byteencode(self):
- if IS_PYTHON3:
- return _bytes(self)
- else:
- # fake-recode the string to make it a plain bytes object
- return self.decode('ISO-8859-1').encode('ISO-8859-1')
-
- def utf8encode(self):
- assert False, "this is not a unicode string: %r" % self
-
- def __str__(self):
- """Fake-decode the byte string to unicode to support %
- formatting of unicode strings.
- """
- return self.decode('ISO-8859-1')
-
- is_unicode = False
-
+class BytesLiteral(_bytes):
+ # bytes subclass that is compatible with EncodedString
+ encoding = None
+
+ def __deepcopy__(self, memo):
+ return self
+
+ def byteencode(self):
+ if IS_PYTHON3:
+ return _bytes(self)
+ else:
+ # fake-recode the string to make it a plain bytes object
+ return self.decode('ISO-8859-1').encode('ISO-8859-1')
+
+ def utf8encode(self):
+ assert False, "this is not a unicode string: %r" % self
+
+ def __str__(self):
+ """Fake-decode the byte string to unicode to support %
+ formatting of unicode strings.
+ """
+ return self.decode('ISO-8859-1')
+
+ is_unicode = False
+
def as_c_string_literal(self):
value = split_string_literal(escape_byte_string(self))
return '"%s"' % value
-
+
def bytes_literal(s, encoding):
assert isinstance(s, bytes)
@@ -227,137 +227,137 @@ def encoded_string(s, encoding):
return s
-char_from_escape_sequence = {
- r'\a' : u'\a',
- r'\b' : u'\b',
- r'\f' : u'\f',
- r'\n' : u'\n',
- r'\r' : u'\r',
- r'\t' : u'\t',
- r'\v' : u'\v',
- }.get
-
-_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
-
-
-def _to_escape_sequence(s):
- if s in '\n\r\t':
- return repr(s)[1:-1]
- elif s == '"':
- return r'\"'
- elif s == '\\':
- return r'\\'
- else:
- # within a character sequence, oct passes much better than hex
- return ''.join(['\\%03o' % ord(c) for c in s])
-
-
-def _build_specials_replacer():
- subexps = []
- replacements = {}
- for special in _c_special:
- regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
- subexps.append(regexp)
- replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
- sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
- def replace_specials(m):
- return replacements[m.group(1)]
- def replace(s):
- return sub(replace_specials, s)
- return replace
-
-_replace_specials = _build_specials_replacer()
-
-
-def escape_char(c):
- if IS_PYTHON3:
- c = c.decode('ISO-8859-1')
- if c in '\n\r\t\\':
- return repr(c)[1:-1]
- elif c == "'":
- return "\\'"
- n = ord(c)
- if n < 32 or n > 127:
- # hex works well for characters
- return "\\x%02X" % n
- else:
- return c
-
-def escape_byte_string(s):
- """Escape a byte string so that it can be written into C code.
- Note that this returns a Unicode string instead which, when
- encoded as ISO-8859-1, will result in the correct byte sequence
- being written.
- """
- s = _replace_specials(s)
- try:
- return s.decode("ASCII") # trial decoding: plain ASCII => done
- except UnicodeDecodeError:
- pass
- if IS_PYTHON3:
- s_new = bytearray()
- append, extend = s_new.append, s_new.extend
- for b in s:
- if b >= 128:
- extend(('\\%3o' % b).encode('ASCII'))
- else:
- append(b)
- return s_new.decode('ISO-8859-1')
- else:
- l = []
- append = l.append
- for c in s:
- o = ord(c)
- if o >= 128:
- append('\\%3o' % o)
- else:
- append(c)
- return join_bytes(l).decode('ISO-8859-1')
-
-def split_string_literal(s, limit=2000):
- # MSVC can't handle long string literals.
- if len(s) < limit:
- return s
- else:
- start = 0
- chunks = []
- while start < len(s):
- end = start + limit
- if len(s) > end-4 and '\\' in s[end-4:end]:
- end -= 4 - s[end-4:end].find('\\') # just before the backslash
- while s[end-1] == '\\':
- end -= 1
- if end == start:
- # must have been a long line of backslashes
- end = start + limit - (limit % 2) - 4
- break
- chunks.append(s[start:end])
- start = end
- return '""'.join(chunks)
-
-def encode_pyunicode_string(s):
- """Create Py_UNICODE[] representation of a given unicode string.
- """
+char_from_escape_sequence = {
+ r'\a' : u'\a',
+ r'\b' : u'\b',
+ r'\f' : u'\f',
+ r'\n' : u'\n',
+ r'\r' : u'\r',
+ r'\t' : u'\t',
+ r'\v' : u'\v',
+ }.get
+
+_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
+
+
+def _to_escape_sequence(s):
+ if s in '\n\r\t':
+ return repr(s)[1:-1]
+ elif s == '"':
+ return r'\"'
+ elif s == '\\':
+ return r'\\'
+ else:
+ # within a character sequence, oct passes much better than hex
+ return ''.join(['\\%03o' % ord(c) for c in s])
+
+
+def _build_specials_replacer():
+ subexps = []
+ replacements = {}
+ for special in _c_special:
+ regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
+ subexps.append(regexp)
+ replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
+ sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
+ def replace_specials(m):
+ return replacements[m.group(1)]
+ def replace(s):
+ return sub(replace_specials, s)
+ return replace
+
+_replace_specials = _build_specials_replacer()
+
+
+def escape_char(c):
+ if IS_PYTHON3:
+ c = c.decode('ISO-8859-1')
+ if c in '\n\r\t\\':
+ return repr(c)[1:-1]
+ elif c == "'":
+ return "\\'"
+ n = ord(c)
+ if n < 32 or n > 127:
+ # hex works well for characters
+ return "\\x%02X" % n
+ else:
+ return c
+
+def escape_byte_string(s):
+ """Escape a byte string so that it can be written into C code.
+ Note that this returns a Unicode string instead which, when
+ encoded as ISO-8859-1, will result in the correct byte sequence
+ being written.
+ """
+ s = _replace_specials(s)
+ try:
+ return s.decode("ASCII") # trial decoding: plain ASCII => done
+ except UnicodeDecodeError:
+ pass
+ if IS_PYTHON3:
+ s_new = bytearray()
+ append, extend = s_new.append, s_new.extend
+ for b in s:
+ if b >= 128:
+ extend(('\\%3o' % b).encode('ASCII'))
+ else:
+ append(b)
+ return s_new.decode('ISO-8859-1')
+ else:
+ l = []
+ append = l.append
+ for c in s:
+ o = ord(c)
+ if o >= 128:
+ append('\\%3o' % o)
+ else:
+ append(c)
+ return join_bytes(l).decode('ISO-8859-1')
+
+def split_string_literal(s, limit=2000):
+ # MSVC can't handle long string literals.
+ if len(s) < limit:
+ return s
+ else:
+ start = 0
+ chunks = []
+ while start < len(s):
+ end = start + limit
+ if len(s) > end-4 and '\\' in s[end-4:end]:
+ end -= 4 - s[end-4:end].find('\\') # just before the backslash
+ while s[end-1] == '\\':
+ end -= 1
+ if end == start:
+ # must have been a long line of backslashes
+ end = start + limit - (limit % 2) - 4
+ break
+ chunks.append(s[start:end])
+ start = end
+ return '""'.join(chunks)
+
+def encode_pyunicode_string(s):
+ """Create Py_UNICODE[] representation of a given unicode string.
+ """
s = list(map(ord, s)) + [0]
-
- if sys.maxunicode >= 0x10000: # Wide build or Py3.3
- utf16, utf32 = [], s
- for code_point in s:
- if code_point >= 0x10000: # outside of BMP
- high, low = divmod(code_point - 0x10000, 1024)
- utf16.append(high + 0xD800)
- utf16.append(low + 0xDC00)
- else:
- utf16.append(code_point)
- else:
- utf16, utf32 = s, []
- for code_unit in s:
- if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
- high, low = utf32[-1], code_unit
- utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
- else:
- utf32.append(code_unit)
-
- if utf16 == utf32:
- utf16 = []
+
+ if sys.maxunicode >= 0x10000: # Wide build or Py3.3
+ utf16, utf32 = [], s
+ for code_point in s:
+ if code_point >= 0x10000: # outside of BMP
+ high, low = divmod(code_point - 0x10000, 1024)
+ utf16.append(high + 0xD800)
+ utf16.append(low + 0xDC00)
+ else:
+ utf16.append(code_point)
+ else:
+ utf16, utf32 = s, []
+ for code_unit in s:
+ if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
+ high, low = utf32[-1], code_unit
+ utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
+ else:
+ utf32.append(code_unit)
+
+ if utf16 == utf32:
+ utf16 = []
return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))