diff options
author | orivej <[email protected]> | 2022-02-10 16:44:49 +0300 |
---|---|---|
committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:44:49 +0300 |
commit | 718c552901d703c502ccbefdfc3c9028d608b947 (patch) | |
tree | 46534a98bbefcd7b1f3faa5b52c138ab27db75b7 /contrib/tools/python3/src/Lib/codecs.py | |
parent | e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (diff) |
Restoring authorship annotation for <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'contrib/tools/python3/src/Lib/codecs.py')
-rw-r--r-- | contrib/tools/python3/src/Lib/codecs.py | 2208 |
1 files changed, 1104 insertions, 1104 deletions
diff --git a/contrib/tools/python3/src/Lib/codecs.py b/contrib/tools/python3/src/Lib/codecs.py index d2edd148a29..d290964098c 100644 --- a/contrib/tools/python3/src/Lib/codecs.py +++ b/contrib/tools/python3/src/Lib/codecs.py @@ -1,911 +1,911 @@ -""" codecs -- Python Codec Registry, API and helpers. - - -Written by Marc-Andre Lemburg ([email protected]). - -(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. - -""" - -import builtins -import sys - -### Registry and builtin stateless codec functions - -try: - from _codecs import * -except ImportError as why: - raise SystemError('Failed to load the builtin codecs: %s' % why) - -__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", - "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", - "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", - "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", - "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder", - "StreamReader", "StreamWriter", - "StreamReaderWriter", "StreamRecoder", - "getencoder", "getdecoder", "getincrementalencoder", - "getincrementaldecoder", "getreader", "getwriter", - "encode", "decode", "iterencode", "iterdecode", - "strict_errors", "ignore_errors", "replace_errors", - "xmlcharrefreplace_errors", - "backslashreplace_errors", "namereplace_errors", - "register_error", "lookup_error"] - -### Constants - -# -# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) -# and its possible byte string values -# for UTF8/UTF16/UTF32 output and little/big endian machines -# - -# UTF-8 -BOM_UTF8 = b'\xef\xbb\xbf' - -# UTF-16, little endian -BOM_LE = BOM_UTF16_LE = b'\xff\xfe' - -# UTF-16, big endian -BOM_BE = BOM_UTF16_BE = b'\xfe\xff' - -# UTF-32, little endian -BOM_UTF32_LE = b'\xff\xfe\x00\x00' - -# UTF-32, big endian -BOM_UTF32_BE = b'\x00\x00\xfe\xff' - -if sys.byteorder == 'little': - - # UTF-16, native endianness - BOM = BOM_UTF16 = BOM_UTF16_LE - - # UTF-32, native endianness - BOM_UTF32 = BOM_UTF32_LE - -else: - - # UTF-16, native endianness - BOM = BOM_UTF16 = BOM_UTF16_BE - - # UTF-32, native endianness - BOM_UTF32 = BOM_UTF32_BE - -# Old broken names (don't use in new code) -BOM32_LE = BOM_UTF16_LE -BOM32_BE = BOM_UTF16_BE -BOM64_LE = BOM_UTF32_LE -BOM64_BE = BOM_UTF32_BE - - -### Codec base classes (defining the API) - -class CodecInfo(tuple): - """Codec details when looking up the codec registry""" - - # Private API to allow Python 3.4 to blacklist the known non-Unicode - # codecs in the standard library. A more general mechanism to - # reliably distinguish test encodings from other codecs will hopefully - # be defined for Python 3.5 - # - # See http://bugs.python.org/issue19619 - _is_text_encoding = True # Assume codecs are text encodings by default - - def __new__(cls, encode, decode, streamreader=None, streamwriter=None, - incrementalencoder=None, incrementaldecoder=None, name=None, - *, _is_text_encoding=None): - self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) - self.name = name - self.encode = encode - self.decode = decode - self.incrementalencoder = incrementalencoder - self.incrementaldecoder = incrementaldecoder - self.streamwriter = streamwriter - self.streamreader = streamreader - if _is_text_encoding is not None: - self._is_text_encoding = _is_text_encoding - return self - - def __repr__(self): - return "<%s.%s object for encoding %s at %#x>" % \ - (self.__class__.__module__, self.__class__.__qualname__, - self.name, id(self)) - -class Codec: - - """ Defines the interface for stateless encoders/decoders. - - The .encode()/.decode() methods may use different error - handling schemes by providing the errors argument. These - string values are predefined: - - 'strict' - raise a ValueError error (or a subclass) - 'ignore' - ignore the character and continue with the next - 'replace' - replace with a suitable replacement character; - Python will use the official U+FFFD REPLACEMENT - CHARACTER for the builtin Unicode codecs on - decoding and '?' on encoding. - 'surrogateescape' - replace with private code points U+DCnn. - 'xmlcharrefreplace' - Replace with the appropriate XML - character reference (only for encoding). - 'backslashreplace' - Replace with backslashed escape sequences. - 'namereplace' - Replace with \\N{...} escape sequences - (only for encoding). - - The set of allowed values can be extended via register_error. - - """ - def encode(self, input, errors='strict'): - - """ Encodes the object input and returns a tuple (output - object, length consumed). - - errors defines the error handling to apply. It defaults to - 'strict' handling. - - The method may not store state in the Codec instance. Use - StreamWriter for codecs which have to keep state in order to - make encoding efficient. - - The encoder must be able to handle zero length input and - return an empty object of the output object type in this - situation. - - """ - raise NotImplementedError - - def decode(self, input, errors='strict'): - - """ Decodes the object input and returns a tuple (output - object, length consumed). - - input must be an object which provides the bf_getreadbuf - buffer slot. Python strings, buffer objects and memory - mapped files are examples of objects providing this slot. - - errors defines the error handling to apply. It defaults to - 'strict' handling. - - The method may not store state in the Codec instance. Use - StreamReader for codecs which have to keep state in order to - make decoding efficient. - - The decoder must be able to handle zero length input and - return an empty object of the output object type in this - situation. - - """ - raise NotImplementedError - -class IncrementalEncoder(object): - """ - An IncrementalEncoder encodes an input in multiple steps. The input can - be passed piece by piece to the encode() method. The IncrementalEncoder - remembers the state of the encoding process between calls to encode(). - """ - def __init__(self, errors='strict'): - """ - Creates an IncrementalEncoder instance. - - The IncrementalEncoder may use different error handling schemes by - providing the errors keyword argument. See the module docstring - for a list of possible values. - """ - self.errors = errors - self.buffer = "" - - def encode(self, input, final=False): - """ - Encodes input and returns the resulting object. - """ - raise NotImplementedError - - def reset(self): - """ - Resets the encoder to the initial state. - """ - - def getstate(self): - """ - Return the current state of the encoder. - """ - return 0 - - def setstate(self, state): - """ - Set the current state of the encoder. state must have been - returned by getstate(). - """ - -class BufferedIncrementalEncoder(IncrementalEncoder): - """ - This subclass of IncrementalEncoder can be used as the baseclass for an - incremental encoder if the encoder must keep some of the output in a - buffer between calls to encode(). - """ - def __init__(self, errors='strict'): - IncrementalEncoder.__init__(self, errors) - # unencoded input that is kept between calls to encode() - self.buffer = "" - - def _buffer_encode(self, input, errors, final): - # Overwrite this method in subclasses: It must encode input - # and return an (output, length consumed) tuple - raise NotImplementedError - - def encode(self, input, final=False): - # encode input (taking the buffer into account) - data = self.buffer + input - (result, consumed) = self._buffer_encode(data, self.errors, final) - # keep unencoded input until the next call - self.buffer = data[consumed:] - return result - - def reset(self): - IncrementalEncoder.reset(self) - self.buffer = "" - - def getstate(self): - return self.buffer or 0 - - def setstate(self, state): - self.buffer = state or "" - -class IncrementalDecoder(object): - """ - An IncrementalDecoder decodes an input in multiple steps. The input can - be passed piece by piece to the decode() method. The IncrementalDecoder - remembers the state of the decoding process between calls to decode(). - """ - def __init__(self, errors='strict'): - """ - Create an IncrementalDecoder instance. - - The IncrementalDecoder may use different error handling schemes by - providing the errors keyword argument. See the module docstring - for a list of possible values. - """ - self.errors = errors - - def decode(self, input, final=False): - """ - Decode input and returns the resulting object. - """ - raise NotImplementedError - - def reset(self): - """ - Reset the decoder to the initial state. - """ - - def getstate(self): - """ - Return the current state of the decoder. - - This must be a (buffered_input, additional_state_info) tuple. - buffered_input must be a bytes object containing bytes that - were passed to decode() that have not yet been converted. - additional_state_info must be a non-negative integer - representing the state of the decoder WITHOUT yet having - processed the contents of buffered_input. In the initial state - and after reset(), getstate() must return (b"", 0). - """ - return (b"", 0) - - def setstate(self, state): - """ - Set the current state of the decoder. - - state must have been returned by getstate(). The effect of - setstate((b"", 0)) must be equivalent to reset(). - """ - -class BufferedIncrementalDecoder(IncrementalDecoder): - """ - This subclass of IncrementalDecoder can be used as the baseclass for an - incremental decoder if the decoder must be able to handle incomplete - byte sequences. - """ - def __init__(self, errors='strict'): - IncrementalDecoder.__init__(self, errors) - # undecoded input that is kept between calls to decode() - self.buffer = b"" - - def _buffer_decode(self, input, errors, final): - # Overwrite this method in subclasses: It must decode input - # and return an (output, length consumed) tuple - raise NotImplementedError - - def decode(self, input, final=False): - # decode input (taking the buffer into account) - data = self.buffer + input - (result, consumed) = self._buffer_decode(data, self.errors, final) - # keep undecoded input until the next call - self.buffer = data[consumed:] - return result - - def reset(self): - IncrementalDecoder.reset(self) - self.buffer = b"" - - def getstate(self): - # additional state info is always 0 - return (self.buffer, 0) - - def setstate(self, state): - # ignore additional state info - self.buffer = state[0] - -# -# The StreamWriter and StreamReader class provide generic working -# interfaces which can be used to implement new encoding submodules -# very easily. See encodings/utf_8.py for an example on how this is -# done. -# - -class StreamWriter(Codec): - - def __init__(self, stream, errors='strict'): - - """ Creates a StreamWriter instance. - - stream must be a file-like object open for writing. - - The StreamWriter may use different error handling - schemes by providing the errors keyword argument. These - parameters are predefined: - - 'strict' - raise a ValueError (or a subclass) - 'ignore' - ignore the character and continue with the next - 'replace'- replace with a suitable replacement character - 'xmlcharrefreplace' - Replace with the appropriate XML - character reference. - 'backslashreplace' - Replace with backslashed escape - sequences. - 'namereplace' - Replace with \\N{...} escape sequences. - - The set of allowed parameter values can be extended via - register_error. - """ - self.stream = stream - self.errors = errors - - def write(self, object): - - """ Writes the object's contents encoded to self.stream. - """ - data, consumed = self.encode(object, self.errors) - self.stream.write(data) - - def writelines(self, list): - - """ Writes the concatenated list of strings to the stream - using .write(). - """ - self.write(''.join(list)) - - def reset(self): - +""" codecs -- Python Codec Registry, API and helpers. + + +Written by Marc-Andre Lemburg ([email protected]). + +(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. + +""" + +import builtins +import sys + +### Registry and builtin stateless codec functions + +try: + from _codecs import * +except ImportError as why: + raise SystemError('Failed to load the builtin codecs: %s' % why) + +__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", + "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", + "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", + "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", + "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder", + "StreamReader", "StreamWriter", + "StreamReaderWriter", "StreamRecoder", + "getencoder", "getdecoder", "getincrementalencoder", + "getincrementaldecoder", "getreader", "getwriter", + "encode", "decode", "iterencode", "iterdecode", + "strict_errors", "ignore_errors", "replace_errors", + "xmlcharrefreplace_errors", + "backslashreplace_errors", "namereplace_errors", + "register_error", "lookup_error"] + +### Constants + +# +# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) +# and its possible byte string values +# for UTF8/UTF16/UTF32 output and little/big endian machines +# + +# UTF-8 +BOM_UTF8 = b'\xef\xbb\xbf' + +# UTF-16, little endian +BOM_LE = BOM_UTF16_LE = b'\xff\xfe' + +# UTF-16, big endian +BOM_BE = BOM_UTF16_BE = b'\xfe\xff' + +# UTF-32, little endian +BOM_UTF32_LE = b'\xff\xfe\x00\x00' + +# UTF-32, big endian +BOM_UTF32_BE = b'\x00\x00\xfe\xff' + +if sys.byteorder == 'little': + + # UTF-16, native endianness + BOM = BOM_UTF16 = BOM_UTF16_LE + + # UTF-32, native endianness + BOM_UTF32 = BOM_UTF32_LE + +else: + + # UTF-16, native endianness + BOM = BOM_UTF16 = BOM_UTF16_BE + + # UTF-32, native endianness + BOM_UTF32 = BOM_UTF32_BE + +# Old broken names (don't use in new code) +BOM32_LE = BOM_UTF16_LE +BOM32_BE = BOM_UTF16_BE +BOM64_LE = BOM_UTF32_LE +BOM64_BE = BOM_UTF32_BE + + +### Codec base classes (defining the API) + +class CodecInfo(tuple): + """Codec details when looking up the codec registry""" + + # Private API to allow Python 3.4 to blacklist the known non-Unicode + # codecs in the standard library. A more general mechanism to + # reliably distinguish test encodings from other codecs will hopefully + # be defined for Python 3.5 + # + # See http://bugs.python.org/issue19619 + _is_text_encoding = True # Assume codecs are text encodings by default + + def __new__(cls, encode, decode, streamreader=None, streamwriter=None, + incrementalencoder=None, incrementaldecoder=None, name=None, + *, _is_text_encoding=None): + self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) + self.name = name + self.encode = encode + self.decode = decode + self.incrementalencoder = incrementalencoder + self.incrementaldecoder = incrementaldecoder + self.streamwriter = streamwriter + self.streamreader = streamreader + if _is_text_encoding is not None: + self._is_text_encoding = _is_text_encoding + return self + + def __repr__(self): + return "<%s.%s object for encoding %s at %#x>" % \ + (self.__class__.__module__, self.__class__.__qualname__, + self.name, id(self)) + +class Codec: + + """ Defines the interface for stateless encoders/decoders. + + The .encode()/.decode() methods may use different error + handling schemes by providing the errors argument. These + string values are predefined: + + 'strict' - raise a ValueError error (or a subclass) + 'ignore' - ignore the character and continue with the next + 'replace' - replace with a suitable replacement character; + Python will use the official U+FFFD REPLACEMENT + CHARACTER for the builtin Unicode codecs on + decoding and '?' on encoding. + 'surrogateescape' - replace with private code points U+DCnn. + 'xmlcharrefreplace' - Replace with the appropriate XML + character reference (only for encoding). + 'backslashreplace' - Replace with backslashed escape sequences. + 'namereplace' - Replace with \\N{...} escape sequences + (only for encoding). + + The set of allowed values can be extended via register_error. + + """ + def encode(self, input, errors='strict'): + + """ Encodes the object input and returns a tuple (output + object, length consumed). + + errors defines the error handling to apply. It defaults to + 'strict' handling. + + The method may not store state in the Codec instance. Use + StreamWriter for codecs which have to keep state in order to + make encoding efficient. + + The encoder must be able to handle zero length input and + return an empty object of the output object type in this + situation. + + """ + raise NotImplementedError + + def decode(self, input, errors='strict'): + + """ Decodes the object input and returns a tuple (output + object, length consumed). + + input must be an object which provides the bf_getreadbuf + buffer slot. Python strings, buffer objects and memory + mapped files are examples of objects providing this slot. + + errors defines the error handling to apply. It defaults to + 'strict' handling. + + The method may not store state in the Codec instance. Use + StreamReader for codecs which have to keep state in order to + make decoding efficient. + + The decoder must be able to handle zero length input and + return an empty object of the output object type in this + situation. + + """ + raise NotImplementedError + +class IncrementalEncoder(object): + """ + An IncrementalEncoder encodes an input in multiple steps. The input can + be passed piece by piece to the encode() method. The IncrementalEncoder + remembers the state of the encoding process between calls to encode(). + """ + def __init__(self, errors='strict'): + """ + Creates an IncrementalEncoder instance. + + The IncrementalEncoder may use different error handling schemes by + providing the errors keyword argument. See the module docstring + for a list of possible values. + """ + self.errors = errors + self.buffer = "" + + def encode(self, input, final=False): + """ + Encodes input and returns the resulting object. + """ + raise NotImplementedError + + def reset(self): + """ + Resets the encoder to the initial state. + """ + + def getstate(self): + """ + Return the current state of the encoder. + """ + return 0 + + def setstate(self, state): + """ + Set the current state of the encoder. state must have been + returned by getstate(). + """ + +class BufferedIncrementalEncoder(IncrementalEncoder): + """ + This subclass of IncrementalEncoder can be used as the baseclass for an + incremental encoder if the encoder must keep some of the output in a + buffer between calls to encode(). + """ + def __init__(self, errors='strict'): + IncrementalEncoder.__init__(self, errors) + # unencoded input that is kept between calls to encode() + self.buffer = "" + + def _buffer_encode(self, input, errors, final): + # Overwrite this method in subclasses: It must encode input + # and return an (output, length consumed) tuple + raise NotImplementedError + + def encode(self, input, final=False): + # encode input (taking the buffer into account) + data = self.buffer + input + (result, consumed) = self._buffer_encode(data, self.errors, final) + # keep unencoded input until the next call + self.buffer = data[consumed:] + return result + + def reset(self): + IncrementalEncoder.reset(self) + self.buffer = "" + + def getstate(self): + return self.buffer or 0 + + def setstate(self, state): + self.buffer = state or "" + +class IncrementalDecoder(object): + """ + An IncrementalDecoder decodes an input in multiple steps. The input can + be passed piece by piece to the decode() method. The IncrementalDecoder + remembers the state of the decoding process between calls to decode(). + """ + def __init__(self, errors='strict'): + """ + Create an IncrementalDecoder instance. + + The IncrementalDecoder may use different error handling schemes by + providing the errors keyword argument. See the module docstring + for a list of possible values. + """ + self.errors = errors + + def decode(self, input, final=False): + """ + Decode input and returns the resulting object. + """ + raise NotImplementedError + + def reset(self): + """ + Reset the decoder to the initial state. + """ + + def getstate(self): + """ + Return the current state of the decoder. + + This must be a (buffered_input, additional_state_info) tuple. + buffered_input must be a bytes object containing bytes that + were passed to decode() that have not yet been converted. + additional_state_info must be a non-negative integer + representing the state of the decoder WITHOUT yet having + processed the contents of buffered_input. In the initial state + and after reset(), getstate() must return (b"", 0). + """ + return (b"", 0) + + def setstate(self, state): + """ + Set the current state of the decoder. + + state must have been returned by getstate(). The effect of + setstate((b"", 0)) must be equivalent to reset(). + """ + +class BufferedIncrementalDecoder(IncrementalDecoder): + """ + This subclass of IncrementalDecoder can be used as the baseclass for an + incremental decoder if the decoder must be able to handle incomplete + byte sequences. + """ + def __init__(self, errors='strict'): + IncrementalDecoder.__init__(self, errors) + # undecoded input that is kept between calls to decode() + self.buffer = b"" + + def _buffer_decode(self, input, errors, final): + # Overwrite this method in subclasses: It must decode input + # and return an (output, length consumed) tuple + raise NotImplementedError + + def decode(self, input, final=False): + # decode input (taking the buffer into account) + data = self.buffer + input + (result, consumed) = self._buffer_decode(data, self.errors, final) + # keep undecoded input until the next call + self.buffer = data[consumed:] + return result + + def reset(self): + IncrementalDecoder.reset(self) + self.buffer = b"" + + def getstate(self): + # additional state info is always 0 + return (self.buffer, 0) + + def setstate(self, state): + # ignore additional state info + self.buffer = state[0] + +# +# The StreamWriter and StreamReader class provide generic working +# interfaces which can be used to implement new encoding submodules +# very easily. See encodings/utf_8.py for an example on how this is +# done. +# + +class StreamWriter(Codec): + + def __init__(self, stream, errors='strict'): + + """ Creates a StreamWriter instance. + + stream must be a file-like object open for writing. + + The StreamWriter may use different error handling + schemes by providing the errors keyword argument. These + parameters are predefined: + + 'strict' - raise a ValueError (or a subclass) + 'ignore' - ignore the character and continue with the next + 'replace'- replace with a suitable replacement character + 'xmlcharrefreplace' - Replace with the appropriate XML + character reference. + 'backslashreplace' - Replace with backslashed escape + sequences. + 'namereplace' - Replace with \\N{...} escape sequences. + + The set of allowed parameter values can be extended via + register_error. + """ + self.stream = stream + self.errors = errors + + def write(self, object): + + """ Writes the object's contents encoded to self.stream. + """ + data, consumed = self.encode(object, self.errors) + self.stream.write(data) + + def writelines(self, list): + + """ Writes the concatenated list of strings to the stream + using .write(). + """ + self.write(''.join(list)) + + def reset(self): + """ Resets the codec buffers used for keeping internal state. - - Calling this method should ensure that the data on the - output is put into a clean state, that allows appending - of new fresh data without having to rescan the whole - stream to recover state. - - """ - pass - - def seek(self, offset, whence=0): - self.stream.seek(offset, whence) - if whence == 0 and offset == 0: - self.reset() - - def __getattr__(self, name, - getattr=getattr): - - """ Inherit all other methods from the underlying stream. - """ - return getattr(self.stream, name) - - def __enter__(self): - return self - - def __exit__(self, type, value, tb): - self.stream.close() - -### - -class StreamReader(Codec): - - charbuffertype = str - - def __init__(self, stream, errors='strict'): - - """ Creates a StreamReader instance. - - stream must be a file-like object open for reading. - - The StreamReader may use different error handling - schemes by providing the errors keyword argument. These - parameters are predefined: - - 'strict' - raise a ValueError (or a subclass) - 'ignore' - ignore the character and continue with the next - 'replace'- replace with a suitable replacement character - 'backslashreplace' - Replace with backslashed escape sequences; - - The set of allowed parameter values can be extended via - register_error. - """ - self.stream = stream - self.errors = errors - self.bytebuffer = b"" - self._empty_charbuffer = self.charbuffertype() - self.charbuffer = self._empty_charbuffer - self.linebuffer = None - - def decode(self, input, errors='strict'): - raise NotImplementedError - - def read(self, size=-1, chars=-1, firstline=False): - - """ Decodes data from the stream self.stream and returns the - resulting object. - - chars indicates the number of decoded code points or bytes to - return. read() will never return more data than requested, - but it might return less, if there is not enough available. - - size indicates the approximate maximum number of decoded - bytes or code points to read for decoding. The decoder - can modify this setting as appropriate. The default value - -1 indicates to read and decode as much as possible. size - is intended to prevent having to decode huge files in one - step. - - If firstline is true, and a UnicodeDecodeError happens - after the first line terminator in the input only the first line - will be returned, the rest of the input will be kept until the - next call to read(). - - The method should use a greedy read strategy, meaning that - it should read as much data as is allowed within the - definition of the encoding and the given size, e.g. if - optional encoding endings or state markers are available - on the stream, these should be read too. - """ - # If we have lines cached, first merge them back into characters - if self.linebuffer: - self.charbuffer = self._empty_charbuffer.join(self.linebuffer) - self.linebuffer = None - - if chars < 0: - # For compatibility with other read() methods that take a - # single argument - chars = size - - # read until we get the required number of characters (if available) - while True: - # can the request be satisfied from the character buffer? - if chars >= 0: - if len(self.charbuffer) >= chars: - break - # we need more data - if size < 0: - newdata = self.stream.read() - else: - newdata = self.stream.read(size) - # decode bytes (those remaining from the last call included) - data = self.bytebuffer + newdata - if not data: - break - try: - newchars, decodedbytes = self.decode(data, self.errors) - except UnicodeDecodeError as exc: - if firstline: - newchars, decodedbytes = \ - self.decode(data[:exc.start], self.errors) - lines = newchars.splitlines(keepends=True) - if len(lines)<=1: - raise - else: - raise - # keep undecoded bytes until the next call - self.bytebuffer = data[decodedbytes:] - # put new characters in the character buffer - self.charbuffer += newchars - # there was no data available - if not newdata: - break - if chars < 0: - # Return everything we've got - result = self.charbuffer - self.charbuffer = self._empty_charbuffer - else: - # Return the first chars characters - result = self.charbuffer[:chars] - self.charbuffer = self.charbuffer[chars:] - return result - - def readline(self, size=None, keepends=True): - - """ Read one line from the input stream and return the - decoded data. - - size, if given, is passed as size argument to the - read() method. - - """ - # If we have lines cached from an earlier read, return - # them unconditionally - if self.linebuffer: - line = self.linebuffer[0] - del self.linebuffer[0] - if len(self.linebuffer) == 1: - # revert to charbuffer mode; we might need more data - # next time - self.charbuffer = self.linebuffer[0] - self.linebuffer = None - if not keepends: - line = line.splitlines(keepends=False)[0] - return line - - readsize = size or 72 - line = self._empty_charbuffer - # If size is given, we call read() only once - while True: - data = self.read(readsize, firstline=True) - if data: - # If we're at a "\r" read one extra character (which might - # be a "\n") to get a proper line ending. If the stream is - # temporarily exhausted we return the wrong line ending. - if (isinstance(data, str) and data.endswith("\r")) or \ - (isinstance(data, bytes) and data.endswith(b"\r")): - data += self.read(size=1, chars=1) - - line += data - lines = line.splitlines(keepends=True) - if lines: - if len(lines) > 1: - # More than one line result; the first line is a full line - # to return - line = lines[0] - del lines[0] - if len(lines) > 1: - # cache the remaining lines - lines[-1] += self.charbuffer - self.linebuffer = lines - self.charbuffer = None - else: - # only one remaining line, put it back into charbuffer - self.charbuffer = lines[0] + self.charbuffer - if not keepends: - line = line.splitlines(keepends=False)[0] - break - line0withend = lines[0] - line0withoutend = lines[0].splitlines(keepends=False)[0] - if line0withend != line0withoutend: # We really have a line end - # Put the rest back together and keep it until the next call - self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \ - self.charbuffer - if keepends: - line = line0withend - else: - line = line0withoutend - break - # we didn't get anything or this was our only try - if not data or size is not None: - if line and not keepends: - line = line.splitlines(keepends=False)[0] - break - if readsize < 8000: - readsize *= 2 - return line - - def readlines(self, sizehint=None, keepends=True): - - """ Read all lines available on the input stream - and return them as a list. - - Line breaks are implemented using the codec's decoder - method and are included in the list entries. - - sizehint, if given, is ignored since there is no efficient - way to finding the true end-of-line. - - """ - data = self.read() - return data.splitlines(keepends) - - def reset(self): - + + Calling this method should ensure that the data on the + output is put into a clean state, that allows appending + of new fresh data without having to rescan the whole + stream to recover state. + + """ + pass + + def seek(self, offset, whence=0): + self.stream.seek(offset, whence) + if whence == 0 and offset == 0: + self.reset() + + def __getattr__(self, name, + getattr=getattr): + + """ Inherit all other methods from the underlying stream. + """ + return getattr(self.stream, name) + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.stream.close() + +### + +class StreamReader(Codec): + + charbuffertype = str + + def __init__(self, stream, errors='strict'): + + """ Creates a StreamReader instance. + + stream must be a file-like object open for reading. + + The StreamReader may use different error handling + schemes by providing the errors keyword argument. These + parameters are predefined: + + 'strict' - raise a ValueError (or a subclass) + 'ignore' - ignore the character and continue with the next + 'replace'- replace with a suitable replacement character + 'backslashreplace' - Replace with backslashed escape sequences; + + The set of allowed parameter values can be extended via + register_error. + """ + self.stream = stream + self.errors = errors + self.bytebuffer = b"" + self._empty_charbuffer = self.charbuffertype() + self.charbuffer = self._empty_charbuffer + self.linebuffer = None + + def decode(self, input, errors='strict'): + raise NotImplementedError + + def read(self, size=-1, chars=-1, firstline=False): + + """ Decodes data from the stream self.stream and returns the + resulting object. + + chars indicates the number of decoded code points or bytes to + return. read() will never return more data than requested, + but it might return less, if there is not enough available. + + size indicates the approximate maximum number of decoded + bytes or code points to read for decoding. The decoder + can modify this setting as appropriate. The default value + -1 indicates to read and decode as much as possible. size + is intended to prevent having to decode huge files in one + step. + + If firstline is true, and a UnicodeDecodeError happens + after the first line terminator in the input only the first line + will be returned, the rest of the input will be kept until the + next call to read(). + + The method should use a greedy read strategy, meaning that + it should read as much data as is allowed within the + definition of the encoding and the given size, e.g. if + optional encoding endings or state markers are available + on the stream, these should be read too. + """ + # If we have lines cached, first merge them back into characters + if self.linebuffer: + self.charbuffer = self._empty_charbuffer.join(self.linebuffer) + self.linebuffer = None + + if chars < 0: + # For compatibility with other read() methods that take a + # single argument + chars = size + + # read until we get the required number of characters (if available) + while True: + # can the request be satisfied from the character buffer? + if chars >= 0: + if len(self.charbuffer) >= chars: + break + # we need more data + if size < 0: + newdata = self.stream.read() + else: + newdata = self.stream.read(size) + # decode bytes (those remaining from the last call included) + data = self.bytebuffer + newdata + if not data: + break + try: + newchars, decodedbytes = self.decode(data, self.errors) + except UnicodeDecodeError as exc: + if firstline: + newchars, decodedbytes = \ + self.decode(data[:exc.start], self.errors) + lines = newchars.splitlines(keepends=True) + if len(lines)<=1: + raise + else: + raise + # keep undecoded bytes until the next call + self.bytebuffer = data[decodedbytes:] + # put new characters in the character buffer + self.charbuffer += newchars + # there was no data available + if not newdata: + break + if chars < 0: + # Return everything we've got + result = self.charbuffer + self.charbuffer = self._empty_charbuffer + else: + # Return the first chars characters + result = self.charbuffer[:chars] + self.charbuffer = self.charbuffer[chars:] + return result + + def readline(self, size=None, keepends=True): + + """ Read one line from the input stream and return the + decoded data. + + size, if given, is passed as size argument to the + read() method. + + """ + # If we have lines cached from an earlier read, return + # them unconditionally + if self.linebuffer: + line = self.linebuffer[0] + del self.linebuffer[0] + if len(self.linebuffer) == 1: + # revert to charbuffer mode; we might need more data + # next time + self.charbuffer = self.linebuffer[0] + self.linebuffer = None + if not keepends: + line = line.splitlines(keepends=False)[0] + return line + + readsize = size or 72 + line = self._empty_charbuffer + # If size is given, we call read() only once + while True: + data = self.read(readsize, firstline=True) + if data: + # If we're at a "\r" read one extra character (which might + # be a "\n") to get a proper line ending. If the stream is + # temporarily exhausted we return the wrong line ending. + if (isinstance(data, str) and data.endswith("\r")) or \ + (isinstance(data, bytes) and data.endswith(b"\r")): + data += self.read(size=1, chars=1) + + line += data + lines = line.splitlines(keepends=True) + if lines: + if len(lines) > 1: + # More than one line result; the first line is a full line + # to return + line = lines[0] + del lines[0] + if len(lines) > 1: + # cache the remaining lines + lines[-1] += self.charbuffer + self.linebuffer = lines + self.charbuffer = None + else: + # only one remaining line, put it back into charbuffer + self.charbuffer = lines[0] + self.charbuffer + if not keepends: + line = line.splitlines(keepends=False)[0] + break + line0withend = lines[0] + line0withoutend = lines[0].splitlines(keepends=False)[0] + if line0withend != line0withoutend: # We really have a line end + # Put the rest back together and keep it until the next call + self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \ + self.charbuffer + if keepends: + line = line0withend + else: + line = line0withoutend + break + # we didn't get anything or this was our only try + if not data or size is not None: + if line and not keepends: + line = line.splitlines(keepends=False)[0] + break + if readsize < 8000: + readsize *= 2 + return line + + def readlines(self, sizehint=None, keepends=True): + + """ Read all lines available on the input stream + and return them as a list. + + Line breaks are implemented using the codec's decoder + method and are included in the list entries. + + sizehint, if given, is ignored since there is no efficient + way to finding the true end-of-line. + + """ + data = self.read() + return data.splitlines(keepends) + + def reset(self): + """ Resets the codec buffers used for keeping internal state. - - Note that no stream repositioning should take place. - This method is primarily intended to be able to recover - from decoding errors. - - """ - self.bytebuffer = b"" - self.charbuffer = self._empty_charbuffer - self.linebuffer = None - - def seek(self, offset, whence=0): - """ Set the input stream's current position. - - Resets the codec buffers used for keeping state. - """ - self.stream.seek(offset, whence) - self.reset() - - def __next__(self): - - """ Return the next decoded line from the input stream.""" - line = self.readline() - if line: - return line - raise StopIteration - - def __iter__(self): - return self - - def __getattr__(self, name, - getattr=getattr): - - """ Inherit all other methods from the underlying stream. - """ - return getattr(self.stream, name) - - def __enter__(self): - return self - - def __exit__(self, type, value, tb): - self.stream.close() - -### - -class StreamReaderWriter: - - """ StreamReaderWriter instances allow wrapping streams which - work in both read and write modes. - - The design is such that one can use the factory functions - returned by the codec.lookup() function to construct the - instance. - - """ - # Optional attributes set by the file wrappers below - encoding = 'unknown' - - def __init__(self, stream, Reader, Writer, errors='strict'): - - """ Creates a StreamReaderWriter instance. - - stream must be a Stream-like object. - - Reader, Writer must be factory functions or classes - providing the StreamReader, StreamWriter interface resp. - - Error handling is done in the same way as defined for the - StreamWriter/Readers. - - """ - self.stream = stream - self.reader = Reader(stream, errors) - self.writer = Writer(stream, errors) - self.errors = errors - - def read(self, size=-1): - - return self.reader.read(size) - - def readline(self, size=None): - - return self.reader.readline(size) - - def readlines(self, sizehint=None): - - return self.reader.readlines(sizehint) - - def __next__(self): - - """ Return the next decoded line from the input stream.""" - return next(self.reader) - - def __iter__(self): - return self - - def write(self, data): - - return self.writer.write(data) - - def writelines(self, list): - - return self.writer.writelines(list) - - def reset(self): - - self.reader.reset() - self.writer.reset() - - def seek(self, offset, whence=0): - self.stream.seek(offset, whence) - self.reader.reset() - if whence == 0 and offset == 0: - self.writer.reset() - - def __getattr__(self, name, - getattr=getattr): - - """ Inherit all other methods from the underlying stream. - """ - return getattr(self.stream, name) - - # these are needed to make "with StreamReaderWriter(...)" work properly - - def __enter__(self): - return self - - def __exit__(self, type, value, tb): - self.stream.close() - -### - -class StreamRecoder: - - """ StreamRecoder instances translate data from one encoding to another. - - They use the complete set of APIs returned by the - codecs.lookup() function to implement their task. - - Data written to the StreamRecoder is first decoded into an - intermediate format (depending on the "decode" codec) and then - written to the underlying stream using an instance of the provided - Writer class. - - In the other direction, data is read from the underlying stream using - a Reader instance and then encoded and returned to the caller. - - """ - # Optional attributes set by the file wrappers below - data_encoding = 'unknown' - file_encoding = 'unknown' - - def __init__(self, stream, encode, decode, Reader, Writer, - errors='strict'): - - """ Creates a StreamRecoder instance which implements a two-way - conversion: encode and decode work on the frontend (the - data visible to .read() and .write()) while Reader and Writer - work on the backend (the data in stream). - - You can use these objects to do transparent - transcodings from e.g. latin-1 to utf-8 and back. - - stream must be a file-like object. - - encode and decode must adhere to the Codec interface; Reader and - Writer must be factory functions or classes providing the - StreamReader and StreamWriter interfaces resp. - - Error handling is done in the same way as defined for the - StreamWriter/Readers. - - """ - self.stream = stream - self.encode = encode - self.decode = decode - self.reader = Reader(stream, errors) - self.writer = Writer(stream, errors) - self.errors = errors - - def read(self, size=-1): - - data = self.reader.read(size) - data, bytesencoded = self.encode(data, self.errors) - return data - - def readline(self, size=None): - - if size is None: - data = self.reader.readline() - else: - data = self.reader.readline(size) - data, bytesencoded = self.encode(data, self.errors) - return data - - def readlines(self, sizehint=None): - - data = self.reader.read() - data, bytesencoded = self.encode(data, self.errors) - return data.splitlines(keepends=True) - - def __next__(self): - - """ Return the next decoded line from the input stream.""" - data = next(self.reader) - data, bytesencoded = self.encode(data, self.errors) - return data - - def __iter__(self): - return self - - def write(self, data): - - data, bytesdecoded = self.decode(data, self.errors) - return self.writer.write(data) - - def writelines(self, list): - + + Note that no stream repositioning should take place. + This method is primarily intended to be able to recover + from decoding errors. + + """ + self.bytebuffer = b"" + self.charbuffer = self._empty_charbuffer + self.linebuffer = None + + def seek(self, offset, whence=0): + """ Set the input stream's current position. + + Resets the codec buffers used for keeping state. + """ + self.stream.seek(offset, whence) + self.reset() + + def __next__(self): + + """ Return the next decoded line from the input stream.""" + line = self.readline() + if line: + return line + raise StopIteration + + def __iter__(self): + return self + + def __getattr__(self, name, + getattr=getattr): + + """ Inherit all other methods from the underlying stream. + """ + return getattr(self.stream, name) + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.stream.close() + +### + +class StreamReaderWriter: + + """ StreamReaderWriter instances allow wrapping streams which + work in both read and write modes. + + The design is such that one can use the factory functions + returned by the codec.lookup() function to construct the + instance. + + """ + # Optional attributes set by the file wrappers below + encoding = 'unknown' + + def __init__(self, stream, Reader, Writer, errors='strict'): + + """ Creates a StreamReaderWriter instance. + + stream must be a Stream-like object. + + Reader, Writer must be factory functions or classes + providing the StreamReader, StreamWriter interface resp. + + Error handling is done in the same way as defined for the + StreamWriter/Readers. + + """ + self.stream = stream + self.reader = Reader(stream, errors) + self.writer = Writer(stream, errors) + self.errors = errors + + def read(self, size=-1): + + return self.reader.read(size) + + def readline(self, size=None): + + return self.reader.readline(size) + + def readlines(self, sizehint=None): + + return self.reader.readlines(sizehint) + + def __next__(self): + + """ Return the next decoded line from the input stream.""" + return next(self.reader) + + def __iter__(self): + return self + + def write(self, data): + + return self.writer.write(data) + + def writelines(self, list): + + return self.writer.writelines(list) + + def reset(self): + + self.reader.reset() + self.writer.reset() + + def seek(self, offset, whence=0): + self.stream.seek(offset, whence) + self.reader.reset() + if whence == 0 and offset == 0: + self.writer.reset() + + def __getattr__(self, name, + getattr=getattr): + + """ Inherit all other methods from the underlying stream. + """ + return getattr(self.stream, name) + + # these are needed to make "with StreamReaderWriter(...)" work properly + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.stream.close() + +### + +class StreamRecoder: + + """ StreamRecoder instances translate data from one encoding to another. + + They use the complete set of APIs returned by the + codecs.lookup() function to implement their task. + + Data written to the StreamRecoder is first decoded into an + intermediate format (depending on the "decode" codec) and then + written to the underlying stream using an instance of the provided + Writer class. + + In the other direction, data is read from the underlying stream using + a Reader instance and then encoded and returned to the caller. + + """ + # Optional attributes set by the file wrappers below + data_encoding = 'unknown' + file_encoding = 'unknown' + + def __init__(self, stream, encode, decode, Reader, Writer, + errors='strict'): + + """ Creates a StreamRecoder instance which implements a two-way + conversion: encode and decode work on the frontend (the + data visible to .read() and .write()) while Reader and Writer + work on the backend (the data in stream). + + You can use these objects to do transparent + transcodings from e.g. latin-1 to utf-8 and back. + + stream must be a file-like object. + + encode and decode must adhere to the Codec interface; Reader and + Writer must be factory functions or classes providing the + StreamReader and StreamWriter interfaces resp. + + Error handling is done in the same way as defined for the + StreamWriter/Readers. + + """ + self.stream = stream + self.encode = encode + self.decode = decode + self.reader = Reader(stream, errors) + self.writer = Writer(stream, errors) + self.errors = errors + + def read(self, size=-1): + + data = self.reader.read(size) + data, bytesencoded = self.encode(data, self.errors) + return data + + def readline(self, size=None): + + if size is None: + data = self.reader.readline() + else: + data = self.reader.readline(size) + data, bytesencoded = self.encode(data, self.errors) + return data + + def readlines(self, sizehint=None): + + data = self.reader.read() + data, bytesencoded = self.encode(data, self.errors) + return data.splitlines(keepends=True) + + def __next__(self): + + """ Return the next decoded line from the input stream.""" + data = next(self.reader) + data, bytesencoded = self.encode(data, self.errors) + return data + + def __iter__(self): + return self + + def write(self, data): + + data, bytesdecoded = self.decode(data, self.errors) + return self.writer.write(data) + + def writelines(self, list): + data = b''.join(list) - data, bytesdecoded = self.decode(data, self.errors) - return self.writer.write(data) - - def reset(self): - - self.reader.reset() - self.writer.reset() - + data, bytesdecoded = self.decode(data, self.errors) + return self.writer.write(data) + + def reset(self): + + self.reader.reset() + self.writer.reset() + def seek(self, offset, whence=0): # Seeks must be propagated to both the readers and writers # as they might need to reset their internal buffers. self.reader.seek(offset, whence) self.writer.seek(offset, whence) - def __getattr__(self, name, - getattr=getattr): - - """ Inherit all other methods from the underlying stream. - """ - return getattr(self.stream, name) - - def __enter__(self): - return self - - def __exit__(self, type, value, tb): - self.stream.close() - -### Shortcuts - + def __getattr__(self, name, + getattr=getattr): + + """ Inherit all other methods from the underlying stream. + """ + return getattr(self.stream, name) + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.stream.close() + +### Shortcuts + def open(filename, mode='r', encoding=None, errors='strict', buffering=-1): - - """ Open an encoded file using the given mode and return - a wrapped version providing transparent encoding/decoding. - - Note: The wrapped version will only accept the object format - defined by the codecs, i.e. Unicode objects for most builtin - codecs. Output is also codec dependent and will usually be - Unicode as well. - - Underlying encoded files are always opened in binary mode. - The default file mode is 'r', meaning to open the file in read mode. - - encoding specifies the encoding which is to be used for the - file. - - errors may be given to define the error handling. It defaults - to 'strict' which causes ValueErrors to be raised in case an - encoding error occurs. - - buffering has the same meaning as for the builtin open() API. + + """ Open an encoded file using the given mode and return + a wrapped version providing transparent encoding/decoding. + + Note: The wrapped version will only accept the object format + defined by the codecs, i.e. Unicode objects for most builtin + codecs. Output is also codec dependent and will usually be + Unicode as well. + + Underlying encoded files are always opened in binary mode. + The default file mode is 'r', meaning to open the file in read mode. + + encoding specifies the encoding which is to be used for the + file. + + errors may be given to define the error handling. It defaults + to 'strict' which causes ValueErrors to be raised in case an + encoding error occurs. + + buffering has the same meaning as for the builtin open() API. It defaults to -1 which means that the default buffer size will be used. - - The returned wrapped file object provides an extra attribute - .encoding which allows querying the used encoding. This - attribute is only available if an encoding was specified as - parameter. - - """ - if encoding is not None and \ - 'b' not in mode: - # Force opening of the file in binary mode - mode = mode + 'b' - file = builtins.open(filename, mode, buffering) - if encoding is None: - return file - + + The returned wrapped file object provides an extra attribute + .encoding which allows querying the used encoding. This + attribute is only available if an encoding was specified as + parameter. + + """ + if encoding is not None and \ + 'b' not in mode: + # Force opening of the file in binary mode + mode = mode + 'b' + file = builtins.open(filename, mode, buffering) + if encoding is None: + return file + try: info = lookup(encoding) srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) @@ -916,211 +916,211 @@ def open(filename, mode='r', encoding=None, errors='strict', buffering=-1): file.close() raise -def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): - - """ Return a wrapped version of file which provides transparent - encoding translation. - - Data written to the wrapped file is decoded according - to the given data_encoding and then encoded to the underlying - file using file_encoding. The intermediate data type - will usually be Unicode but depends on the specified codecs. - - Bytes read from the file are decoded using file_encoding and then - passed back to the caller encoded using data_encoding. - - If file_encoding is not given, it defaults to data_encoding. - - errors may be given to define the error handling. It defaults - to 'strict' which causes ValueErrors to be raised in case an - encoding error occurs. - - The returned wrapped file object provides two extra attributes - .data_encoding and .file_encoding which reflect the given - parameters of the same name. The attributes can be used for - introspection by Python programs. - - """ - if file_encoding is None: - file_encoding = data_encoding - data_info = lookup(data_encoding) - file_info = lookup(file_encoding) - sr = StreamRecoder(file, data_info.encode, data_info.decode, - file_info.streamreader, file_info.streamwriter, errors) - # Add attributes to simplify introspection - sr.data_encoding = data_encoding - sr.file_encoding = file_encoding - return sr - -### Helpers for codec lookup - -def getencoder(encoding): - - """ Lookup up the codec for the given encoding and return - its encoder function. - - Raises a LookupError in case the encoding cannot be found. - - """ - return lookup(encoding).encode - -def getdecoder(encoding): - - """ Lookup up the codec for the given encoding and return - its decoder function. - - Raises a LookupError in case the encoding cannot be found. - - """ - return lookup(encoding).decode - -def getincrementalencoder(encoding): - - """ Lookup up the codec for the given encoding and return - its IncrementalEncoder class or factory function. - - Raises a LookupError in case the encoding cannot be found - or the codecs doesn't provide an incremental encoder. - - """ - encoder = lookup(encoding).incrementalencoder - if encoder is None: - raise LookupError(encoding) - return encoder - -def getincrementaldecoder(encoding): - - """ Lookup up the codec for the given encoding and return - its IncrementalDecoder class or factory function. - - Raises a LookupError in case the encoding cannot be found - or the codecs doesn't provide an incremental decoder. - - """ - decoder = lookup(encoding).incrementaldecoder - if decoder is None: - raise LookupError(encoding) - return decoder - -def getreader(encoding): - - """ Lookup up the codec for the given encoding and return - its StreamReader class or factory function. - - Raises a LookupError in case the encoding cannot be found. - - """ - return lookup(encoding).streamreader - -def getwriter(encoding): - - """ Lookup up the codec for the given encoding and return - its StreamWriter class or factory function. - - Raises a LookupError in case the encoding cannot be found. - - """ - return lookup(encoding).streamwriter - -def iterencode(iterator, encoding, errors='strict', **kwargs): - """ - Encoding iterator. - - Encodes the input strings from the iterator using an IncrementalEncoder. - - errors and kwargs are passed through to the IncrementalEncoder - constructor. - """ - encoder = getincrementalencoder(encoding)(errors, **kwargs) - for input in iterator: - output = encoder.encode(input) - if output: - yield output - output = encoder.encode("", True) - if output: - yield output - -def iterdecode(iterator, encoding, errors='strict', **kwargs): - """ - Decoding iterator. - - Decodes the input strings from the iterator using an IncrementalDecoder. - - errors and kwargs are passed through to the IncrementalDecoder - constructor. - """ - decoder = getincrementaldecoder(encoding)(errors, **kwargs) - for input in iterator: - output = decoder.decode(input) - if output: - yield output - output = decoder.decode(b"", True) - if output: - yield output - -### Helpers for charmap-based codecs - -def make_identity_dict(rng): - - """ make_identity_dict(rng) -> dict - - Return a dictionary where elements of the rng sequence are - mapped to themselves. - - """ - return {i:i for i in rng} - -def make_encoding_map(decoding_map): - - """ Creates an encoding map from a decoding map. - - If a target mapping in the decoding map occurs multiple - times, then that target is mapped to None (undefined mapping), - causing an exception when encountered by the charmap codec - during translation. - - One example where this happens is cp875.py which decodes - multiple character to \\u001a. - - """ - m = {} - for k,v in decoding_map.items(): - if not v in m: - m[v] = k - else: - m[v] = None - return m - -### error handlers - -try: - strict_errors = lookup_error("strict") - ignore_errors = lookup_error("ignore") - replace_errors = lookup_error("replace") - xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") - backslashreplace_errors = lookup_error("backslashreplace") - namereplace_errors = lookup_error("namereplace") -except LookupError: - # In --disable-unicode builds, these error handler are missing - strict_errors = None - ignore_errors = None - replace_errors = None - xmlcharrefreplace_errors = None - backslashreplace_errors = None - namereplace_errors = None - -# Tell modulefinder that using codecs probably needs the encodings -# package -_false = 0 -if _false: - import encodings - -### Tests - -if __name__ == '__main__': - - # Make stdout translate Latin-1 output into UTF-8 output - sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') - - # Have stdin translate Latin-1 input into UTF-8 input - sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') +def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): + + """ Return a wrapped version of file which provides transparent + encoding translation. + + Data written to the wrapped file is decoded according + to the given data_encoding and then encoded to the underlying + file using file_encoding. The intermediate data type + will usually be Unicode but depends on the specified codecs. + + Bytes read from the file are decoded using file_encoding and then + passed back to the caller encoded using data_encoding. + + If file_encoding is not given, it defaults to data_encoding. + + errors may be given to define the error handling. It defaults + to 'strict' which causes ValueErrors to be raised in case an + encoding error occurs. + + The returned wrapped file object provides two extra attributes + .data_encoding and .file_encoding which reflect the given + parameters of the same name. The attributes can be used for + introspection by Python programs. + + """ + if file_encoding is None: + file_encoding = data_encoding + data_info = lookup(data_encoding) + file_info = lookup(file_encoding) + sr = StreamRecoder(file, data_info.encode, data_info.decode, + file_info.streamreader, file_info.streamwriter, errors) + # Add attributes to simplify introspection + sr.data_encoding = data_encoding + sr.file_encoding = file_encoding + return sr + +### Helpers for codec lookup + +def getencoder(encoding): + + """ Lookup up the codec for the given encoding and return + its encoder function. + + Raises a LookupError in case the encoding cannot be found. + + """ + return lookup(encoding).encode + +def getdecoder(encoding): + + """ Lookup up the codec for the given encoding and return + its decoder function. + + Raises a LookupError in case the encoding cannot be found. + + """ + return lookup(encoding).decode + +def getincrementalencoder(encoding): + + """ Lookup up the codec for the given encoding and return + its IncrementalEncoder class or factory function. + + Raises a LookupError in case the encoding cannot be found + or the codecs doesn't provide an incremental encoder. + + """ + encoder = lookup(encoding).incrementalencoder + if encoder is None: + raise LookupError(encoding) + return encoder + +def getincrementaldecoder(encoding): + + """ Lookup up the codec for the given encoding and return + its IncrementalDecoder class or factory function. + + Raises a LookupError in case the encoding cannot be found + or the codecs doesn't provide an incremental decoder. + + """ + decoder = lookup(encoding).incrementaldecoder + if decoder is None: + raise LookupError(encoding) + return decoder + +def getreader(encoding): + + """ Lookup up the codec for the given encoding and return + its StreamReader class or factory function. + + Raises a LookupError in case the encoding cannot be found. + + """ + return lookup(encoding).streamreader + +def getwriter(encoding): + + """ Lookup up the codec for the given encoding and return + its StreamWriter class or factory function. + + Raises a LookupError in case the encoding cannot be found. + + """ + return lookup(encoding).streamwriter + +def iterencode(iterator, encoding, errors='strict', **kwargs): + """ + Encoding iterator. + + Encodes the input strings from the iterator using an IncrementalEncoder. + + errors and kwargs are passed through to the IncrementalEncoder + constructor. + """ + encoder = getincrementalencoder(encoding)(errors, **kwargs) + for input in iterator: + output = encoder.encode(input) + if output: + yield output + output = encoder.encode("", True) + if output: + yield output + +def iterdecode(iterator, encoding, errors='strict', **kwargs): + """ + Decoding iterator. + + Decodes the input strings from the iterator using an IncrementalDecoder. + + errors and kwargs are passed through to the IncrementalDecoder + constructor. + """ + decoder = getincrementaldecoder(encoding)(errors, **kwargs) + for input in iterator: + output = decoder.decode(input) + if output: + yield output + output = decoder.decode(b"", True) + if output: + yield output + +### Helpers for charmap-based codecs + +def make_identity_dict(rng): + + """ make_identity_dict(rng) -> dict + + Return a dictionary where elements of the rng sequence are + mapped to themselves. + + """ + return {i:i for i in rng} + +def make_encoding_map(decoding_map): + + """ Creates an encoding map from a decoding map. + + If a target mapping in the decoding map occurs multiple + times, then that target is mapped to None (undefined mapping), + causing an exception when encountered by the charmap codec + during translation. + + One example where this happens is cp875.py which decodes + multiple character to \\u001a. + + """ + m = {} + for k,v in decoding_map.items(): + if not v in m: + m[v] = k + else: + m[v] = None + return m + +### error handlers + +try: + strict_errors = lookup_error("strict") + ignore_errors = lookup_error("ignore") + replace_errors = lookup_error("replace") + xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") + backslashreplace_errors = lookup_error("backslashreplace") + namereplace_errors = lookup_error("namereplace") +except LookupError: + # In --disable-unicode builds, these error handler are missing + strict_errors = None + ignore_errors = None + replace_errors = None + xmlcharrefreplace_errors = None + backslashreplace_errors = None + namereplace_errors = None + +# Tell modulefinder that using codecs probably needs the encodings +# package +_false = 0 +if _false: + import encodings + +### Tests + +if __name__ == '__main__': + + # Make stdout translate Latin-1 output into UTF-8 output + sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') + + # Have stdin translate Latin-1 input into UTF-8 input + sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') |