summaryrefslogtreecommitdiffstats
path: root/contrib/tools/python3/Lib/encodings/utf_8_sig.py
diff options
context:
space:
mode:
authorthegeorg <[email protected]>2024-02-19 02:38:52 +0300
committerthegeorg <[email protected]>2024-02-19 02:50:43 +0300
commitd96fa07134c06472bfee6718b5cfd1679196fc99 (patch)
tree31ec344fa9d3ff8dc038692516b6438dfbdb8a2d /contrib/tools/python3/Lib/encodings/utf_8_sig.py
parent452cf9e068aef7110e35e654c5d47eb80111ef89 (diff)
Sync contrib/tools/python3 layout with upstream
* Move src/ subdir contents to the top of the layout * Rename self-written lib -> lib2 to avoid CaseFolding warning from the VCS * Regenerate contrib/libs/python proxy-headers accordingly 4ccc62ac1511abcf0fed14ccade38e984e088f1e
Diffstat (limited to 'contrib/tools/python3/Lib/encodings/utf_8_sig.py')
-rw-r--r--contrib/tools/python3/Lib/encodings/utf_8_sig.py130
1 files changed, 130 insertions, 0 deletions
diff --git a/contrib/tools/python3/Lib/encodings/utf_8_sig.py b/contrib/tools/python3/Lib/encodings/utf_8_sig.py
new file mode 100644
index 00000000000..1bb479203f3
--- /dev/null
+++ b/contrib/tools/python3/Lib/encodings/utf_8_sig.py
@@ -0,0 +1,130 @@
+""" Python 'utf-8-sig' Codec
+This work similar to UTF-8 with the following changes:
+
+* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
+ first three bytes.
+
+* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
+ bytes will be skipped.
+"""
+import codecs
+
+### Codec APIs
+
+def encode(input, errors='strict'):
+ return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
+ len(input))
+
+def decode(input, errors='strict'):
+ prefix = 0
+ if input[:3] == codecs.BOM_UTF8:
+ input = input[3:]
+ prefix = 3
+ (output, consumed) = codecs.utf_8_decode(input, errors, True)
+ return (output, consumed+prefix)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+ def __init__(self, errors='strict'):
+ codecs.IncrementalEncoder.__init__(self, errors)
+ self.first = 1
+
+ def encode(self, input, final=False):
+ if self.first:
+ self.first = 0
+ return codecs.BOM_UTF8 + \
+ codecs.utf_8_encode(input, self.errors)[0]
+ else:
+ return codecs.utf_8_encode(input, self.errors)[0]
+
+ def reset(self):
+ codecs.IncrementalEncoder.reset(self)
+ self.first = 1
+
+ def getstate(self):
+ return self.first
+
+ def setstate(self, state):
+ self.first = state
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ def __init__(self, errors='strict'):
+ codecs.BufferedIncrementalDecoder.__init__(self, errors)
+ self.first = 1
+
+ def _buffer_decode(self, input, errors, final):
+ if self.first:
+ if len(input) < 3:
+ if codecs.BOM_UTF8.startswith(input):
+ # not enough data to decide if this really is a BOM
+ # => try again on the next call
+ return ("", 0)
+ else:
+ self.first = 0
+ else:
+ self.first = 0
+ if input[:3] == codecs.BOM_UTF8:
+ (output, consumed) = \
+ codecs.utf_8_decode(input[3:], errors, final)
+ return (output, consumed+3)
+ return codecs.utf_8_decode(input, errors, final)
+
+ def reset(self):
+ codecs.BufferedIncrementalDecoder.reset(self)
+ self.first = 1
+
+ def getstate(self):
+ state = codecs.BufferedIncrementalDecoder.getstate(self)
+ # state[1] must be 0 here, as it isn't passed along to the caller
+ return (state[0], self.first)
+
+ def setstate(self, state):
+ # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
+ codecs.BufferedIncrementalDecoder.setstate(self, state)
+ self.first = state[1]
+
+class StreamWriter(codecs.StreamWriter):
+ def reset(self):
+ codecs.StreamWriter.reset(self)
+ try:
+ del self.encode
+ except AttributeError:
+ pass
+
+ def encode(self, input, errors='strict'):
+ self.encode = codecs.utf_8_encode
+ return encode(input, errors)
+
+class StreamReader(codecs.StreamReader):
+ def reset(self):
+ codecs.StreamReader.reset(self)
+ try:
+ del self.decode
+ except AttributeError:
+ pass
+
+ def decode(self, input, errors='strict'):
+ if len(input) < 3:
+ if codecs.BOM_UTF8.startswith(input):
+ # not enough data to decide if this is a BOM
+ # => try again on the next call
+ return ("", 0)
+ elif input[:3] == codecs.BOM_UTF8:
+ self.decode = codecs.utf_8_decode
+ (output, consumed) = codecs.utf_8_decode(input[3:],errors)
+ return (output, consumed+3)
+ # (else) no BOM present
+ self.decode = codecs.utf_8_decode
+ return codecs.utf_8_decode(input, errors)
+
+### encodings module API
+
+def getregentry():
+ return codecs.CodecInfo(
+ name='utf-8-sig',
+ encode=encode,
+ decode=decode,
+ incrementalencoder=IncrementalEncoder,
+ incrementaldecoder=IncrementalDecoder,
+ streamreader=StreamReader,
+ streamwriter=StreamWriter,
+ )