diff options
author | AlexSm <alex@ydb.tech> | 2024-03-05 10:40:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-05 12:40:59 +0300 |
commit | 1ac13c847b5358faba44dbb638a828e24369467b (patch) | |
tree | 07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Lib/encodings/utf_8_sig.py | |
parent | ffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff) | |
download | ydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz |
Library import 16 (#2433)
Co-authored-by: robot-piglet <robot-piglet@yandex-team.com>
Co-authored-by: deshevoy <deshevoy@yandex-team.com>
Co-authored-by: robot-contrib <robot-contrib@yandex-team.com>
Co-authored-by: thegeorg <thegeorg@yandex-team.com>
Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com>
Co-authored-by: svidyuk <svidyuk@yandex-team.com>
Co-authored-by: shadchin <shadchin@yandex-team.com>
Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com>
Co-authored-by: innokentii <innokentii@yandex-team.com>
Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com>
Co-authored-by: snermolaev <snermolaev@yandex-team.com>
Co-authored-by: dimdim11 <dimdim11@yandex-team.com>
Co-authored-by: kickbutt <kickbutt@yandex-team.com>
Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com>
Co-authored-by: korsunandrei <korsunandrei@yandex-team.com>
Co-authored-by: petrk <petrk@yandex-team.com>
Co-authored-by: miroslav2 <miroslav2@yandex-team.com>
Co-authored-by: serjflint <serjflint@yandex-team.com>
Co-authored-by: akhropov <akhropov@yandex-team.com>
Co-authored-by: prettyboy <prettyboy@yandex-team.com>
Co-authored-by: ilikepugs <ilikepugs@yandex-team.com>
Co-authored-by: hiddenpath <hiddenpath@yandex-team.com>
Co-authored-by: mikhnenko <mikhnenko@yandex-team.com>
Co-authored-by: spreis <spreis@yandex-team.com>
Co-authored-by: andreyshspb <andreyshspb@yandex-team.com>
Co-authored-by: dimaandreev <dimaandreev@yandex-team.com>
Co-authored-by: rashid <rashid@yandex-team.com>
Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com>
Co-authored-by: r-vetrov <r-vetrov@yandex-team.com>
Co-authored-by: ypodlesov <ypodlesov@yandex-team.com>
Co-authored-by: zaverden <zaverden@yandex-team.com>
Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com>
Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com>
Co-authored-by: v-korovin <v-korovin@yandex-team.com>
Co-authored-by: arikon <arikon@yandex-team.com>
Co-authored-by: khoden <khoden@yandex-team.com>
Co-authored-by: psydmm <psydmm@yandex-team.com>
Co-authored-by: robot-javacom <robot-javacom@yandex-team.com>
Co-authored-by: dtorilov <dtorilov@yandex-team.com>
Co-authored-by: sennikovmv <sennikovmv@yandex-team.com>
Co-authored-by: hcpp <hcpp@ydb.tech>
Diffstat (limited to 'contrib/tools/python3/Lib/encodings/utf_8_sig.py')
-rw-r--r-- | contrib/tools/python3/Lib/encodings/utf_8_sig.py | 130 |
1 files changed, 130 insertions, 0 deletions
diff --git a/contrib/tools/python3/Lib/encodings/utf_8_sig.py b/contrib/tools/python3/Lib/encodings/utf_8_sig.py new file mode 100644 index 0000000000..1bb479203f --- /dev/null +++ b/contrib/tools/python3/Lib/encodings/utf_8_sig.py @@ -0,0 +1,130 @@ +""" Python 'utf-8-sig' Codec +This work similar to UTF-8 with the following changes: + +* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the + first three bytes. + +* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these + bytes will be skipped. +""" +import codecs + +### Codec APIs + +def encode(input, errors='strict'): + return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], + len(input)) + +def decode(input, errors='strict'): + prefix = 0 + if input[:3] == codecs.BOM_UTF8: + input = input[3:] + prefix = 3 + (output, consumed) = codecs.utf_8_decode(input, errors, True) + return (output, consumed+prefix) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def __init__(self, errors='strict'): + codecs.IncrementalEncoder.__init__(self, errors) + self.first = 1 + + def encode(self, input, final=False): + if self.first: + self.first = 0 + return codecs.BOM_UTF8 + \ + codecs.utf_8_encode(input, self.errors)[0] + else: + return codecs.utf_8_encode(input, self.errors)[0] + + def reset(self): + codecs.IncrementalEncoder.reset(self) + self.first = 1 + + def getstate(self): + return self.first + + def setstate(self, state): + self.first = state + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def __init__(self, errors='strict'): + codecs.BufferedIncrementalDecoder.__init__(self, errors) + self.first = 1 + + def _buffer_decode(self, input, errors, final): + if self.first: + if len(input) < 3: + if codecs.BOM_UTF8.startswith(input): + # not enough data to decide if this really is a BOM + # => try again on the next call + return ("", 0) + else: + self.first = 0 + else: + self.first = 0 + if input[:3] == codecs.BOM_UTF8: + (output, consumed) = \ + codecs.utf_8_decode(input[3:], errors, final) + return (output, consumed+3) + return codecs.utf_8_decode(input, errors, final) + + def reset(self): + codecs.BufferedIncrementalDecoder.reset(self) + self.first = 1 + + def getstate(self): + state = codecs.BufferedIncrementalDecoder.getstate(self) + # state[1] must be 0 here, as it isn't passed along to the caller + return (state[0], self.first) + + def setstate(self, state): + # state[1] will be ignored by BufferedIncrementalDecoder.setstate() + codecs.BufferedIncrementalDecoder.setstate(self, state) + self.first = state[1] + +class StreamWriter(codecs.StreamWriter): + def reset(self): + codecs.StreamWriter.reset(self) + try: + del self.encode + except AttributeError: + pass + + def encode(self, input, errors='strict'): + self.encode = codecs.utf_8_encode + return encode(input, errors) + +class StreamReader(codecs.StreamReader): + def reset(self): + codecs.StreamReader.reset(self) + try: + del self.decode + except AttributeError: + pass + + def decode(self, input, errors='strict'): + if len(input) < 3: + if codecs.BOM_UTF8.startswith(input): + # not enough data to decide if this is a BOM + # => try again on the next call + return ("", 0) + elif input[:3] == codecs.BOM_UTF8: + self.decode = codecs.utf_8_decode + (output, consumed) = codecs.utf_8_decode(input[3:],errors) + return (output, consumed+3) + # (else) no BOM present + self.decode = codecs.utf_8_decode + return codecs.utf_8_decode(input, errors) + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='utf-8-sig', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) |