diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/python/strings/strings.py | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/python/strings/strings.py')
-rw-r--r-- | library/python/strings/strings.py | 129 |
1 files changed, 129 insertions, 0 deletions
# Reconstructed from the diff hunk:
#   diff --git a/library/python/strings/strings.py b/library/python/strings/strings.py
#   new file mode 100644, index 0000000000..5bfddfe78a, @@ -0,0 +1,129 @@
"""Helpers for converting values between text and byte strings.

The module is written to run on both Python 2 and Python 3 (via ``six``).
"""

import codecs
import locale
import logging
import sys

import six

import library.python.func

logger = logging.getLogger(__name__)


DEFAULT_ENCODING = 'utf-8'
ENCODING_ERRORS_POLICY = 'replace'


def left_strip(el, prefix):
    """
    Strips prefix at the left of el

    Returns el unchanged when it does not start with prefix.
    """
    if el.startswith(prefix):
        return el[len(prefix):]
    return el


# Explicit to-text conversion
# Chooses between str/unicode, i.e. six.binary_type/six.text_type
def to_basestring(value):
    """
    Return value as a string object.

    Byte strings and text strings are returned untouched; any other object
    is converted with six.text_type (unicode on Python 2, str on Python 3),
    falling back to str() and finally repr() if the conversion hits
    encoding errors.
    """
    if isinstance(value, (six.binary_type, six.text_type)):
        return value
    try:
        # six.text_type is `unicode` on Python 2 and `str` on Python 3,
        # so no explicit version switch is needed.
        return six.text_type(value)
    except UnicodeDecodeError:
        try:
            return str(value)
        except UnicodeEncodeError:
            return repr(value)


to_text = to_basestring


def to_unicode(value, from_enc=DEFAULT_ENCODING):
    """
    Return value as a text string (six.text_type).

    Byte strings are decoded from from_enc using ENCODING_ERRORS_POLICY;
    text strings are returned as is; other objects go through six.text_type.
    """
    if isinstance(value, six.text_type):
        return value
    if isinstance(value, six.binary_type):
        # Positional `errors` works identically on Python 2 and Python 3.
        return value.decode(from_enc, ENCODING_ERRORS_POLICY)
    return six.text_type(value)


# Optional from_enc enables transcoding
def to_str(value, to_enc=DEFAULT_ENCODING, from_enc=None):
    """
    Return value as a byte string (six.binary_type).

    :param value: object to convert
    :param to_enc: encoding of the resulting byte string
    :param from_enc: encoding of value when it is a byte string; if given and
        different from to_enc, the bytes are transcoded
    """
    if isinstance(value, six.binary_type):
        if from_enc is None or to_enc == from_enc:
            # Unknown input encoding or input and output encoding are the same
            return value
        value = to_unicode(value, from_enc=from_enc)
    if isinstance(value, six.text_type):
        return value.encode(to_enc, ENCODING_ERRORS_POLICY)
    if six.PY2:
        return six.binary_type(value)
    # On Python 3 bytes(5) is five NUL bytes and bytes(None) raises TypeError,
    # so only bytes-like objects may go through the bytes() constructor.
    # Everything else is stringified first, matching Python 2's str(value).
    if isinstance(value, (bytearray, memoryview)) or hasattr(value, '__bytes__'):
        return six.binary_type(value)
    return six.text_type(value).encode(to_enc, ENCODING_ERRORS_POLICY)


def _convert_deep(x, enc, convert, relaxed=True):
    """
    Recursively apply convert(item, enc) to strings inside dicts, lists and tuples.

    Dict keys are always passed through convert; values are processed
    recursively. None is returned as None. Objects of any other type are
    returned unchanged when relaxed is true, otherwise TypeError is raised.
    """
    if x is None:
        return None
    if isinstance(x, (six.text_type, six.binary_type)):
        return convert(x, enc)
    if isinstance(x, dict):
        return {convert(k, enc): _convert_deep(v, enc, convert, relaxed) for k, v in six.iteritems(x)}
    if isinstance(x, list):
        return [_convert_deep(e, enc, convert, relaxed) for e in x]
    if isinstance(x, tuple):
        return tuple([_convert_deep(e, enc, convert, relaxed) for e in x])

    if relaxed:
        return x
    raise TypeError('unsupported type')


def unicodize_deep(x, enc=DEFAULT_ENCODING, relaxed=True):
    """Convert every string found in a nested structure to text (see to_unicode)."""
    return _convert_deep(x, enc, to_unicode, relaxed)


def stringize_deep(x, enc=DEFAULT_ENCODING, relaxed=True):
    """Convert every string found in a nested structure to bytes (see to_str)."""
    return _convert_deep(x, enc, to_str, relaxed)


@library.python.func.memoize()
def locale_encoding():
    """
    Return the encoding reported by locale.getdefaultlocale().

    Returns None when the locale cannot be determined or names a codec
    unknown to Python; a falsy value is returned when no encoding is set.
    """
    try:
        loc = locale.getdefaultlocale()[1]
        if loc:
            # Raises LookupError if Python has no codec with this name
            codecs.lookup(loc)
        return loc
    except LookupError as e:
        logger.debug('Cannot get system locale: %s', e)
        return None
    except ValueError as e:
        # Logger.warn is a deprecated alias of Logger.warning
        logger.warning('Cannot get system locale: %s', e)
        return None


def fs_encoding():
    """Return the encoding reported by sys.getfilesystemencoding()."""
    return sys.getfilesystemencoding()


def guess_default_encoding():
    """Return the locale encoding if it is known, DEFAULT_ENCODING otherwise."""
    enc = locale_encoding()
    return enc if enc else DEFAULT_ENCODING


@library.python.func.memoize()
def get_stream_encoding(stream):
    """
    Return stream.encoding if it names a known codec, DEFAULT_ENCODING otherwise.

    Streams that have no `encoding` attribute at all are treated
    the same way as streams whose encoding is empty.
    """
    stream_enc = getattr(stream, 'encoding', None)
    if stream_enc:
        try:
            codecs.lookup(stream_enc)
            return stream_enc
        except LookupError:
            pass
    return DEFAULT_ENCODING


def encode(value, encoding=DEFAULT_ENCODING):
    """
    Return value encoded to bytes in the given encoding.

    Byte strings are first decoded with the same encoding, silently dropping
    undecodable bytes, and then encoded back.
    """
    if isinstance(value, six.binary_type):
        value = value.decode(encoding, errors='ignore')
    return value.encode(encoding)