diff options
| author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 |
|---|---|---|
| committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 |
| commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
| tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/python/strings | |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/python/strings')
| -rw-r--r-- | library/python/strings/__init__.py | 17 | ||||
| -rw-r--r-- | library/python/strings/strings.py | 129 | ||||
| -rw-r--r-- | library/python/strings/ut/test_strings.py | 205 | ||||
| -rw-r--r-- | library/python/strings/ut/ya.make | 11 | ||||
| -rw-r--r-- | library/python/strings/ya.make | 16 |
5 files changed, 378 insertions, 0 deletions
diff --git a/library/python/strings/__init__.py b/library/python/strings/__init__.py new file mode 100644 index 00000000000..bd6bf6e7cee --- /dev/null +++ b/library/python/strings/__init__.py @@ -0,0 +1,17 @@ +# flake8 noqa: F401 + +from .strings import ( + DEFAULT_ENCODING, + ENCODING_ERRORS_POLICY, + encode, + fs_encoding, + get_stream_encoding, + guess_default_encoding, + left_strip, + locale_encoding, + stringize_deep, + to_basestring, + to_str, + to_unicode, + unicodize_deep, +) diff --git a/library/python/strings/strings.py b/library/python/strings/strings.py new file mode 100644 index 00000000000..5bfddfe78ae --- /dev/null +++ b/library/python/strings/strings.py @@ -0,0 +1,129 @@ +import locale +import logging +import six +import sys +import codecs + +import library.python.func + +logger = logging.getLogger(__name__) + + +DEFAULT_ENCODING = 'utf-8' +ENCODING_ERRORS_POLICY = 'replace' + + +def left_strip(el, prefix): + """ + Strips prefix at the left of el + """ + if el.startswith(prefix): + return el[len(prefix):] + return el + + +# Explicit to-text conversion +# Chooses between str/unicode, i.e. six.binary_type/six.text_type +def to_basestring(value): + if isinstance(value, (six.binary_type, six.text_type)): + return value + try: + if six.PY2: + return unicode(value) + else: + return str(value) + except UnicodeDecodeError: + try: + return str(value) + except UnicodeEncodeError: + return repr(value) +to_text = to_basestring + + +def to_unicode(value, from_enc=DEFAULT_ENCODING): + if isinstance(value, six.text_type): + return value + if isinstance(value, six.binary_type): + if six.PY2: + return unicode(value, from_enc, ENCODING_ERRORS_POLICY) + else: + return value.decode(from_enc, errors=ENCODING_ERRORS_POLICY) + return six.text_type(value) + + +# Optional from_enc enables transcoding +def to_str(value, to_enc=DEFAULT_ENCODING, from_enc=None): + if isinstance(value, six.binary_type): + if from_enc is None or to_enc == from_enc: + # Unknown input encoding or input and output encoding are the same + return value + value = to_unicode(value, from_enc=from_enc) + if isinstance(value, six.text_type): + return value.encode(to_enc, ENCODING_ERRORS_POLICY) + return six.binary_type(value) + + +def _convert_deep(x, enc, convert, relaxed=True): + if x is None: + return None + if isinstance(x, (six.text_type, six.binary_type)): + return convert(x, enc) + if isinstance(x, dict): + return {convert(k, enc): _convert_deep(v, enc, convert, relaxed) for k, v in six.iteritems(x)} + if isinstance(x, list): + return [_convert_deep(e, enc, convert, relaxed) for e in x] + if isinstance(x, tuple): + return tuple([_convert_deep(e, enc, convert, relaxed) for e in x]) + + if relaxed: + return x + raise TypeError('unsupported type') + + +def unicodize_deep(x, enc=DEFAULT_ENCODING, relaxed=True): + return _convert_deep(x, enc, to_unicode, relaxed) + + +def stringize_deep(x, enc=DEFAULT_ENCODING, relaxed=True): + return _convert_deep(x, enc, to_str, relaxed) + + +def locale_encoding(): + try: + loc = locale.getdefaultlocale()[1] + if loc: + codecs.lookup(loc) + return loc + except LookupError as e: + logger.debug('Cannot get system locale: %s', e) + return None + except ValueError as e: + logger.warn('Cannot get system locale: %s', e) + return None + + +def fs_encoding(): + return sys.getfilesystemencoding() + + +def guess_default_encoding(): + enc = locale_encoding() + return enc if enc else DEFAULT_ENCODING + + +def get_stream_encoding(stream): + if stream.encoding: + try: + codecs.lookup(stream.encoding) + return stream.encoding + except LookupError: + pass + return DEFAULT_ENCODING + + +def encode(value, encoding=DEFAULT_ENCODING): + if isinstance(value, six.binary_type): + value = value.decode(encoding, errors='ignore') + return value.encode(encoding) diff --git a/library/python/strings/ut/test_strings.py b/library/python/strings/ut/test_strings.py new file mode 100644 index 00000000000..dd0c694ee1d --- /dev/null +++ b/library/python/strings/ut/test_strings.py @@ -0,0 +1,205 @@ +# coding=utf-8 + +import pytest +import six + +import library.python.strings + + +class Convertible(object): + text = u'текст' + text_utf8 = text.encode('utf-8') + + def __unicode__(self): + return self.text + + def __str__(self): + return self.text_utf8 + + +class ConvertibleToUnicodeOnly(Convertible): + def __str__(self): + return self.text.encode('ascii') + + +class ConvertibleToStrOnly(Convertible): + def __unicode__(self): + return self.text_utf8.decode('ascii') + + +class NonConvertible(ConvertibleToUnicodeOnly, ConvertibleToStrOnly): + pass + + +def test_to_basestring(): + assert library.python.strings.to_basestring('str') == 'str' + assert library.python.strings.to_basestring(u'юникод') == u'юникод' + if six.PY2: # __str__ should return str not bytes in Python3 + assert library.python.strings.to_basestring(Convertible()) == Convertible.text + assert library.python.strings.to_basestring(ConvertibleToUnicodeOnly()) == Convertible.text + assert library.python.strings.to_basestring(ConvertibleToStrOnly()) == Convertible.text_utf8 + assert library.python.strings.to_basestring(NonConvertible()) + + +def test_to_unicode(): + assert library.python.strings.to_unicode(u'юникод') == u'юникод' + assert library.python.strings.to_unicode('str') == u'str' + assert library.python.strings.to_unicode(u'строка'.encode('utf-8')) == u'строка' + assert library.python.strings.to_unicode(u'строка'.encode('cp1251'), 'cp1251') == u'строка' + if six.PY2: # __str__ should return str not bytes in Python3 + assert library.python.strings.to_unicode(Convertible()) == Convertible.text + assert library.python.strings.to_unicode(ConvertibleToUnicodeOnly()) == Convertible.text + with pytest.raises(UnicodeDecodeError): + library.python.strings.to_unicode(ConvertibleToStrOnly()) + with pytest.raises(UnicodeDecodeError): + library.python.strings.to_unicode(NonConvertible()) + + +def test_to_unicode_errors_replace(): + assert library.python.strings.to_unicode(u'abcабв'.encode('utf-8'), 'ascii') + assert library.python.strings.to_unicode(u'абв'.encode('utf-8'), 'ascii') + + +def test_to_str(): + assert library.python.strings.to_str('str') == 'str' if six.PY2 else b'str' + assert library.python.strings.to_str(u'unicode') == 'unicode' if six.PY2 else b'unicode' + assert library.python.strings.to_str(u'юникод') == u'юникод'.encode('utf-8') + assert library.python.strings.to_str(u'юникод', 'cp1251') == u'юникод'.encode('cp1251') + if six.PY2: + assert library.python.strings.to_str(Convertible()) == Convertible.text_utf8 + with pytest.raises(UnicodeEncodeError): + library.python.strings.to_str(ConvertibleToUnicodeOnly()) + assert library.python.strings.to_str(ConvertibleToStrOnly()) == Convertible.text_utf8 + with pytest.raises(UnicodeEncodeError): + library.python.strings.to_str(NonConvertible()) + + +def test_to_str_errors_replace(): + assert library.python.strings.to_str(u'abcабв', 'ascii') + assert library.python.strings.to_str(u'абв', 'ascii') + + +def test_to_str_transcode(): + assert library.python.strings.to_str('str', from_enc='ascii') == 'str' if six.PY2 else b'str' + assert library.python.strings.to_str('str', from_enc='utf-8') == 'str' if six.PY2 else b'str' + + assert library.python.strings.to_str(u'юникод'.encode('utf-8'), from_enc='utf-8') == u'юникод'.encode('utf-8') + assert library.python.strings.to_str(u'юникод'.encode('utf-8'), to_enc='utf-8', from_enc='utf-8') == u'юникод'.encode('utf-8') + assert library.python.strings.to_str(u'юникод'.encode('utf-8'), to_enc='cp1251', from_enc='utf-8') == u'юникод'.encode('cp1251') + + assert library.python.strings.to_str(u'юникод'.encode('cp1251'), from_enc='cp1251') == u'юникод'.encode('utf-8') + assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='cp1251', from_enc='cp1251') == u'юникод'.encode('cp1251') + assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='utf-8', from_enc='cp1251') == u'юникод'.encode('utf-8') + + assert library.python.strings.to_str(u'юникод'.encode('koi8-r'), from_enc='koi8-r') == u'юникод'.encode('utf-8') + assert library.python.strings.to_str(u'юникод'.encode('koi8-r'), to_enc='koi8-r', from_enc='koi8-r') == u'юникод'.encode('koi8-r') + assert library.python.strings.to_str(u'юникод'.encode('koi8-r'), to_enc='cp1251', from_enc='koi8-r') == u'юникод'.encode('cp1251') + + +def test_to_str_transcode_wrong(): + assert library.python.strings.to_str(u'юникод'.encode('utf-8'), from_enc='cp1251') + assert library.python.strings.to_str(u'юникод'.encode('cp1251'), from_enc='utf-8') + + +def test_to_str_transcode_disabled(): + # No transcoding enabled, set from_enc to enable + assert library.python.strings.to_str(u'юникод'.encode('utf-8'), to_enc='utf-8') == u'юникод'.encode('utf-8') + assert library.python.strings.to_str(u'юникод'.encode('utf-8'), to_enc='cp1251') == u'юникод'.encode('utf-8') + assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='utf-8') == u'юникод'.encode('cp1251') + assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='cp1251') == u'юникод'.encode('cp1251') + assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='koi8-r') == u'юникод'.encode('cp1251') + assert library.python.strings.to_str(u'юникод'.encode('koi8-r'), to_enc='cp1251') == u'юникод'.encode('koi8-r') + + +def test_stringize_deep(): + assert library.python.strings.stringize_deep({ + 'key 1': 'value 1', + u'ключ 2': u'значение 2', + 'list': [u'ключ 2', 'key 1', (u'к', 2)] + }) == { + 'key 1' if six.PY2 else b'key 1': 'value 1' if six.PY2 else b'value 1', + u'ключ 2'.encode('utf-8'): u'значение 2'.encode('utf-8'), + 'list' if six.PY2 else b'list': [u'ключ 2'.encode('utf-8'), 'key 1' if six.PY2 else b'key 1', (u'к'.encode('utf-8'), 2)] + } + + +def test_stringize_deep_doesnt_transcode(): + assert library.python.strings.stringize_deep({ + u'ключ 1'.encode('utf-8'): u'значение 1'.encode('utf-8'), + u'ключ 2'.encode('cp1251'): u'значение 2'.encode('cp1251'), + }) == { + u'ключ 1'.encode('utf-8'): u'значение 1'.encode('utf-8'), + u'ключ 2'.encode('cp1251'): u'значение 2'.encode('cp1251'), + } + + +def test_stringize_deep_nested(): + assert library.python.strings.stringize_deep({ + 'key 1': 'value 1', + u'ключ 2': { + 'subkey 1': 'value 1', + u'подключ 2': u'value 2', + }, + }) == { + 'key 1' if six.PY2 else b'key 1': 'value 1' if six.PY2 else b'value 1', + u'ключ 2'.encode('utf-8'): { + 'subkey 1' if six.PY2 else b'subkey 1': 'value 1' if six.PY2 else b'value 1', + u'подключ 2'.encode('utf-8'): u'value 2'.encode('utf-8'), + }, + } + + +def test_stringize_deep_plain(): + assert library.python.strings.stringize_deep('str') == 'str' if six.PY2 else b'str' + assert library.python.strings.stringize_deep(u'юникод') == u'юникод'.encode('utf-8') + assert library.python.strings.stringize_deep(u'юникод'.encode('utf-8')) == u'юникод'.encode('utf-8') + + +def test_stringize_deep_nonstr(): + with pytest.raises(TypeError): + library.python.strings.stringize_deep(Convertible(), relaxed=False) + x = Convertible() + assert x == library.python.strings.stringize_deep(x) + + +def test_unicodize_deep(): + assert library.python.strings.unicodize_deep({ + 'key 1': 'value 1', + u'ключ 2': u'значение 2', + u'ключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'), + }) == { + u'key 1': u'value 1', + u'ключ 2': u'значение 2', + u'ключ 3': u'значение 3', + } + + +def test_unicodize_deep_nested(): + assert library.python.strings.unicodize_deep({ + 'key 1': 'value 1', + u'ключ 2': { + 'subkey 1': 'value 1', + u'подключ 2': u'значение 2', + u'подключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'), + }, + }) == { + u'key 1': u'value 1', + u'ключ 2': { + u'subkey 1': u'value 1', + u'подключ 2': u'значение 2', + u'подключ 3': u'значение 3', + }, + } + + +def test_unicodize_deep_plain(): + assert library.python.strings.unicodize_deep('str') == u'str' + assert library.python.strings.unicodize_deep(u'юникод') == u'юникод' + assert library.python.strings.unicodize_deep(u'юникод'.encode('utf-8')) == u'юникод' + + +def test_unicodize_deep_nonstr(): + with pytest.raises(TypeError): + library.python.strings.unicodize_deep(Convertible(), relaxed=False) + x = Convertible() + assert x == library.python.strings.stringize_deep(x) diff --git a/library/python/strings/ut/ya.make b/library/python/strings/ut/ya.make new file mode 100644 index 00000000000..dfacb226c76 --- /dev/null +++ b/library/python/strings/ut/ya.make @@ -0,0 +1,11 @@ +OWNER(g:yatool) + +PY23_TEST() + +TEST_SRCS(test_strings.py) + +PEERDIR( + library/python/strings +) + +END() diff --git a/library/python/strings/ya.make b/library/python/strings/ya.make new file mode 100644 index 00000000000..7e0b033717c --- /dev/null +++ b/library/python/strings/ya.make @@ -0,0 +1,16 @@ +OWNER(g:yatool) + +PY23_LIBRARY() + +PY_SRCS( + __init__.py + CYTHONIZE_PY + strings.py +) + +PEERDIR( + library/python/func + contrib/python/six +) + +END() |
