intermediate changes

ref:cde9a383711a11544ce7e107a78147fb96cc4029
author: Devtools Arcadia <[email protected]> 2022-02-07 18:08:42 +0300
committer: Devtools Arcadia <[email protected]> 2022-02-07 18:08:42 +0300
commit: 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree: e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/python/strings
5 files changed, 378 insertions, 0 deletions
diff --git a/library/python/strings/__init__.py b/library/python/strings/__init__.py
new file mode 100644
index 00000000000..bd6bf6e7cee
--- /dev/null
+++ b/library/python/strings/__init__.py
@@ -0,0 +1,17 @@
+# flake8 noqa: F401
+
+from .strings import (
+    DEFAULT_ENCODING,
+    ENCODING_ERRORS_POLICY,
+    encode,
+    fs_encoding,
+    get_stream_encoding,
+    guess_default_encoding,
+    left_strip,
+    locale_encoding,
+    stringize_deep,
+    to_basestring,
+    to_str,
+    to_unicode,
+    unicodize_deep,
+)
diff --git a/library/python/strings/strings.py b/library/python/strings/strings.py
new file mode 100644
index 00000000000..5bfddfe78ae
--- /dev/null
+++ b/library/python/strings/strings.py
@@ -0,0 +1,129 @@
+import locale
+import logging
+import six
+import sys
+import codecs
+
+import library.python.func
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_ENCODING = 'utf-8'
+ENCODING_ERRORS_POLICY = 'replace'
+
+
+def left_strip(el, prefix):
+    """
+    Return el with prefix removed from its start; el is returned unchanged if it does not start with prefix
+    """
+    if el.startswith(prefix):
+        return el[len(prefix):]
+    return el
+
+
+# Explicit to-text conversion: bytes/text values pass through unchanged, anything else is stringified
+# Chooses between str/unicode, i.e. six.binary_type/six.text_type
+def to_basestring(value):
+    if isinstance(value, (six.binary_type, six.text_type)):
+        return value
+    try:
+        if six.PY2:
+            return unicode(value)  # Python 2: try the unicode form first
+        else:
+            return str(value)
+    except UnicodeDecodeError:
+        try:
+            return str(value)  # unicode conversion could not decode; fall back to str()
+        except UnicodeEncodeError:
+            return repr(value)  # last resort when str() cannot encode either
+to_text = to_basestring  # alias of to_basestring
+
+
+def to_unicode(value, from_enc=DEFAULT_ENCODING):  # convert value to six.text_type
+    if isinstance(value, six.text_type):
+        return value  # already text, returned as is
+    if isinstance(value, six.binary_type):
+        if six.PY2:
+            return unicode(value, from_enc, ENCODING_ERRORS_POLICY)  # decode from from_enc using the module error policy
+        else:
+            return value.decode(from_enc, errors=ENCODING_ERRORS_POLICY)
+    return six.text_type(value)  # neither text nor bytes: rely on the object's own text conversion
+
+
+# Convert value to six.binary_type. Optional from_enc enables transcoding of byte input into to_enc
+def to_str(value, to_enc=DEFAULT_ENCODING, from_enc=None):
+    if isinstance(value, six.binary_type):
+        if from_enc is None or to_enc == from_enc:
+            # Unknown input encoding or input and output encoding are the same
+            return value
+        value = to_unicode(value, from_enc=from_enc)  # decode first; re-encoded into to_enc just below
+    if isinstance(value, six.text_type):
+        return value.encode(to_enc, ENCODING_ERRORS_POLICY)
+    return six.binary_type(value)  # NOTE(review): on Python 3 this is bytes(value), e.g. bytes(3) == b'\x00\x00\x00' - confirm intended
+
+
+def _convert_deep(x, enc, convert, relaxed=True):  # apply convert(s, enc) to strings nested in dict/list/tuple
+    if x is None:
+        return None
+    if isinstance(x, (six.text_type, six.binary_type)):
+        return convert(x, enc)
+    if isinstance(x, dict):
+        return {convert(k, enc): _convert_deep(v, enc, convert, relaxed) for k, v in six.iteritems(x)}  # keys go through convert directly
+    if isinstance(x, list):
+        return [_convert_deep(e, enc, convert, relaxed) for e in x]
+    if isinstance(x, tuple):
+        return tuple([_convert_deep(e, enc, convert, relaxed) for e in x])
+
+    if relaxed:
+        return x  # relaxed mode: any other type is passed through untouched
+    raise TypeError('unsupported type')  # strict mode: any other type is rejected
+
+
+def unicodize_deep(x, enc=DEFAULT_ENCODING, relaxed=True):  # deep conversion of nested strings via to_unicode(s, enc)
+    return _convert_deep(x, enc, to_unicode, relaxed)
+
+
+def stringize_deep(x, enc=DEFAULT_ENCODING, relaxed=True):  # deep conversion of nested strings via to_str(s, enc)
+    return _convert_deep(x, enc, to_str, relaxed)
+
+
+@library.python.func.memoize()
+def locale_encoding():  # encoding name of the default locale, or None when it is unset or cannot be used
+    try:
+        loc = locale.getdefaultlocale()[1]
+        if loc:
+            codecs.lookup(loc)  # raises LookupError when Python has no codec under this name
+        return loc
+    except LookupError as e:
+        logger.debug('Cannot get system locale: %s', e)
+        return None
+    except ValueError as e:
+        logger.warning('Cannot get system locale: %s', e)  # Logger.warn is a deprecated alias of Logger.warning
+        return None
+
+
+def fs_encoding():  # thin wrapper returning sys.getfilesystemencoding()
+    return sys.getfilesystemencoding()
+
+
+def guess_default_encoding():  # locale encoding when available, DEFAULT_ENCODING otherwise
+    enc = locale_encoding()
+    return enc if enc else DEFAULT_ENCODING
+
+
+@library.python.func.memoize()
+def get_stream_encoding(stream):  # stream.encoding when Python knows that codec, DEFAULT_ENCODING otherwise
+    if stream.encoding:
+        try:
+            codecs.lookup(stream.encoding)  # validates the name; raises LookupError for unknown codecs
+            return stream.encoding
+        except LookupError:
+            pass  # unknown codec name: fall through to the default below
+    return DEFAULT_ENCODING
+
+
+def encode(value, encoding=DEFAULT_ENCODING):  # return value encoded with the given encoding
+    if isinstance(value, six.binary_type):
+        value = value.decode(encoding, errors='ignore')  # bytes are round-tripped; undecodable sequences are dropped
+    return value.encode(encoding)
diff --git a/library/python/strings/ut/test_strings.py b/library/python/strings/ut/test_strings.py
new file mode 100644
index 00000000000..dd0c694ee1d
--- /dev/null
+++ b/library/python/strings/ut/test_strings.py
@@ -0,0 +1,205 @@
+# coding=utf-8
+
+import pytest
+import six
+
+import library.python.strings
+
+
+class Convertible(object):  # test fixture: both __unicode__ and __str__ succeed
+    text = u'текст'
+    text_utf8 = text.encode('utf-8')
+
+    def __unicode__(self):
+        return self.text
+
+    def __str__(self):
+        return self.text_utf8  # bytes; only a valid __str__ result on Python 2
+
+
+class ConvertibleToUnicodeOnly(Convertible):  # test fixture whose __str__ always fails
+    def __str__(self):
+        return self.text.encode('ascii')  # text is non-ASCII, so this raises UnicodeEncodeError
+
+
+class ConvertibleToStrOnly(Convertible):  # test fixture whose __unicode__ always fails
+    def __unicode__(self):
+        return self.text_utf8.decode('ascii')  # bytes are non-ASCII, so this raises UnicodeDecodeError
+
+
+class NonConvertible(ConvertibleToUnicodeOnly, ConvertibleToStrOnly):  # inherits the failing __str__ and the failing __unicode__
+    pass
+
+
+def test_to_basestring():  # strings pass through; objects are converted (object cases are Python 2 only)
+    assert library.python.strings.to_basestring('str') == 'str'
+    assert library.python.strings.to_basestring(u'юникод') == u'юникод'
+    if six.PY2:  # __str__ should return str not bytes in Python3
+        assert library.python.strings.to_basestring(Convertible()) == Convertible.text
+        assert library.python.strings.to_basestring(ConvertibleToUnicodeOnly()) == Convertible.text
+        assert library.python.strings.to_basestring(ConvertibleToStrOnly()) == Convertible.text_utf8
+        assert library.python.strings.to_basestring(NonConvertible())  # only checks that a truthy fallback is returned
+
+
+def test_to_unicode():  # text passes through, bytes are decoded (utf-8 by default or from the given encoding)
+    assert library.python.strings.to_unicode(u'юникод') == u'юникод'
+    assert library.python.strings.to_unicode('str') == u'str'
+    assert library.python.strings.to_unicode(u'строка'.encode('utf-8')) == u'строка'
+    assert library.python.strings.to_unicode(u'строка'.encode('cp1251'), 'cp1251') == u'строка'
+    if six.PY2:  # __str__ should return str not bytes in Python3
+        assert library.python.strings.to_unicode(Convertible()) == Convertible.text
+        assert library.python.strings.to_unicode(ConvertibleToUnicodeOnly()) == Convertible.text
+        with pytest.raises(UnicodeDecodeError):  # a failing __unicode__ is propagated, not swallowed
+            library.python.strings.to_unicode(ConvertibleToStrOnly())
+        with pytest.raises(UnicodeDecodeError):
+            library.python.strings.to_unicode(NonConvertible())
+
+
+def test_to_unicode_errors_replace():  # undecodable bytes must not raise; only a truthy result is checked
+    assert library.python.strings.to_unicode(u'abcабв'.encode('utf-8'), 'ascii')
+    assert library.python.strings.to_unicode(u'абв'.encode('utf-8'), 'ascii')
+
+
+def test_to_str():  # to_str returns bytes: text is encoded with to_enc (utf-8 by default)
+    assert library.python.strings.to_str('str') == ('str' if six.PY2 else b'str')  # parenthesised: unparenthesised form is vacuous on Python 3
+    assert library.python.strings.to_str(u'unicode') == ('unicode' if six.PY2 else b'unicode')
+    assert library.python.strings.to_str(u'юникод') == u'юникод'.encode('utf-8')
+    assert library.python.strings.to_str(u'юникод', 'cp1251') == u'юникод'.encode('cp1251')
+    if six.PY2:  # the fixtures' __str__ returns bytes, which is only valid on Python 2
+        assert library.python.strings.to_str(Convertible()) == Convertible.text_utf8
+        with pytest.raises(UnicodeEncodeError):  # a failing __str__ is propagated, not swallowed
+            library.python.strings.to_str(ConvertibleToUnicodeOnly())
+        assert library.python.strings.to_str(ConvertibleToStrOnly()) == Convertible.text_utf8
+        with pytest.raises(UnicodeEncodeError):
+            library.python.strings.to_str(NonConvertible())
+
+
+def test_to_str_errors_replace():  # unencodable characters must not raise; only a truthy result is checked
+    assert library.python.strings.to_str(u'abcабв', 'ascii')
+    assert library.python.strings.to_str(u'абв', 'ascii')
+
+
+def test_to_str_transcode():  # with from_enc set, byte input is transcoded from from_enc into to_enc
+    assert library.python.strings.to_str('str', from_enc='ascii') == ('str' if six.PY2 else b'str')  # parenthesised: see precedence of `if/else`
+    assert library.python.strings.to_str('str', from_enc='utf-8') == ('str' if six.PY2 else b'str')
+
+    assert library.python.strings.to_str(u'юникод'.encode('utf-8'), from_enc='utf-8') == u'юникод'.encode('utf-8')
+    assert library.python.strings.to_str(u'юникод'.encode('utf-8'), to_enc='utf-8', from_enc='utf-8') == u'юникод'.encode('utf-8')
+    assert library.python.strings.to_str(u'юникод'.encode('utf-8'), to_enc='cp1251', from_enc='utf-8') == u'юникод'.encode('cp1251')
+
+    assert library.python.strings.to_str(u'юникод'.encode('cp1251'), from_enc='cp1251') == u'юникод'.encode('utf-8')
+    assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='cp1251', from_enc='cp1251') == u'юникод'.encode('cp1251')
+    assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='utf-8', from_enc='cp1251') == u'юникод'.encode('utf-8')
+
+    assert library.python.strings.to_str(u'юникод'.encode('koi8-r'), from_enc='koi8-r') == u'юникод'.encode('utf-8')
+    assert library.python.strings.to_str(u'юникод'.encode('koi8-r'), to_enc='koi8-r', from_enc='koi8-r') == u'юникод'.encode('koi8-r')
+    assert library.python.strings.to_str(u'юникод'.encode('koi8-r'), to_enc='cp1251', from_enc='koi8-r') == u'юникод'.encode('cp1251')
+
+
+def test_to_str_transcode_wrong():  # a from_enc that does not match the data must not raise; only truthiness is checked
+    assert library.python.strings.to_str(u'юникод'.encode('utf-8'), from_enc='cp1251')
+    assert library.python.strings.to_str(u'юникод'.encode('cp1251'), from_enc='utf-8')
+
+
+def test_to_str_transcode_disabled():
+    # Without from_enc no transcoding happens: byte input comes back unchanged whatever to_enc is
+    assert library.python.strings.to_str(u'юникод'.encode('utf-8'), to_enc='utf-8') == u'юникод'.encode('utf-8')
+    assert library.python.strings.to_str(u'юникод'.encode('utf-8'), to_enc='cp1251') == u'юникод'.encode('utf-8')
+    assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='utf-8') == u'юникод'.encode('cp1251')
+    assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='cp1251') == u'юникод'.encode('cp1251')
+    assert library.python.strings.to_str(u'юникод'.encode('cp1251'), to_enc='koi8-r') == u'юникод'.encode('cp1251')
+    assert library.python.strings.to_str(u'юникод'.encode('koi8-r'), to_enc='cp1251') == u'юникод'.encode('koi8-r')
+
+
+def test_stringize_deep():  # dict keys, values and nested list/tuple items become bytes; non-strings (2) are kept
+    assert library.python.strings.stringize_deep({
+        'key 1': 'value 1',
+        u'ключ 2': u'значение 2',
+        'list': [u'ключ 2', 'key 1', (u'к', 2)]
+    }) == {
+        'key 1' if six.PY2 else b'key 1': 'value 1' if six.PY2 else b'value 1',
+        u'ключ 2'.encode('utf-8'): u'значение 2'.encode('utf-8'),
+        'list' if six.PY2 else b'list': [u'ключ 2'.encode('utf-8'), 'key 1' if six.PY2 else b'key 1', (u'к'.encode('utf-8'), 2)]
+    }
+
+
+def test_stringize_deep_doesnt_transcode():  # byte strings are returned unchanged regardless of their encoding
+    assert library.python.strings.stringize_deep({
+        u'ключ 1'.encode('utf-8'): u'значение 1'.encode('utf-8'),
+        u'ключ 2'.encode('cp1251'): u'значение 2'.encode('cp1251'),
+    }) == {
+        u'ключ 1'.encode('utf-8'): u'значение 1'.encode('utf-8'),
+        u'ключ 2'.encode('cp1251'): u'значение 2'.encode('cp1251'),
+    }
+
+
+def test_stringize_deep_nested():  # conversion recurses into dict values that are themselves dicts
+    assert library.python.strings.stringize_deep({
+        'key 1': 'value 1',
+        u'ключ 2': {
+            'subkey 1': 'value 1',
+            u'подключ 2': u'value 2',
+        },
+    }) == {
+        'key 1' if six.PY2 else b'key 1': 'value 1' if six.PY2 else b'value 1',
+        u'ключ 2'.encode('utf-8'): {
+            'subkey 1' if six.PY2 else b'subkey 1': 'value 1' if six.PY2 else b'value 1',
+            u'подключ 2'.encode('utf-8'): u'value 2'.encode('utf-8'),
+        },
+    }
+
+
+def test_stringize_deep_plain():  # a bare string (not a container) is converted directly
+    assert library.python.strings.stringize_deep('str') == ('str' if six.PY2 else b'str')  # parenthesised: unparenthesised form is vacuous on Python 3
+    assert library.python.strings.stringize_deep(u'юникод') == u'юникод'.encode('utf-8')
+    assert library.python.strings.stringize_deep(u'юникод'.encode('utf-8')) == u'юникод'.encode('utf-8')
+
+
+def test_stringize_deep_nonstr():  # unsupported objects: TypeError in strict mode, returned as is in relaxed mode
+    with pytest.raises(TypeError):
+        library.python.strings.stringize_deep(Convertible(), relaxed=False)
+    x = Convertible()
+    assert x == library.python.strings.stringize_deep(x)
+
+
+def test_unicodize_deep():  # dict keys and values become text; utf-8 bytes are decoded
+    assert library.python.strings.unicodize_deep({
+        'key 1': 'value 1',
+        u'ключ 2': u'значение 2',
+        u'ключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'),
+    }) == {
+        u'key 1': u'value 1',
+        u'ключ 2': u'значение 2',
+        u'ключ 3': u'значение 3',
+    }
+
+
+def test_unicodize_deep_nested():  # conversion recurses into dict values that are themselves dicts
+    assert library.python.strings.unicodize_deep({
+        'key 1': 'value 1',
+        u'ключ 2': {
+            'subkey 1': 'value 1',
+            u'подключ 2': u'значение 2',
+            u'подключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'),
+        },
+    }) == {
+        u'key 1': u'value 1',
+        u'ключ 2': {
+            u'subkey 1': u'value 1',
+            u'подключ 2': u'значение 2',
+            u'подключ 3': u'значение 3',
+        },
+    }
+
+
+def test_unicodize_deep_plain():  # a bare string (not a container) is converted directly
+    assert library.python.strings.unicodize_deep('str') == u'str'
+    assert library.python.strings.unicodize_deep(u'юникод') == u'юникод'
+    assert library.python.strings.unicodize_deep(u'юникод'.encode('utf-8')) == u'юникод'
+
+
+def test_unicodize_deep_nonstr():  # unsupported objects: TypeError in strict mode, returned as is in relaxed mode
+    with pytest.raises(TypeError):
+        library.python.strings.unicodize_deep(Convertible(), relaxed=False)
+    x = Convertible()
+    assert x == library.python.strings.unicodize_deep(x)  # exercise unicodize_deep here, not stringize_deep
diff --git a/library/python/strings/ut/ya.make b/library/python/strings/ut/ya.make
new file mode 100644
index 00000000000..dfacb226c76
--- /dev/null
+++ b/library/python/strings/ut/ya.make
@@ -0,0 +1,11 @@
+OWNER(g:yatool)
+
+PY23_TEST()
+
+TEST_SRCS(test_strings.py)
+
+PEERDIR(
+    library/python/strings
+)
+
+END()
diff --git a/library/python/strings/ya.make b/library/python/strings/ya.make
new file mode 100644
index 00000000000..7e0b033717c
--- /dev/null
+++ b/library/python/strings/ya.make
@@ -0,0 +1,16 @@
+OWNER(g:yatool)
+
+PY23_LIBRARY()
+
+PY_SRCS(
+    __init__.py
+    CYTHONIZE_PY
+    strings.py
+)
+
+PEERDIR(
+    library/python/func
+    contrib/python/six
+)
+
+END()
author	Devtools Arcadia <[email protected]>	2022-02-07 18:08:42 +0300
committer	Devtools Arcadia <[email protected]>	2022-02-07 18:08:42 +0300
commit	1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree	e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/python/strings