[devtools/ya/yalibrary/formatter] Moved string method to library/python/strings

author: prettyboy <prettyboy@yandex-team.com> 2022-12-27 22:16:38 +0300
committer: prettyboy <prettyboy@yandex-team.com> 2022-12-27 22:16:38 +0300
commit: ec1f04b090c6fe1f12ee28b92c680006da5c58f1 (patch)
tree: e2853db3d76e725867da74ec6a941c3abe96a439
parent: d51beecd387a655ea5c2031f7a8b3e5bde77f22b (diff)
download: ydb-ec1f04b090c6fe1f12ee28b92c680006da5c58f1.tar.gz
3 files changed, 314 insertions, 0 deletions
diff --git a/library/python/strings/__init__.py b/library/python/strings/__init__.py
index c7da1463cf..ae27ddbdae 100644
--- a/library/python/strings/__init__.py
+++ b/library/python/strings/__init__.py
@@ -3,8 +3,10 @@
 from .strings import (
     DEFAULT_ENCODING,
     ENCODING_ERRORS_POLICY,
+    Whence,
     encode,
     ensure_str_deep,
+    fix_utf8,
     fs_encoding,
     get_stream_encoding,
     guess_default_encoding,
@@ -14,5 +16,6 @@ from .strings import (
     to_basestring,
     to_str,
     to_unicode,
+    truncate,
     unicodize_deep,
 )
diff --git a/library/python/strings/strings.py b/library/python/strings/strings.py
index 916ae96742..07b7fc194f 100644
--- a/library/python/strings/strings.py
+++ b/library/python/strings/strings.py
@@ -136,3 +136,40 @@ def encode(value, encoding=DEFAULT_ENCODING):
     if isinstance(value, six.binary_type):
         value = value.decode(encoding, errors='ignore')
     return value.encode(encoding)
+
+
+class Whence(object):
+    """Selects which part of the data truncate() cuts away."""
+
+    # Cut the head: the marker is followed by the tail of the data.
+    Start = 0
+    # Cut the tail: the head of the data is followed by the marker (truncate() default).
+    End = 1
+    # Cut the middle: the marker sits between the head and the tail of the data.
+    Middle = 2
+
+
+def truncate(data, limit, whence=None, msg=None):
+    """
+    Shorten ``data`` so that its UTF-8 encoding takes at most ``limit`` bytes.
+
+    :param data: text or bytes to shorten
+    :param limit: maximum size of the result in bytes, marker included
+    :param whence: ``Whence.Start`` cuts the head, ``Whence.End`` (default) cuts the tail,
+        ``Whence.Middle`` cuts the middle part
+    :param msg: marker placed at the cut, ``"..."`` by default
+    :return: native ``str``; data that already fits is returned without a marker
+    :raises AssertionError: if the data has to be cut and the marker alone is longer
+        than ``limit``, or if ``whence`` is not one of the ``Whence`` values
+    """
+    msg = six.ensure_binary("..." if msg is None else msg)
+    whence = Whence.End if whence is None else whence
+    data = six.ensure_binary(data)
+
+    if len(data) <= limit:
+        return six.ensure_str(data)
+    text_limit = limit - len(msg)
+    # Raised explicitly instead of using ``assert`` so that the check survives ``python -O``.
+    if text_limit < 0:
+        raise AssertionError("Limit %s is too small for a marker of %s bytes" % (limit, len(msg)))
+
+    if whence == Whence.Start:
+        # data[-text_limit:] would return the whole string when text_limit == 0,
+        # so the tail is taken from an explicit, always positive offset.
+        data = msg + data[len(data) - text_limit:]
+    elif whence == Whence.End:
+        data = data[:text_limit] + msg
+    elif whence == Whence.Middle:
+        headpos = limit // 2 - len(msg) // 2
+        tailpos = len(data) - (text_limit - headpos)
+        data = data[:headpos] + msg + data[tailpos:]
+    else:
+        raise AssertionError("Unknown whence: %s" % str(whence))
+    # Cutting by bytes may split a multi-byte character; drop such leftovers.
+    return fix_utf8(data)
+
+
+def fix_utf8(data):
+    # type: (six.string_types) -> str
+    """Return ``data`` as a native ``str`` with undecodable UTF-8 byte sequences dropped."""
+    # Both conversions ignore errors, so the remains of a split character simply vanish.
+    return six.ensure_str(six.ensure_text(data, 'utf-8', 'ignore'), 'utf-8', errors='ignore')
diff --git a/library/python/strings/ut/test_strings.py b/library/python/strings/ut/test_strings.py
new file mode 100644
index 0000000000..6177c10b25
--- /dev/null
+++ b/library/python/strings/ut/test_strings.py
@@ -0,0 +1,274 @@
+# coding=utf-8
+
+import pytest
+import six
+
+from library.python import strings
+
+
+class Convertible(object):
+    """Test helper: __unicode__ yields the text, __str__ yields its UTF-8 encoded bytes."""
+
+    text = u'текст'
+    text_utf8 = text.encode('utf-8')
+
+    def __unicode__(self):
+        return self.text
+
+    # Returns encoded bytes; the tests below use this class on Python 2 only.
+    def __str__(self):
+        return self.text_utf8
+
+
+class ConvertibleToUnicodeOnly(Convertible):
+    """Test helper whose __str__ fails: the Cyrillic text cannot be encoded as ASCII."""
+
+    def __str__(self):
+        return self.text.encode('ascii')
+
+
+class ConvertibleToStrOnly(Convertible):
+    """Test helper whose __unicode__ fails: the UTF-8 bytes cannot be decoded as ASCII."""
+
+    def __unicode__(self):
+        return self.text_utf8.decode('ascii')
+
+
+class NonConvertible(ConvertibleToUnicodeOnly, ConvertibleToStrOnly):
+    """Test helper that inherits the failing __str__ and the failing __unicode__ of its bases."""
+
+    pass
+
+
+def test_to_basestring():
+    """to_basestring() returns plain strings unchanged and converts helper objects on Python 2."""
+    for plain in ('str', u'юникод'):
+        assert strings.to_basestring(plain) == plain
+    if not six.PY2:
+        # __str__ should return str not bytes in Python3
+        return
+    unicode_text, utf8_bytes = Convertible.text, Convertible.text_utf8
+    assert strings.to_basestring(Convertible()) == unicode_text
+    assert strings.to_basestring(ConvertibleToUnicodeOnly()) == unicode_text
+    assert strings.to_basestring(ConvertibleToStrOnly()) == utf8_bytes
+    assert strings.to_basestring(NonConvertible())
+
+
+def test_to_unicode():
+    """to_unicode() decodes bytes with the given encoding and leaves unicode input alone."""
+    word = u'строка'
+    assert strings.to_unicode(u'юникод') == u'юникод'
+    assert strings.to_unicode('str') == u'str'
+    assert strings.to_unicode(word.encode('utf-8')) == word
+    assert strings.to_unicode(word.encode('cp1251'), 'cp1251') == word
+    if not six.PY2:
+        # __str__ should return str not bytes in Python3
+        return
+    for convertible in (Convertible, ConvertibleToUnicodeOnly):
+        assert strings.to_unicode(convertible()) == Convertible.text
+    for broken in (ConvertibleToStrOnly, NonConvertible):
+        with pytest.raises(UnicodeDecodeError):
+            strings.to_unicode(broken())
+
+
+def test_to_unicode_errors_replace():
+    """Decoding non-ASCII bytes as ASCII must not raise and must give a non-empty result."""
+    for sample in (u'abcабв', u'абв'):
+        assert strings.to_unicode(sample.encode('utf-8'), 'ascii')
+
+
+def test_to_str():
+    """to_str() encodes text to bytes and converts helper objects on Python 2."""
+    # The conditional is parenthesized on purpose: `assert a == b if c else d` parses as
+    # `assert (a == b) if c else d`, which on Python 3 would only assert the literal `d`.
+    assert strings.to_str('str') == ('str' if six.PY2 else b'str')
+    assert strings.to_str(u'unicode') == ('unicode' if six.PY2 else b'unicode')
+    assert strings.to_str(u'юникод') == u'юникод'.encode('utf-8')
+    assert strings.to_str(u'юникод', 'cp1251') == u'юникод'.encode('cp1251')
+    if six.PY2:
+        assert strings.to_str(Convertible()) == Convertible.text_utf8
+        with pytest.raises(UnicodeEncodeError):
+            strings.to_str(ConvertibleToUnicodeOnly())
+        assert strings.to_str(ConvertibleToStrOnly()) == Convertible.text_utf8
+        with pytest.raises(UnicodeEncodeError):
+            strings.to_str(NonConvertible())
+
+
+def test_to_str_errors_replace():
+    """Encoding non-ASCII text as ASCII must not raise and must give a non-empty result."""
+    for sample in (u'abcабв', u'абв'):
+        assert strings.to_str(sample, 'ascii')
+
+
+def test_to_str_transcode():
+    """Passing from_enc makes to_str() transcode byte input into to_enc (UTF-8 when omitted)."""
+    # The conditional is parenthesized on purpose: `assert a == b if c else d` parses as
+    # `assert (a == b) if c else d`, which on Python 3 would only assert the literal `d`.
+    assert strings.to_str('str', from_enc='ascii') == ('str' if six.PY2 else b'str')
+    assert strings.to_str('str', from_enc='utf-8') == ('str' if six.PY2 else b'str')
+
+    assert strings.to_str(u'юникод'.encode('utf-8'), from_enc='utf-8') == u'юникод'.encode('utf-8')
+    assert strings.to_str(u'юникод'.encode('utf-8'), to_enc='utf-8', from_enc='utf-8') == u'юникод'.encode('utf-8')
+    assert strings.to_str(u'юникод'.encode('utf-8'), to_enc='cp1251', from_enc='utf-8') == u'юникод'.encode('cp1251')
+
+    assert strings.to_str(u'юникод'.encode('cp1251'), from_enc='cp1251') == u'юникод'.encode('utf-8')
+    assert strings.to_str(u'юникод'.encode('cp1251'), to_enc='cp1251', from_enc='cp1251') == u'юникод'.encode('cp1251')
+    assert strings.to_str(u'юникод'.encode('cp1251'), to_enc='utf-8', from_enc='cp1251') == u'юникод'.encode('utf-8')
+
+    assert strings.to_str(u'юникод'.encode('koi8-r'), from_enc='koi8-r') == u'юникод'.encode('utf-8')
+    assert strings.to_str(u'юникод'.encode('koi8-r'), to_enc='koi8-r', from_enc='koi8-r') == u'юникод'.encode('koi8-r')
+    assert strings.to_str(u'юникод'.encode('koi8-r'), to_enc='cp1251', from_enc='koi8-r') == u'юникод'.encode('cp1251')
+
+
+def test_to_str_transcode_wrong():
+    """A from_enc that does not match the real encoding must not raise or give an empty result."""
+    word = u'юникод'
+    for actual_enc, claimed_enc in (('utf-8', 'cp1251'), ('cp1251', 'utf-8')):
+        assert strings.to_str(word.encode(actual_enc), from_enc=claimed_enc)
+
+
+def test_to_str_transcode_disabled():
+    """Without from_enc, byte input comes back untouched whatever to_enc says."""
+    # No transcoding enabled, set from_enc to enable
+    word = u'юникод'
+    source_and_target = (
+        ('utf-8', 'utf-8'),
+        ('utf-8', 'cp1251'),
+        ('cp1251', 'utf-8'),
+        ('cp1251', 'cp1251'),
+        ('cp1251', 'koi8-r'),
+        ('koi8-r', 'cp1251'),
+    )
+    for source_enc, target_enc in source_and_target:
+        assert strings.to_str(word.encode(source_enc), to_enc=target_enc) == word.encode(source_enc)
+
+
+def test_stringize_deep():
+    """stringize_deep() encodes keys, values and items of nested lists and tuples to bytes."""
+    key_1 = 'key 1' if six.PY2 else b'key 1'
+    value_1 = 'value 1' if six.PY2 else b'value 1'
+    list_key = 'list' if six.PY2 else b'list'
+    source = {
+        'key 1': 'value 1',
+        u'ключ 2': u'значение 2',
+        'list': [u'ключ 2', 'key 1', (u'к', 2)],
+    }
+    expected = {
+        key_1: value_1,
+        u'ключ 2'.encode('utf-8'): u'значение 2'.encode('utf-8'),
+        list_key: [
+            u'ключ 2'.encode('utf-8'),
+            key_1,
+            (u'к'.encode('utf-8'), 2),
+        ],
+    }
+    assert strings.stringize_deep(source) == expected
+
+
+def test_stringize_deep_doesnt_transcode():
+    """Byte keys and values pass through stringize_deep() unchanged, whatever their encoding."""
+
+    def build():
+        # A fresh dict per call, so the input and the expectation never share an object.
+        return {
+            u'ключ 1'.encode('utf-8'): u'значение 1'.encode('utf-8'),
+            u'ключ 2'.encode('cp1251'): u'значение 2'.encode('cp1251'),
+        }
+
+    assert strings.stringize_deep(build()) == build()
+
+
+def test_stringize_deep_nested():
+    """stringize_deep() descends into nested dicts."""
+    key_1 = 'key 1' if six.PY2 else b'key 1'
+    value_1 = 'value 1' if six.PY2 else b'value 1'
+    subkey_1 = 'subkey 1' if six.PY2 else b'subkey 1'
+    source = {
+        'key 1': 'value 1',
+        u'ключ 2': {
+            'subkey 1': 'value 1',
+            u'подключ 2': u'value 2',
+        },
+    }
+    expected = {
+        key_1: value_1,
+        u'ключ 2'.encode('utf-8'): {
+            subkey_1: value_1,
+            u'подключ 2'.encode('utf-8'): u'value 2'.encode('utf-8'),
+        },
+    }
+    assert strings.stringize_deep(source) == expected
+
+
+def test_stringize_deep_plain():
+    """stringize_deep() also accepts a bare string instead of a container."""
+    # The conditional is parenthesized on purpose: `assert a == b if c else d` parses as
+    # `assert (a == b) if c else d`, which on Python 3 would only assert the literal `d`.
+    assert strings.stringize_deep('str') == ('str' if six.PY2 else b'str')
+    assert strings.stringize_deep(u'юникод') == u'юникод'.encode('utf-8')
+    assert strings.stringize_deep(u'юникод'.encode('utf-8')) == u'юникод'.encode('utf-8')
+
+
+def test_stringize_deep_nonstr():
+    """A non-string object raises TypeError with relaxed=False and is returned as is by default."""
+    strict_call = dict(relaxed=False)
+    with pytest.raises(TypeError):
+        strings.stringize_deep(Convertible(), **strict_call)
+    obj = Convertible()
+    result = strings.stringize_deep(obj)
+    assert obj == result
+
+
+def test_unicodize_deep():
+    """unicodize_deep() turns every key and value of a dict into unicode."""
+    source = {
+        'key 1': 'value 1',
+        u'ключ 2': u'значение 2',
+        u'ключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'),
+    }
+    expected = {
+        u'key 1': u'value 1',
+        u'ключ 2': u'значение 2',
+        u'ключ 3': u'значение 3',
+    }
+    assert strings.unicodize_deep(source) == expected
+
+
+def test_unicodize_deep_nested():
+    """unicodize_deep() descends into nested dicts."""
+    inner_source = {
+        'subkey 1': 'value 1',
+        u'подключ 2': u'значение 2',
+        u'подключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'),
+    }
+    inner_expected = {
+        u'subkey 1': u'value 1',
+        u'подключ 2': u'значение 2',
+        u'подключ 3': u'значение 3',
+    }
+    source = {'key 1': 'value 1', u'ключ 2': inner_source}
+    expected = {u'key 1': u'value 1', u'ключ 2': inner_expected}
+    assert strings.unicodize_deep(source) == expected
+
+
+def test_unicodize_deep_plain():
+    """unicodize_deep() also accepts a bare string instead of a container."""
+    cases = (
+        ('str', u'str'),
+        (u'юникод', u'юникод'),
+        (u'юникод'.encode('utf-8'), u'юникод'),
+    )
+    for value, expected in cases:
+        assert strings.unicodize_deep(value) == expected
+
+
+def test_unicodize_deep_nonstr():
+    """A non-string object raises TypeError with relaxed=False and is returned as is by default."""
+    with pytest.raises(TypeError):
+        strings.unicodize_deep(Convertible(), relaxed=False)
+    x = Convertible()
+    # Exercise unicodize_deep here as well; calling stringize_deep was a copy-paste slip
+    # that left the relaxed path of unicodize_deep untested.
+    assert x == strings.unicodize_deep(x)
+
+
+# Rows of (data, limit, whence, msg, expected) fed to test_truncate_utf_8_text.
+# None for whence or msg exercises the defaults of strings.truncate().
+truncate_utf_8_data = [
+    ("hello", 5, None, None, "hello"),
+    ("hello", 6, None, None, "hello"),
+    ("hello", 4, None, None, "h..."),
+    ("hello", 4, None, "", "hell"),
+    ("hello", 4, None, ".", "hel."),
+    ("hello", 4, strings.Whence.End, ".", "hel."),
+    ("hello", 5, strings.Whence.Start, None, "hello"),
+    ("hello", 4, strings.Whence.Start, None, "...o"),
+    ("hello", 4, strings.Whence.Start, ".", ".llo"),
+    ("yoloha", 5, strings.Whence.Middle, None, "y...a"),
+    ("hello", 5, strings.Whence.Middle, None, "hello"),
+    ("hello", 4, strings.Whence.Middle, None, "h..."),
+    ("hello", 4, strings.Whence.Middle, ".", "he.o"),
+    # destroyed symbol code must be removed
+    ("меледа", 4, None, None, "..."),
+    ("меледа", 5, None, None, "м..."),
+    ("меледа", 7, None, None, "ме..."),
+    ("меледа", 12, None, None, "меледа"),
+    ("меледа", 4, None, ".", "м."),
+    ("меледа", 5, None, "ак", "ак"),
+    ("меледа", 6, None, "ак", "мак"),
+    ("меледа", 4, strings.Whence.Start, None, "..."),
+    ("меледа", 5, strings.Whence.Start, None, "...а"),
+    ("меледа", 12, strings.Whence.Start, None, "меледа"),
+    ("меледа", 9, strings.Whence.Start, ".", ".леда"),
+    ("меледа", 10, strings.Whence.Start, ".", ".леда"),
+    # half code from symbol 'м' plus half from symbol 'а' - nothing in the end
+    ("меледа", 5, strings.Whence.Middle, None, "..."),
+    ("меледа", 6, strings.Whence.Middle, None, "м..."),
+    ("меледа", 7, strings.Whence.Middle, None, "м...а"),
+    ("меледа", 12, strings.Whence.Middle, None, "меледа"),
+    ("меледа", 8, strings.Whence.Middle, ".", "ме.а"),
+    ("меледа", 9, strings.Whence.Middle, ".", "ме.да"),
+    ("меледа", 10, strings.Whence.Middle, ".", "ме.да"),
+    ("меледа", 11, strings.Whence.Middle, ".", "ме.да"),
+    (u"меледа", 6, strings.Whence.Middle, None, "м..."),
+    (u"меледа", 12, strings.Whence.Middle, None, "меледа"),
+    (u"меледа", 8, strings.Whence.Middle, ".", "ме.а"),
+]
+
+
+@pytest.mark.parametrize("data, limit, whence, msg, expected", truncate_utf_8_data)
+def test_truncate_utf_8_text(data, limit, whence, msg, expected):
+    """truncate() gives the expected string for every row of truncate_utf_8_data."""
+    # The argument is spelled `whence` (not `Whence`) so it cannot be mistaken for the
+    # strings.Whence class and matches the parameter name of strings.truncate().
+    assert strings.truncate(data, limit, whence, msg) == expected
+
+
+def test_truncate_utf_8_text_wrong_limit():
+    """truncate() raises AssertionError when the marker does not fit into the limit."""
+    too_small = (
+        (("hell", 2), {}),
+        (("hello", 4), {"msg": "long msg"}),
+    )
+    for args, kwargs in too_small:
+        with pytest.raises(AssertionError):
+            strings.truncate(*args, **kwargs)
author	prettyboy <prettyboy@yandex-team.com>	2022-12-27 22:16:38 +0300
committer	prettyboy <prettyboy@yandex-team.com>	2022-12-27 22:16:38 +0300
commit	ec1f04b090c6fe1f12ee28b92c680006da5c58f1 (patch)
tree	e2853db3d76e725867da74ec6a941c3abe96a439
parent	d51beecd387a655ea5c2031f7a8b3e5bde77f22b (diff)
download	ydb-ec1f04b090c6fe1f12ee28b92c680006da5c58f1.tar.gz