diff options
author | prettyboy <prettyboy@yandex-team.com> | 2022-12-27 22:16:38 +0300 |
---|---|---|
committer | prettyboy <prettyboy@yandex-team.com> | 2022-12-27 22:16:38 +0300 |
commit | ec1f04b090c6fe1f12ee28b92c680006da5c58f1 (patch) | |
tree | e2853db3d76e725867da74ec6a941c3abe96a439 | |
parent | d51beecd387a655ea5c2031f7a8b3e5bde77f22b (diff) | |
download | ydb-ec1f04b090c6fe1f12ee28b92c680006da5c58f1.tar.gz |
[devtools/ya/yalibrary/formatter] Moved string method to library/python/strings
-rw-r--r-- | library/python/strings/__init__.py | 3 | ||||
-rw-r--r-- | library/python/strings/strings.py | 37 | ||||
-rw-r--r-- | library/python/strings/ut/test_strings.py | 274 |
3 files changed, 314 insertions, 0 deletions
class Whence(object):
    """Selects which part of the data :func:`truncate` cuts away."""

    Start = 0  # cut the beginning, keep the tail
    End = 1  # cut the ending, keep the head
    Middle = 2  # cut the middle, keep both head and tail


def truncate(data, limit, whence=None, msg=None):
    """Shorten ``data`` so that its encoded form fits into ``limit`` bytes.

    Both ``data`` and ``msg`` are converted to bytes with ``six.ensure_binary``
    before measuring, so ``limit`` counts bytes, not characters.  A multi-byte
    character that ends up split by the cut is dropped completely (see
    :func:`fix_utf8`), therefore the result may be shorter than ``limit``.

    :param data: text or bytes to shorten.
    :param limit: maximum size of the result in bytes, marker included.
    :param whence: one of the ``Whence`` constants; ``None`` means ``Whence.End``.
    :param msg: marker inserted in place of the removed part; ``None`` means "...".
    :return: native ``str``; ``data`` itself if it already fits into ``limit``.
    :raises AssertionError: if ``msg`` alone is longer than ``limit`` or
        ``whence`` is not a known ``Whence`` value.
    """
    msg = six.ensure_binary("..." if msg is None else msg)
    whence = Whence.End if whence is None else whence
    data = six.ensure_binary(data)

    if len(data) <= limit:
        return six.ensure_str(data)

    text_limit = limit - len(msg)
    # Raised explicitly (not via `assert`) so the check is not stripped by `python -O`.
    if text_limit < 0:
        raise AssertionError(
            "limit (%d) is smaller than the length of msg (%d)" % (limit, len(msg))
        )

    if whence == Whence.Start:
        # An explicit offset is required here: with text_limit == 0 the slice
        # data[-0:] would return the whole input instead of an empty tail.
        data = msg + data[len(data) - text_limit:]
    elif whence == Whence.End:
        data = data[:text_limit] + msg
    elif whence == Whence.Middle:
        # headpos bytes are kept from the head, the rest of the budget from the tail.
        headpos = limit // 2 - len(msg) // 2
        tailpos = len(data) - (text_limit - headpos)
        data = data[:headpos] + msg + data[tailpos:]
    else:
        raise AssertionError("Unknown whence: %s" % str(whence))
    return fix_utf8(data)


def fix_utf8(data):
    # type: (typing.Union[six.text_type, six.binary_type]) -> str
    """Return ``data`` as a native ``str`` with broken UTF-8 sequences removed.

    Byte sequences that cannot be decoded as UTF-8 (for example a multi-byte
    character cut in half by :func:`truncate`) are silently dropped.
    """
    udata = six.ensure_text(data, 'utf-8', 'ignore')
    return six.ensure_str(udata, 'utf-8', errors='ignore')
# coding=utf-8
"""Unit tests for library.python.strings."""

import pytest
import six

from library.python import strings


class Convertible(object):
    """Object convertible both to unicode and to a UTF-8 byte string."""

    text = u'текст'
    text_utf8 = text.encode('utf-8')

    def __unicode__(self):
        return self.text

    def __str__(self):
        return self.text_utf8


class ConvertibleToUnicodeOnly(Convertible):
    """``__str__`` fails: the text cannot be encoded as ASCII."""

    def __str__(self):
        return self.text.encode('ascii')


class ConvertibleToStrOnly(Convertible):
    """``__unicode__`` fails: the UTF-8 bytes cannot be decoded as ASCII."""

    def __unicode__(self):
        return self.text_utf8.decode('ascii')


class NonConvertible(ConvertibleToUnicodeOnly, ConvertibleToStrOnly):
    """Both ``__str__`` and ``__unicode__`` fail."""


def test_to_basestring():
    assert strings.to_basestring('str') == 'str'
    assert strings.to_basestring(u'юникод') == u'юникод'
    if six.PY2:  # __str__ should return str not bytes in Python3
        assert strings.to_basestring(Convertible()) == Convertible.text
        assert strings.to_basestring(ConvertibleToUnicodeOnly()) == Convertible.text
        assert strings.to_basestring(ConvertibleToStrOnly()) == Convertible.text_utf8
        assert strings.to_basestring(NonConvertible())


def test_to_unicode():
    assert strings.to_unicode(u'юникод') == u'юникод'
    assert strings.to_unicode('str') == u'str'
    assert strings.to_unicode(u'строка'.encode('utf-8')) == u'строка'
    assert strings.to_unicode(u'строка'.encode('cp1251'), 'cp1251') == u'строка'
    if six.PY2:  # __str__ should return str not bytes in Python3
        assert strings.to_unicode(Convertible()) == Convertible.text
        assert strings.to_unicode(ConvertibleToUnicodeOnly()) == Convertible.text
        with pytest.raises(UnicodeDecodeError):
            strings.to_unicode(ConvertibleToStrOnly())
        with pytest.raises(UnicodeDecodeError):
            strings.to_unicode(NonConvertible())


def test_to_unicode_errors_replace():
    assert strings.to_unicode(u'abcабв'.encode('utf-8'), 'ascii')
    assert strings.to_unicode(u'абв'.encode('utf-8'), 'ascii')


def test_to_str():
    # The expected value is parenthesized: without the parentheses the whole
    # comparison becomes the "if" branch of the conditional expression and the
    # assert degenerates to `assert b'str'` on Python 3.
    assert strings.to_str('str') == ('str' if six.PY2 else b'str')
    assert strings.to_str(u'unicode') == ('unicode' if six.PY2 else b'unicode')
    assert strings.to_str(u'юникод') == u'юникод'.encode('utf-8')
    assert strings.to_str(u'юникод', 'cp1251') == u'юникод'.encode('cp1251')
    if six.PY2:
        assert strings.to_str(Convertible()) == Convertible.text_utf8
        with pytest.raises(UnicodeEncodeError):
            strings.to_str(ConvertibleToUnicodeOnly())
        assert strings.to_str(ConvertibleToStrOnly()) == Convertible.text_utf8
        with pytest.raises(UnicodeEncodeError):
            strings.to_str(NonConvertible())


def test_to_str_errors_replace():
    assert strings.to_str(u'abcабв', 'ascii')
    assert strings.to_str(u'абв', 'ascii')


def test_to_str_transcode():
    # Parenthesized for the same precedence reason as in test_to_str.
    assert strings.to_str('str', from_enc='ascii') == ('str' if six.PY2 else b'str')
    assert strings.to_str('str', from_enc='utf-8') == ('str' if six.PY2 else b'str')

    assert strings.to_str(u'юникод'.encode('utf-8'), from_enc='utf-8') == u'юникод'.encode('utf-8')
    assert strings.to_str(u'юникод'.encode('utf-8'), to_enc='utf-8', from_enc='utf-8') == u'юникод'.encode('utf-8')
    assert strings.to_str(u'юникод'.encode('utf-8'), to_enc='cp1251', from_enc='utf-8') == u'юникод'.encode('cp1251')

    assert strings.to_str(u'юникод'.encode('cp1251'), from_enc='cp1251') == u'юникод'.encode('utf-8')
    assert strings.to_str(u'юникод'.encode('cp1251'), to_enc='cp1251', from_enc='cp1251') == u'юникод'.encode('cp1251')
    assert strings.to_str(u'юникод'.encode('cp1251'), to_enc='utf-8', from_enc='cp1251') == u'юникод'.encode('utf-8')

    assert strings.to_str(u'юникод'.encode('koi8-r'), from_enc='koi8-r') == u'юникод'.encode('utf-8')
    assert strings.to_str(u'юникод'.encode('koi8-r'), to_enc='koi8-r', from_enc='koi8-r') == u'юникод'.encode('koi8-r')
    assert strings.to_str(u'юникод'.encode('koi8-r'), to_enc='cp1251', from_enc='koi8-r') == u'юникод'.encode('cp1251')


def test_to_str_transcode_wrong():
    assert strings.to_str(u'юникод'.encode('utf-8'), from_enc='cp1251')
    assert strings.to_str(u'юникод'.encode('cp1251'), from_enc='utf-8')


def test_to_str_transcode_disabled():
    # No transcoding enabled, set from_enc to enable
    assert strings.to_str(u'юникод'.encode('utf-8'), to_enc='utf-8') == u'юникод'.encode('utf-8')
    assert strings.to_str(u'юникод'.encode('utf-8'), to_enc='cp1251') == u'юникод'.encode('utf-8')
    assert strings.to_str(u'юникод'.encode('cp1251'), to_enc='utf-8') == u'юникод'.encode('cp1251')
    assert strings.to_str(u'юникод'.encode('cp1251'), to_enc='cp1251') == u'юникод'.encode('cp1251')
    assert strings.to_str(u'юникод'.encode('cp1251'), to_enc='koi8-r') == u'юникод'.encode('cp1251')
    assert strings.to_str(u'юникод'.encode('koi8-r'), to_enc='cp1251') == u'юникод'.encode('koi8-r')


def test_stringize_deep():
    assert strings.stringize_deep(
        {
            'key 1': 'value 1',
            u'ключ 2': u'значение 2',
            'list': [u'ключ 2', 'key 1', (u'к', 2)],
        }
    ) == {
        'key 1' if six.PY2 else b'key 1': 'value 1' if six.PY2 else b'value 1',
        u'ключ 2'.encode('utf-8'): u'значение 2'.encode('utf-8'),
        ('list' if six.PY2 else b'list'): [
            u'ключ 2'.encode('utf-8'),
            'key 1' if six.PY2 else b'key 1',
            (u'к'.encode('utf-8'), 2),
        ],
    }


def test_stringize_deep_doesnt_transcode():
    assert strings.stringize_deep(
        {
            u'ключ 1'.encode('utf-8'): u'значение 1'.encode('utf-8'),
            u'ключ 2'.encode('cp1251'): u'значение 2'.encode('cp1251'),
        }
    ) == {
        u'ключ 1'.encode('utf-8'): u'значение 1'.encode('utf-8'),
        u'ключ 2'.encode('cp1251'): u'значение 2'.encode('cp1251'),
    }


def test_stringize_deep_nested():
    assert strings.stringize_deep(
        {
            'key 1': 'value 1',
            u'ключ 2': {
                'subkey 1': 'value 1',
                u'подключ 2': u'value 2',
            },
        }
    ) == {
        'key 1' if six.PY2 else b'key 1': 'value 1' if six.PY2 else b'value 1',
        u'ключ 2'.encode('utf-8'): {
            ('subkey 1' if six.PY2 else b'subkey 1'): 'value 1' if six.PY2 else b'value 1',
            u'подключ 2'.encode('utf-8'): u'value 2'.encode('utf-8'),
        },
    }


def test_stringize_deep_plain():
    # Parenthesized for the same precedence reason as in test_to_str.
    assert strings.stringize_deep('str') == ('str' if six.PY2 else b'str')
    assert strings.stringize_deep(u'юникод') == u'юникод'.encode('utf-8')
    assert strings.stringize_deep(u'юникод'.encode('utf-8')) == u'юникод'.encode('utf-8')


def test_stringize_deep_nonstr():
    with pytest.raises(TypeError):
        strings.stringize_deep(Convertible(), relaxed=False)
    x = Convertible()
    assert x == strings.stringize_deep(x)


def test_unicodize_deep():
    assert strings.unicodize_deep(
        {
            'key 1': 'value 1',
            u'ключ 2': u'значение 2',
            u'ключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'),
        }
    ) == {
        u'key 1': u'value 1',
        u'ключ 2': u'значение 2',
        u'ключ 3': u'значение 3',
    }


def test_unicodize_deep_nested():
    assert strings.unicodize_deep(
        {
            'key 1': 'value 1',
            u'ключ 2': {
                'subkey 1': 'value 1',
                u'подключ 2': u'значение 2',
                u'подключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'),
            },
        }
    ) == {
        u'key 1': u'value 1',
        u'ключ 2': {
            u'subkey 1': u'value 1',
            u'подключ 2': u'значение 2',
            u'подключ 3': u'значение 3',
        },
    }


def test_unicodize_deep_plain():
    assert strings.unicodize_deep('str') == u'str'
    assert strings.unicodize_deep(u'юникод') == u'юникод'
    assert strings.unicodize_deep(u'юникод'.encode('utf-8')) == u'юникод'


def test_unicodize_deep_nonstr():
    with pytest.raises(TypeError):
        strings.unicodize_deep(Convertible(), relaxed=False)
    x = Convertible()
    # Exercise unicodize_deep here; the relaxed stringize_deep path is already
    # covered by test_stringize_deep_nonstr.
    assert x == strings.unicodize_deep(x)


# Each entry: (data, limit, whence, msg, expected)
truncate_utf_8_data = [
    ("hello", 5, None, None, "hello"),
    ("hello", 6, None, None, "hello"),
    ("hello", 4, None, None, "h..."),
    ("hello", 4, None, "", "hell"),
    ("hello", 4, None, ".", "hel."),
    ("hello", 4, strings.Whence.End, ".", "hel."),
    ("hello", 5, strings.Whence.Start, None, "hello"),
    ("hello", 4, strings.Whence.Start, None, "...o"),
    ("hello", 4, strings.Whence.Start, ".", ".llo"),
    ("yoloha", 5, strings.Whence.Middle, None, "y...a"),
    ("hello", 5, strings.Whence.Middle, None, "hello"),
    ("hello", 4, strings.Whence.Middle, None, "h..."),
    ("hello", 4, strings.Whence.Middle, ".", "he.o"),
    # destroyed symbol code must be removed
    ("меледа", 4, None, None, "..."),
    ("меледа", 5, None, None, "м..."),
    ("меледа", 7, None, None, "ме..."),
    ("меледа", 12, None, None, "меледа"),
    ("меледа", 4, None, ".", "м."),
    ("меледа", 5, None, "ак", "ак"),
    ("меледа", 6, None, "ак", "мак"),
    ("меледа", 4, strings.Whence.Start, None, "..."),
    ("меледа", 5, strings.Whence.Start, None, "...а"),
    ("меледа", 12, strings.Whence.Start, None, "меледа"),
    ("меледа", 9, strings.Whence.Start, ".", ".леда"),
    ("меледа", 10, strings.Whence.Start, ".", ".леда"),
    # half code from symbol 'м' plus half from symbol 'а' - nothing in the end
    ("меледа", 5, strings.Whence.Middle, None, "..."),
    ("меледа", 6, strings.Whence.Middle, None, "м..."),
    ("меледа", 7, strings.Whence.Middle, None, "м...а"),
    ("меледа", 12, strings.Whence.Middle, None, "меледа"),
    ("меледа", 8, strings.Whence.Middle, ".", "ме.а"),
    ("меледа", 9, strings.Whence.Middle, ".", "ме.да"),
    ("меледа", 10, strings.Whence.Middle, ".", "ме.да"),
    ("меледа", 11, strings.Whence.Middle, ".", "ме.да"),
    (u"меледа", 6, strings.Whence.Middle, None, "м..."),
    (u"меледа", 12, strings.Whence.Middle, None, "меледа"),
    (u"меледа", 8, strings.Whence.Middle, ".", "ме.а"),
]


@pytest.mark.parametrize("data, limit, whence, msg, expected", truncate_utf_8_data)
def test_truncate_utf_8_text(data, limit, whence, msg, expected):
    assert strings.truncate(data, limit, whence, msg) == expected


def test_truncate_utf_8_text_wrong_limit():
    with pytest.raises(AssertionError):
        strings.truncate("hell", 2)

    with pytest.raises(AssertionError):
        strings.truncate("hello", 4, msg="long msg")