summaryrefslogtreecommitdiffstats
path: root/library/python/strings
diff options
context:
space:
mode:
authorDevtools Arcadia <[email protected]>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <[email protected]>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/python/strings
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/python/strings')
-rw-r--r--library/python/strings/__init__.py17
-rw-r--r--library/python/strings/strings.py129
-rw-r--r--library/python/strings/ut/test_strings.py205
-rw-r--r--library/python/strings/ut/ya.make11
-rw-r--r--library/python/strings/ya.make16
5 files changed, 378 insertions, 0 deletions
diff --git a/library/python/strings/__init__.py b/library/python/strings/__init__.py
new file mode 100644
index 00000000000..bd6bf6e7cee
--- /dev/null
+++ b/library/python/strings/__init__.py
@@ -0,0 +1,17 @@
+# flake8 noqa: F401
+
+from .strings import (
+ DEFAULT_ENCODING,
+ ENCODING_ERRORS_POLICY,
+ encode,
+ fs_encoding,
+ get_stream_encoding,
+ guess_default_encoding,
+ left_strip,
+ locale_encoding,
+ stringize_deep,
+ to_basestring,
+ to_str,
+ to_unicode,
+ unicodize_deep,
+)
diff --git a/library/python/strings/strings.py b/library/python/strings/strings.py
new file mode 100644
index 00000000000..5bfddfe78ae
--- /dev/null
+++ b/library/python/strings/strings.py
@@ -0,0 +1,129 @@
+import locale
+import logging
+import six
+import sys
+import codecs
+
+import library.python.func
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_ENCODING = 'utf-8'
+ENCODING_ERRORS_POLICY = 'replace'
+
+
def left_strip(el, prefix):
    """
    Return el with a leading prefix removed.

    When el does not start with prefix it is returned unchanged.
    """
    if not el.startswith(prefix):
        return el
    return el[len(prefix):]
+
+
# Explicit to-text conversion
# Chooses between str/unicode, i.e. six.binary_type/six.text_type
def to_basestring(value):
    """
    Convert value to a string without forcing a particular string type.

    Byte strings and text strings are returned untouched. Anything else is
    converted to text; if that fails with UnicodeDecodeError, str() is tried,
    and if str() fails with UnicodeEncodeError, repr() is used.
    """
    string_types = (six.binary_type, six.text_type)
    if isinstance(value, string_types):
        return value
    try:
        # six.text_type is unicode on Python 2 and str on Python 3
        return six.text_type(value)
    except UnicodeDecodeError:
        pass
    try:
        return str(value)
    except UnicodeEncodeError:
        return repr(value)


to_text = to_basestring
+
+
def to_unicode(value, from_enc=DEFAULT_ENCODING):
    """
    Convert value to a text string.

    :param value: text is returned as is, byte strings are decoded from
        from_enc using ENCODING_ERRORS_POLICY, any other object is passed
        to six.text_type().
    :param from_enc: encoding used to decode byte strings.
    """
    if isinstance(value, six.text_type):
        return value
    if not isinstance(value, six.binary_type):
        return six.text_type(value)
    if six.PY2:
        return unicode(value, from_enc, ENCODING_ERRORS_POLICY)  # noqa: F821
    return value.decode(from_enc, errors=ENCODING_ERRORS_POLICY)
+
+
# Optional from_enc enables transcoding
def to_str(value, to_enc=DEFAULT_ENCODING, from_enc=None):
    """
    Convert value to a byte string (six.binary_type).

    :param value: object to convert.
        * Byte strings are returned as is unless from_enc is given and differs
          from to_enc; in that case they are decoded from from_enc and
          re-encoded to to_enc.
        * Text strings are encoded to to_enc using ENCODING_ERRORS_POLICY.
        * Any other object is stringified and, on Python 3, encoded to to_enc.
    :param to_enc: target encoding.
    :param from_enc: encoding of incoming byte strings; None disables
        transcoding.
    """
    if isinstance(value, six.binary_type):
        if from_enc is None or to_enc == from_enc:
            # Unknown input encoding or input and output encoding are the same
            return value
        value = to_unicode(value, from_enc=from_enc)
    if isinstance(value, six.text_type):
        return value.encode(to_enc, ENCODING_ERRORS_POLICY)
    if six.PY2:
        return str(value)
    # On Python 3 bytes(obj) is not a stringification: bytes(5) yields five NUL
    # bytes and most objects raise TypeError. Keep bytes() only for objects
    # that really are binary data, otherwise go through text like Python 2.
    if isinstance(value, (bytearray, memoryview)) or hasattr(value, '__bytes__'):
        return bytes(value)
    return six.text_type(value).encode(to_enc, ENCODING_ERRORS_POLICY)
+
+
def _convert_deep(x, enc, convert, relaxed=True):
    """
    Apply convert(string, enc) to every string found inside x.

    Dicts, lists and tuples are rebuilt recursively (as plain dict, list and
    tuple); None is returned as is. Any other object is returned untouched when
    relaxed is true and triggers TypeError otherwise.

    NOTE: dict keys are handed to convert() directly, whatever their type.
    """
    if x is None:
        return None
    if isinstance(x, (six.text_type, six.binary_type)):
        return convert(x, enc)
    if isinstance(x, dict):
        converted = {}
        for key, item in six.iteritems(x):
            converted[convert(key, enc)] = _convert_deep(item, enc, convert, relaxed)
        return converted
    if isinstance(x, list):
        return [_convert_deep(item, enc, convert, relaxed) for item in x]
    if isinstance(x, tuple):
        return tuple([_convert_deep(item, enc, convert, relaxed) for item in x])

    if not relaxed:
        raise TypeError('unsupported type')
    return x
+
+
def unicodize_deep(x, enc=DEFAULT_ENCODING, relaxed=True):
    """
    Recursively convert all strings inside x to text using to_unicode.

    enc is the encoding passed to to_unicode; relaxed is forwarded to
    _convert_deep and controls whether unsupported objects raise TypeError.
    """
    return _convert_deep(x, enc, convert=to_unicode, relaxed=relaxed)
+
+
def stringize_deep(x, enc=DEFAULT_ENCODING, relaxed=True):
    """
    Recursively convert all strings inside x to byte strings using to_str.

    enc is the target encoding passed to to_str; relaxed is forwarded to
    _convert_deep and controls whether unsupported objects raise TypeError.
    """
    return _convert_deep(x, enc, convert=to_str, relaxed=relaxed)
+
+
def locale_encoding():
    """
    Return the encoding name of the default locale, or None.

    None is returned when the locale reports no encoding, when the reported
    encoding is not known to the codecs registry (LookupError), or when
    querying the locale raises ValueError.
    """
    try:
        loc = locale.getdefaultlocale()[1]
        if not loc:
            return None
        # Only hand out encodings that Python can actually look up
        codecs.lookup(loc)
        return loc
    except LookupError as e:
        logger.debug('Cannot get system locale: %s', e)
    except ValueError as e:
        # Logger.warn is a deprecated alias; warning() is the supported name
        logger.warning('Cannot get system locale: %s', e)
    return None
+
+
def fs_encoding():
    """
    Return the encoding reported by sys.getfilesystemencoding().
    """
    encoding = sys.getfilesystemencoding()
    return encoding
+
+
def guess_default_encoding():
    """
    Return the locale encoding, or DEFAULT_ENCODING when locale_encoding()
    yields nothing.
    """
    return locale_encoding() or DEFAULT_ENCODING
+
+
def get_stream_encoding(stream):
    """
    Return the encoding declared by stream, or DEFAULT_ENCODING.

    DEFAULT_ENCODING is used when the stream has no ``encoding`` attribute,
    when the attribute is empty, or when it names a codec unknown to Python.

    :param stream: any object; a file-like object with an optional
        ``encoding`` attribute is expected.
    """
    # Binary and custom streams may not define ``encoding`` at all
    encoding = getattr(stream, 'encoding', None)
    if encoding:
        try:
            codecs.lookup(encoding)
        except LookupError:
            pass
        else:
            return encoding
    return DEFAULT_ENCODING
+
+
def encode(value, encoding=DEFAULT_ENCODING):
    """
    Return value encoded to bytes in the given encoding.

    Byte strings are first decoded with the same encoding, ignoring
    undecodable bytes, and then encoded back; text is encoded directly.
    """
    text = value
    if isinstance(text, six.binary_type):
        text = text.decode(encoding, errors='ignore')
    return text.encode(encoding)
diff --git a/library/python/strings/ut/test_strings.py b/library/python/strings/ut/test_strings.py
new file mode 100644
index 00000000000..dd0c694ee1d
--- /dev/null
+++ b/library/python/strings/ut/test_strings.py
@@ -0,0 +1,205 @@
+# coding=utf-8
+
+import pytest
+import six
+
+import library.python.strings
+
+
class Convertible(object):
    """Test fixture whose text and byte representations are both available."""

    # Non-ASCII sample text and its UTF-8 encoded form
    text = u'текст'
    text_utf8 = text.encode('utf-8')

    def __str__(self):
        return self.text_utf8

    def __unicode__(self):
        return self.text
+
+
class ConvertibleToUnicodeOnly(Convertible):
    """Fixture whose __str__ fails: the non-ASCII text cannot be encoded as ASCII."""

    def __str__(self):
        text = self.text
        return text.encode('ascii')
+
+
class ConvertibleToStrOnly(Convertible):
    """Fixture whose __unicode__ fails: the UTF-8 bytes cannot be decoded as ASCII."""

    def __unicode__(self):
        raw = self.text_utf8
        return raw.decode('ascii')
+
+
class NonConvertible(ConvertibleToUnicodeOnly, ConvertibleToStrOnly):
    """Fixture combining the failing __str__ and the failing __unicode__ of its bases."""
+
+
def test_to_basestring():
    """to_basestring keeps strings as is and picks a working conversion for objects."""
    to_basestring = library.python.strings.to_basestring

    assert to_basestring('str') == 'str'
    assert to_basestring(u'юникод') == u'юникод'
    if not six.PY2:
        # __str__ should return str not bytes in Python3
        return
    assert to_basestring(Convertible()) == Convertible.text
    assert to_basestring(ConvertibleToUnicodeOnly()) == Convertible.text
    assert to_basestring(ConvertibleToStrOnly()) == Convertible.text_utf8
    assert to_basestring(NonConvertible())
+
+
def test_to_unicode():
    """to_unicode decodes byte strings and converts objects through __unicode__."""
    to_unicode = library.python.strings.to_unicode

    assert to_unicode(u'юникод') == u'юникод'
    assert to_unicode('str') == u'str'
    assert to_unicode(u'строка'.encode('utf-8')) == u'строка'
    assert to_unicode(u'строка'.encode('cp1251'), 'cp1251') == u'строка'
    if not six.PY2:
        # __str__ should return str not bytes in Python3
        return
    assert to_unicode(Convertible()) == Convertible.text
    assert to_unicode(ConvertibleToUnicodeOnly()) == Convertible.text
    with pytest.raises(UnicodeDecodeError):
        to_unicode(ConvertibleToStrOnly())
    with pytest.raises(UnicodeDecodeError):
        to_unicode(NonConvertible())
+
+
def test_to_unicode_errors_replace():
    """Bytes undecodable in the requested encoding must not raise and yield a non-empty result."""
    to_unicode = library.python.strings.to_unicode

    mixed = u'abcабв'.encode('utf-8')
    cyrillic = u'абв'.encode('utf-8')
    assert to_unicode(mixed, 'ascii')
    assert to_unicode(cyrillic, 'ascii')
+
+
def test_to_str():
    """to_str returns byte strings for text input and, on Python 2, for objects."""
    to_str = library.python.strings.to_str

    # Compare against bytes literals: they equal plain 'str' literals on Python 2
    # and are the expected type on Python 3. The former
    # `assert a == 'str' if six.PY2 else b'str'` form asserted only the
    # truthy literal on Python 3 because of operator precedence.
    assert to_str('str') == b'str'
    assert to_str(u'unicode') == b'unicode'
    assert to_str(u'юникод') == u'юникод'.encode('utf-8')
    assert to_str(u'юникод', 'cp1251') == u'юникод'.encode('cp1251')
    if six.PY2:
        assert to_str(Convertible()) == Convertible.text_utf8
        with pytest.raises(UnicodeEncodeError):
            to_str(ConvertibleToUnicodeOnly())
        assert to_str(ConvertibleToStrOnly()) == Convertible.text_utf8
        with pytest.raises(UnicodeEncodeError):
            to_str(NonConvertible())
+
+
def test_to_str_errors_replace():
    """Text unencodable in the target encoding must not raise and yield a non-empty result."""
    to_str = library.python.strings.to_str

    for text in (u'abcабв', u'абв'):
        assert to_str(text, 'ascii')
+
+
def test_to_str_transcode():
    """Passing from_enc makes to_str transcode byte strings into to_enc."""
    to_str = library.python.strings.to_str

    # Bytes literals compare correctly on both Python versions; the previous
    # `== 'str' if six.PY2 else b'str'` form asserted nothing on Python 3.
    assert to_str('str', from_enc='ascii') == b'str'
    assert to_str('str', from_enc='utf-8') == b'str'

    assert to_str(u'юникод'.encode('utf-8'), from_enc='utf-8') == u'юникод'.encode('utf-8')
    assert to_str(u'юникод'.encode('utf-8'), to_enc='utf-8', from_enc='utf-8') == u'юникод'.encode('utf-8')
    assert to_str(u'юникод'.encode('utf-8'), to_enc='cp1251', from_enc='utf-8') == u'юникод'.encode('cp1251')

    assert to_str(u'юникод'.encode('cp1251'), from_enc='cp1251') == u'юникод'.encode('utf-8')
    assert to_str(u'юникод'.encode('cp1251'), to_enc='cp1251', from_enc='cp1251') == u'юникод'.encode('cp1251')
    assert to_str(u'юникод'.encode('cp1251'), to_enc='utf-8', from_enc='cp1251') == u'юникод'.encode('utf-8')

    assert to_str(u'юникод'.encode('koi8-r'), from_enc='koi8-r') == u'юникод'.encode('utf-8')
    assert to_str(u'юникод'.encode('koi8-r'), to_enc='koi8-r', from_enc='koi8-r') == u'юникод'.encode('koi8-r')
    assert to_str(u'юникод'.encode('koi8-r'), to_enc='cp1251', from_enc='koi8-r') == u'юникод'.encode('cp1251')
+
+
def test_to_str_transcode_wrong():
    """A wrong from_enc must not raise and must still produce a non-empty result."""
    to_str = library.python.strings.to_str

    utf8_bytes = u'юникод'.encode('utf-8')
    cp1251_bytes = u'юникод'.encode('cp1251')
    assert to_str(utf8_bytes, from_enc='cp1251')
    assert to_str(cp1251_bytes, from_enc='utf-8')
+
+
def test_to_str_transcode_disabled():
    """Without from_enc byte strings pass through to_str unchanged, whatever to_enc is."""
    to_str = library.python.strings.to_str

    # No transcoding enabled, set from_enc to enable
    utf8_bytes = u'юникод'.encode('utf-8')
    cp1251_bytes = u'юникод'.encode('cp1251')
    koi8r_bytes = u'юникод'.encode('koi8-r')

    assert to_str(utf8_bytes, to_enc='utf-8') == utf8_bytes
    assert to_str(utf8_bytes, to_enc='cp1251') == utf8_bytes
    assert to_str(cp1251_bytes, to_enc='utf-8') == cp1251_bytes
    assert to_str(cp1251_bytes, to_enc='cp1251') == cp1251_bytes
    assert to_str(cp1251_bytes, to_enc='koi8-r') == cp1251_bytes
    assert to_str(koi8r_bytes, to_enc='cp1251') == koi8r_bytes
+
+
def test_stringize_deep():
    """stringize_deep encodes keys, values and nested list/tuple items."""
    source = {
        'key 1': 'value 1',
        u'ключ 2': u'значение 2',
        'list': [u'ключ 2', 'key 1', (u'к', 2)],
    }
    expected = {
        'key 1' if six.PY2 else b'key 1': 'value 1' if six.PY2 else b'value 1',
        u'ключ 2'.encode('utf-8'): u'значение 2'.encode('utf-8'),
        'list' if six.PY2 else b'list': [
            u'ключ 2'.encode('utf-8'),
            'key 1' if six.PY2 else b'key 1',
            (u'к'.encode('utf-8'), 2),
        ],
    }
    assert library.python.strings.stringize_deep(source) == expected
+
+
def test_stringize_deep_doesnt_transcode():
    """Byte strings in any encoding pass through stringize_deep untouched."""
    source = {
        u'ключ 1'.encode('utf-8'): u'значение 1'.encode('utf-8'),
        u'ключ 2'.encode('cp1251'): u'значение 2'.encode('cp1251'),
    }
    expected = {
        u'ключ 1'.encode('utf-8'): u'значение 1'.encode('utf-8'),
        u'ключ 2'.encode('cp1251'): u'значение 2'.encode('cp1251'),
    }
    assert library.python.strings.stringize_deep(source) == expected
+
+
def test_stringize_deep_nested():
    """stringize_deep descends into nested dictionaries."""
    source = {
        'key 1': 'value 1',
        u'ключ 2': {
            'subkey 1': 'value 1',
            u'подключ 2': u'value 2',
        },
    }
    expected = {
        'key 1' if six.PY2 else b'key 1': 'value 1' if six.PY2 else b'value 1',
        u'ключ 2'.encode('utf-8'): {
            'subkey 1' if six.PY2 else b'subkey 1': 'value 1' if six.PY2 else b'value 1',
            u'подключ 2'.encode('utf-8'): u'value 2'.encode('utf-8'),
        },
    }
    assert library.python.strings.stringize_deep(source) == expected
+
+
def test_stringize_deep_plain():
    """stringize_deep accepts a bare string as well as containers."""
    stringize_deep = library.python.strings.stringize_deep

    # b'str' equals 'str' on Python 2 and is the expected type on Python 3; the
    # previous `== 'str' if six.PY2 else b'str'` form asserted nothing on Python 3.
    assert stringize_deep('str') == b'str'
    assert stringize_deep(u'юникод') == u'юникод'.encode('utf-8')
    assert stringize_deep(u'юникод'.encode('utf-8')) == u'юникод'.encode('utf-8')
+
+
def test_stringize_deep_nonstr():
    """Unsupported objects raise in strict mode and are returned as is in relaxed mode."""
    stringize_deep = library.python.strings.stringize_deep

    with pytest.raises(TypeError):
        stringize_deep(Convertible(), relaxed=False)
    obj = Convertible()
    result = stringize_deep(obj)
    assert obj == result
+
+
def test_unicodize_deep():
    """unicodize_deep decodes both keys and values of a flat dictionary."""
    source = {
        'key 1': 'value 1',
        u'ключ 2': u'значение 2',
        u'ключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'),
    }
    expected = {
        u'key 1': u'value 1',
        u'ключ 2': u'значение 2',
        u'ключ 3': u'значение 3',
    }
    assert library.python.strings.unicodize_deep(source) == expected
+
+
def test_unicodize_deep_nested():
    """unicodize_deep descends into nested dictionaries."""
    source = {
        'key 1': 'value 1',
        u'ключ 2': {
            'subkey 1': 'value 1',
            u'подключ 2': u'значение 2',
            u'подключ 3'.encode('utf-8'): u'значение 3'.encode('utf-8'),
        },
    }
    expected = {
        u'key 1': u'value 1',
        u'ключ 2': {
            u'subkey 1': u'value 1',
            u'подключ 2': u'значение 2',
            u'подключ 3': u'значение 3',
        },
    }
    assert library.python.strings.unicodize_deep(source) == expected
+
+
def test_unicodize_deep_plain():
    """unicodize_deep accepts a bare string as well as containers."""
    unicodize_deep = library.python.strings.unicodize_deep

    assert unicodize_deep('str') == u'str'
    assert unicodize_deep(u'юникод') == u'юникод'
    assert unicodize_deep(u'юникод'.encode('utf-8')) == u'юникод'
+
+
def test_unicodize_deep_nonstr():
    """Unsupported objects raise in strict mode and are returned as is in relaxed mode."""
    unicodize_deep = library.python.strings.unicodize_deep

    with pytest.raises(TypeError):
        unicodize_deep(Convertible(), relaxed=False)
    x = Convertible()
    # The relaxed check must exercise unicodize_deep; it previously called
    # stringize_deep, so this code path was never covered.
    assert x == unicodize_deep(x)
diff --git a/library/python/strings/ut/ya.make b/library/python/strings/ut/ya.make
new file mode 100644
index 00000000000..dfacb226c76
--- /dev/null
+++ b/library/python/strings/ut/ya.make
@@ -0,0 +1,11 @@
+OWNER(g:yatool)
+
+PY23_TEST()
+
+TEST_SRCS(test_strings.py)
+
+PEERDIR(
+ library/python/strings
+)
+
+END()
diff --git a/library/python/strings/ya.make b/library/python/strings/ya.make
new file mode 100644
index 00000000000..7e0b033717c
--- /dev/null
+++ b/library/python/strings/ya.make
@@ -0,0 +1,16 @@
+OWNER(g:yatool)
+
+PY23_LIBRARY()
+
+PY_SRCS(
+ __init__.py
+ CYTHONIZE_PY
+ strings.py
+)
+
+PEERDIR(
+ library/python/func
+ contrib/python/six
+)
+
+END()