author     shmel1k <shmel1k@ydb.tech>  2023-11-26 18:16:14 +0300
committer  shmel1k <shmel1k@ydb.tech>  2023-11-26 18:43:30 +0300
commit     b8cf9e88f4c5c64d9406af533d8948deb050d695
tree       218eb61fb3c3b96ec08b4d8cdfef383104a87d63  /contrib/python/chardet/py2
parent     523f645a83a0ec97a0332dbc3863bb354c92a328
download   ydb-b8cf9e88f4c5c64d9406af533d8948deb050d695.tar.gz
add kikimr_configure
Diffstat (limited to 'contrib/python/chardet/py2')
-rw-r--r--  contrib/python/chardet/py2/test.py         151
-rw-r--r--  contrib/python/chardet/py2/tests/ya.make    21
2 files changed, 172 insertions(+), 0 deletions(-)
diff --git a/contrib/python/chardet/py2/test.py b/contrib/python/chardet/py2/test.py
new file mode 100644
index 0000000000..4235e1f49d
--- /dev/null
+++ b/contrib/python/chardet/py2/test.py
@@ -0,0 +1,151 @@
+"""
+Run chardet on a bunch of documents and see that we get the correct encodings.
+
+:author: Dan Blanchard
+:author: Ian Cordasco
+"""
+
+from __future__ import with_statement
+
+import textwrap
+from difflib import ndiff
+from io import open
+from os import listdir
+from os.path import dirname, isdir, join, splitext, basename
+
+try:
+ import hypothesis.strategies as st
+ from hypothesis import given, assume, settings, Verbosity
+ HAVE_HYPOTHESIS = True
+except ImportError:
+ HAVE_HYPOTHESIS = False
+import pytest
+
+import chardet
+import yatest.common
+
+
+# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
+# retrain the model.
+MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
+ 'windows-1254', 'windows-1256'}
+EXPECTED_FAILURES = {'iso-8859-7-greek/disabled.gr.xml',
+ 'iso-8859-9-turkish/divxplanet.com.xml',
+ 'iso-8859-9-turkish/subtitle.srt',
+ 'iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
+
+def gen_test_params():
+ """Yields tuples of paths and encodings to use for test_encoding_detection"""
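+    # Note: work_path() resolves inside the test's working directory, where
+    # the DATA() resource declared in tests/ya.make is materialized.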
+ base_path = yatest.common.work_path('test_data')
+ for encoding in listdir(base_path):
+ path = join(base_path, encoding)
+ # Skip files in tests directory
+ if not isdir(path):
+ continue
+        # Remove language suffixes from the encoding name if present
+ encoding = encoding.lower()
+ for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
+ '-hebrew', '-hungarian', '-turkish']:
+ if encoding.endswith(postfix):
+ encoding = encoding.rpartition(postfix)[0]
+ break
+ # Skip directories for encodings we don't handle yet.
+ if encoding in MISSING_ENCODINGS:
+ continue
+        # Test encoding detection for each file we have for this encoding
+ for file_name in listdir(path):
+ ext = splitext(file_name)[1].lower()
+ if ext not in ['.html', '.txt', '.xml', '.srt']:
+ continue
+ full_path = join(path, file_name)
+ test_case = full_path, encoding
+ if join(basename(path), file_name) in EXPECTED_FAILURES:
+ test_case = pytest.param(*test_case, marks=pytest.mark.xfail)
+ yield test_case
+
+
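+# pytest applies the ids callable to every parametrized value, so this sees
+# both full file paths (shortened to "<encoding dir>/<file name>") and bare
+# encoding names (for which dirname() is empty, leaving them unchanged).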
+def get_test_name(args):
+ return join(basename(dirname(args)), basename(args))
+
+
+@pytest.mark.parametrize('file_name, encoding', gen_test_params(), ids=get_test_name)
+def test_encoding_detection(file_name, encoding):
+ with open(file_name, 'rb') as f:
+ input_bytes = f.read()
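+    # chardet.detect returns a dict with at least 'encoding' and 'confidence';
+    # 'encoding' is None when detection fails outright.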
+ result = chardet.detect(input_bytes)
+ try:
+ expected_unicode = input_bytes.decode(encoding)
+ except LookupError:
+ expected_unicode = ''
+ try:
+ detected_unicode = input_bytes.decode(result['encoding'])
+ except (LookupError, UnicodeDecodeError, TypeError):
+ detected_unicode = ''
+ if result:
+ encoding_match = (result['encoding'] or '').lower() == encoding
+ else:
+ encoding_match = False
+ # Only care about mismatches that would actually result in different
+ # behavior when decoding
+ if not encoding_match and expected_unicode != detected_unicode:
+ wrapped_expected = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n'
+ wrapped_detected = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n'
+ diff = ''.join(ndiff(wrapped_expected.splitlines(True),
+ wrapped_detected.splitlines(True)))
+ else:
+ diff = ''
+ encoding_match = True
+ assert encoding_match, ("Expected %s, but got %s for %s. Character "
+ "differences: \n%s" % (encoding,
+ result,
+ file_name,
+ diff))
+
+
+if HAVE_HYPOTHESIS:
+ class JustALengthIssue(Exception):
+ pass
+
+
+ @pytest.mark.xfail
+ @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
+ 'utf-32', 'iso-8859-7',
+ 'iso-8859-8', 'windows-1255']),
+ st.randoms())
+ @settings(max_examples=200)
+ def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
+ try:
+ data = txt.encode(enc)
+ except UnicodeEncodeError:
+ assume(False)
+ detected = chardet.detect(data)['encoding']
+ if detected is None:
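+            # A None result may only mean the input was too short. Probe by
+            # appending random suffixes: if detection then succeeds, classify
+            # this as a length issue (expected) rather than a real failure.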
+ with pytest.raises(JustALengthIssue):
+ @given(st.text(), random=rnd)
+ @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
+ def string_poisons_following_text(suffix):
+ try:
+ extended = (txt + suffix).encode(enc)
+ except UnicodeEncodeError:
+ assume(False)
+ result = chardet.detect(extended)
+ if result and result['encoding'] is not None:
+                        raise JustALengthIssue()
+
+            string_poisons_following_text()
+
+
+ @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
+ 'utf-32', 'iso-8859-7',
+ 'iso-8859-8', 'windows-1255']),
+ st.randoms())
+ @settings(max_examples=200)
+ def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
+ try:
+ data = txt.encode(enc)
+ except UnicodeEncodeError:
+ assume(False)
+        result = chardet.detect(data)
+        results = chardet.detect_all(data)
+        assert result['encoding'] == results[0]['encoding'], (
+            '%s != %s' % (result, results))
diff --git a/contrib/python/chardet/py2/tests/ya.make b/contrib/python/chardet/py2/tests/ya.make
new file mode 100644
index 0000000000..3795bfa7fb
--- /dev/null
+++ b/contrib/python/chardet/py2/tests/ya.make
@@ -0,0 +1,21 @@
+PY2TEST()
+
+SRCDIR(contrib/python/chardet/py2)
+
+TEST_SRCS(
+ test.py
+)
+
+PEERDIR(
+ contrib/python/chardet
+)
+
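+# The test_data corpus used by test.py is fetched from this Sandbox resource.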
+DATA(
+ sbr://405525759
+)
+
+SIZE(MEDIUM)
+
+NO_LINT()
+
+END()
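
To run the suite locally, a minimal sketch assuming the repository's standard
ya test runner (where -tt includes MEDIUM-size tests):

    ya make -tt contrib/python/chardet/py2/tests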