| author | shmel1k <shmel1k@ydb.tech> | 2023-11-26 18:16:14 +0300 |
|---|---|---|
| committer | shmel1k <shmel1k@ydb.tech> | 2023-11-26 18:43:30 +0300 |
| commit | b8cf9e88f4c5c64d9406af533d8948deb050d695 (patch) | |
| tree | 218eb61fb3c3b96ec08b4d8cdfef383104a87d63 /contrib/python/chardet/py2 | |
| parent | 523f645a83a0ec97a0332dbc3863bb354c92a328 (diff) | |
| download | ydb-b8cf9e88f4c5c64d9406af533d8948deb050d695.tar.gz | |
add kikimr_configure
Diffstat (limited to 'contrib/python/chardet/py2')
| -rw-r--r-- | contrib/python/chardet/py2/test.py | 151 |
| -rw-r--r-- | contrib/python/chardet/py2/tests/ya.make | 21 |
2 files changed, 172 insertions, 0 deletions
```diff
diff --git a/contrib/python/chardet/py2/test.py b/contrib/python/chardet/py2/test.py
new file mode 100644
index 0000000000..4235e1f49d
--- /dev/null
+++ b/contrib/python/chardet/py2/test.py
@@ -0,0 +1,151 @@
+"""
+Run chardet on a bunch of documents and see that we get the correct encodings.
+
+:author: Dan Blanchard
+:author: Ian Cordasco
+"""
+
+from __future__ import with_statement
+
+import textwrap
+from difflib import ndiff
+from io import open
+from os import listdir
+from os.path import dirname, isdir, join, splitext, basename
+
+try:
+    import hypothesis.strategies as st
+    from hypothesis import given, assume, settings, Verbosity
+    HAVE_HYPOTHESIS = True
+except ImportError:
+    HAVE_HYPOTHESIS = False
+import pytest
+
+import chardet
+import yatest.common
+
+
+# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
+#       retrain model.
+MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
+                     'windows-1254', 'windows-1256'}
+EXPECTED_FAILURES = {'iso-8859-7-greek/disabled.gr.xml',
+                     'iso-8859-9-turkish/divxplanet.com.xml',
+                     'iso-8859-9-turkish/subtitle.srt',
+                     'iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
+
+def gen_test_params():
+    """Yields tuples of paths and encodings to use for test_encoding_detection"""
+    base_path = yatest.common.work_path('test_data')
+    for encoding in listdir(base_path):
+        path = join(base_path, encoding)
+        # Skip files in tests directory
+        if not isdir(path):
+            continue
+        # Remove language suffixes from encoding if present
+        encoding = encoding.lower()
+        for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
+                        '-hebrew', '-hungarian', '-turkish']:
+            if encoding.endswith(postfix):
+                encoding = encoding.rpartition(postfix)[0]
+                break
+        # Skip directories for encodings we don't handle yet.
+        if encoding in MISSING_ENCODINGS:
+            continue
+        # Test encoding detection for each file we have of that encoding
+        for file_name in listdir(path):
+            ext = splitext(file_name)[1].lower()
+            if ext not in ['.html', '.txt', '.xml', '.srt']:
+                continue
+            full_path = join(path, file_name)
+            test_case = full_path, encoding
+            if join(basename(path), file_name) in EXPECTED_FAILURES:
+                test_case = pytest.param(*test_case, marks=pytest.mark.xfail)
+            yield test_case
+
+
+def get_test_name(args):
+    return join(basename(dirname(args)), basename(args))
+
+
+@pytest.mark.parametrize('file_name, encoding', gen_test_params(), ids=get_test_name)
+def test_encoding_detection(file_name, encoding):
+    with open(file_name, 'rb') as f:
+        input_bytes = f.read()
+        result = chardet.detect(input_bytes)
+        try:
+            expected_unicode = input_bytes.decode(encoding)
+        except LookupError:
+            expected_unicode = ''
+        try:
+            detected_unicode = input_bytes.decode(result['encoding'])
+        except (LookupError, UnicodeDecodeError, TypeError):
+            detected_unicode = ''
+    if result:
+        encoding_match = (result['encoding'] or '').lower() == encoding
+    else:
+        encoding_match = False
+    # Only care about mismatches that would actually result in different
+    # behavior when decoding
+    if not encoding_match and expected_unicode != detected_unicode:
+        wrapped_expected = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n'
+        wrapped_detected = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n'
+        diff = ''.join(ndiff(wrapped_expected.splitlines(True),
+                             wrapped_detected.splitlines(True)))
+    else:
+        diff = ''
+        encoding_match = True
+    assert encoding_match, ("Expected %s, but got %s for %s.  Character "
+                            "differences: \n%s" % (encoding,
+                                                   result,
+                                                   file_name,
+                                                   diff))
+
+
+if HAVE_HYPOTHESIS:
+    class JustALengthIssue(Exception):
+        pass
+
+
+    @pytest.mark.xfail
+    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
+                                                 'utf-32', 'iso-8859-7',
+                                                 'iso-8859-8', 'windows-1255']),
+           st.randoms())
+    @settings(max_examples=200)
+    def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
+        try:
+            data = txt.encode(enc)
+        except UnicodeEncodeError:
+            assume(False)
+        detected = chardet.detect(data)['encoding']
+        if detected is None:
+            with pytest.raises(JustALengthIssue):
+                @given(st.text(), random=rnd)
+                @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
+                def string_poisons_following_text(suffix):
+                    try:
+                        extended = (txt + suffix).encode(enc)
+                    except UnicodeEncodeError:
+                        assume(False)
+                    result = chardet.detect(extended)
+                    if result and result['encoding'] is not None:
+                        raise JustALengthIssue()
+
+
+    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
+                                                 'utf-32', 'iso-8859-7',
+                                                 'iso-8859-8', 'windows-1255']),
+           st.randoms())
+    @settings(max_examples=200)
+    def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
+        try:
+            data = txt.encode(enc)
+        except UnicodeEncodeError:
+            assume(False)
+        try:
+            result = chardet.detect(data)
+            results = chardet.detect_all(data)
+            assert result['encoding'] == results[0]['encoding']
+        except Exception:
+            raise Exception('%s != %s' % (result, results))
```

```diff
diff --git a/contrib/python/chardet/py2/tests/ya.make b/contrib/python/chardet/py2/tests/ya.make
new file mode 100644
index 0000000000..3795bfa7fb
--- /dev/null
+++ b/contrib/python/chardet/py2/tests/ya.make
@@ -0,0 +1,21 @@
+PY2TEST()
+
+SRCDIR(contrib/python/chardet/py2)
+
+TEST_SRCS(
+    test.py
+)
+
+PEERDIR(
+    contrib/python/chardet
+)
+
+DATA(
+    sbr://405525759
+)
+
+SIZE(MEDIUM)
+
+NO_LINT()
+
+END()
```