author    | robot-ydb-importer <[email protected]> | 2024-06-21 19:38:50 +0300
committer | robot-ydb-importer <[email protected]> | 2024-06-21 19:51:10 +0300
commit    | 3cdeda6fa6a035965e143abf6c6972c912707bef (patch)
tree      | 986be7f2e99f183f5c8babc34f452e49f1b4a138 /contrib/python/chardet/py2/test.py
parent    | 4205a925c8efc7e3c87c27a0c6d697e54cd41beb (diff)
YDB Import 603
733cd37277ee72b54fc223cbea2c1d141412ee3a
Diffstat (limited to 'contrib/python/chardet/py2/test.py')
-rw-r--r-- | contrib/python/chardet/py2/test.py | 151
1 file changed, 151 insertions, 0 deletions
diff --git a/contrib/python/chardet/py2/test.py b/contrib/python/chardet/py2/test.py
new file mode 100644
index 00000000000..4235e1f49de
--- /dev/null
+++ b/contrib/python/chardet/py2/test.py
@@ -0,0 +1,151 @@
+"""
+Run chardet on a bunch of documents and see that we get the correct encodings.
+
+:author: Dan Blanchard
+:author: Ian Cordasco
+"""
+
+from __future__ import with_statement
+
+import textwrap
+from difflib import ndiff
+from io import open
+from os import listdir
+from os.path import dirname, isdir, join, splitext, basename
+
+try:
+    import hypothesis.strategies as st
+    from hypothesis import given, assume, settings, Verbosity
+    HAVE_HYPOTHESIS = True
+except ImportError:
+    HAVE_HYPOTHESIS = False
+import pytest
+
+import chardet
+import yatest.common
+
+
+# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
+#       retrain model.
+MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
+                     'windows-1254', 'windows-1256'}
+EXPECTED_FAILURES = {'iso-8859-7-greek/disabled.gr.xml',
+                     'iso-8859-9-turkish/divxplanet.com.xml',
+                     'iso-8859-9-turkish/subtitle.srt',
+                     'iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
+
+def gen_test_params():
+    """Yields tuples of paths and encodings to use for test_encoding_detection"""
+    base_path = yatest.common.work_path('test_data')
+    for encoding in listdir(base_path):
+        path = join(base_path, encoding)
+        # Skip files in tests directory
+        if not isdir(path):
+            continue
+        # Remove language suffixes from encoding if present
+        encoding = encoding.lower()
+        for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
+                        '-hebrew', '-hungarian', '-turkish']:
+            if encoding.endswith(postfix):
+                encoding = encoding.rpartition(postfix)[0]
+                break
+        # Skip directories for encodings we don't handle yet.
+        if encoding in MISSING_ENCODINGS:
+            continue
+        # Test encoding detection for each file we have for this encoding
+        for file_name in listdir(path):
+            ext = splitext(file_name)[1].lower()
+            if ext not in ['.html', '.txt', '.xml', '.srt']:
+                continue
+            full_path = join(path, file_name)
+            test_case = full_path, encoding
+            if join(basename(path), file_name) in EXPECTED_FAILURES:
+                test_case = pytest.param(*test_case, marks=pytest.mark.xfail)
+            yield test_case
+
+
+def get_test_name(args):
+    return join(basename(dirname(args)), basename(args))
+
+
[email protected]('file_name, encoding', gen_test_params(), ids=get_test_name)
+def test_encoding_detection(file_name, encoding):
+    with open(file_name, 'rb') as f:
+        input_bytes = f.read()
+        result = chardet.detect(input_bytes)
+        try:
+            expected_unicode = input_bytes.decode(encoding)
+        except LookupError:
+            expected_unicode = ''
+        try:
+            detected_unicode = input_bytes.decode(result['encoding'])
+        except (LookupError, UnicodeDecodeError, TypeError):
+            detected_unicode = ''
+    if result:
+        encoding_match = (result['encoding'] or '').lower() == encoding
+    else:
+        encoding_match = False
+    # Only care about mismatches that would actually result in different
+    # behavior when decoding
+    if not encoding_match and expected_unicode != detected_unicode:
+        wrapped_expected = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n'
+        wrapped_detected = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n'
+        diff = ''.join(ndiff(wrapped_expected.splitlines(True),
+                             wrapped_detected.splitlines(True)))
+    else:
+        diff = ''
+        encoding_match = True
+    assert encoding_match, ("Expected %s, but got %s for %s.  Character "
+                            "differences: \n%s" % (encoding,
                                                   result,
                                                   file_name,
                                                   diff))
+
+
+if HAVE_HYPOTHESIS:
+    class JustALengthIssue(Exception):
+        pass
+
+
+    @pytest.mark.xfail
+    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
                                                 'utf-32', 'iso-8859-7',
                                                 'iso-8859-8', 'windows-1255']),
           st.randoms())
+    @settings(max_examples=200)
+    def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
+        try:
+            data = txt.encode(enc)
+        except UnicodeEncodeError:
+            assume(False)
+        detected = chardet.detect(data)['encoding']
+        if detected is None:
+            with pytest.raises(JustALengthIssue):
+                @given(st.text(), random=rnd)
+                @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
+                def string_poisons_following_text(suffix):
+                    try:
+                        extended = (txt + suffix).encode(enc)
+                    except UnicodeEncodeError:
+                        assume(False)
+                    result = chardet.detect(extended)
+                    if result and result['encoding'] is not None:
+                        raise JustALengthIssue()
+
+
+    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
                                                 'utf-32', 'iso-8859-7',
                                                 'iso-8859-8', 'windows-1255']),
           st.randoms())
+    @settings(max_examples=200)
+    def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
+        try:
+            data = txt.encode(enc)
+        except UnicodeEncodeError:
+            assume(False)
+        try:
+            result = chardet.detect(data)
+            results = chardet.detect_all(data)
+            assert result['encoding'] == results[0]['encoding']
+        except Exception:
+            raise Exception('%s != %s' % (result, results))