YDB Import 597

2221b5c9d4887055279d8e5e336b944948a706cc
author: robot-ydb-importer <[email protected]> 2024-05-14 12:38:30 +0300
committer: robot-ydb-importer <[email protected]> 2024-05-14 12:50:25 +0300
commit: 18c097ee61446bfe3e7cf13d0838626e2ecae59c (patch)
tree: da486ca986d9e9a46ce57ac44439e619f0782263 /contrib/python/chardet/py2/test.py
parent: d7d36caff079ed14f6dada5814fa44fe3e65660d (diff)
1 files changed, 0 insertions, 151 deletions
diff --git a/contrib/python/chardet/py2/test.py b/contrib/python/chardet/py2/test.py
deleted file mode 100644
index 4235e1f49de..00000000000
--- a/contrib/python/chardet/py2/test.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""
-Run chardet on a bunch of documents and see that we get the correct encodings.
-
-:author: Dan Blanchard
-:author: Ian Cordasco
-"""
-
-from __future__ import with_statement
-
-import textwrap
-from difflib import ndiff
-from io import open
-from os import listdir
-from os.path import dirname, isdir, join, splitext, basename
-
-try:
-    import hypothesis.strategies as st
-    from hypothesis import given, assume, settings, Verbosity
-    HAVE_HYPOTHESIS = True
-except ImportError:
-    HAVE_HYPOTHESIS = False
-import pytest
-
-import chardet
-import yatest.common
-
-
-# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
-#       retrain model.
-MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
-                     'windows-1254', 'windows-1256'}
-EXPECTED_FAILURES = {'iso-8859-7-greek/disabled.gr.xml',
-                     'iso-8859-9-turkish/divxplanet.com.xml',
-                     'iso-8859-9-turkish/subtitle.srt',
-                     'iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
-
-def gen_test_params():
-    """Yields tuples of paths and encodings to use for test_encoding_detection"""
-    base_path = yatest.common.work_path('test_data')
-    for encoding in listdir(base_path):
-        path = join(base_path, encoding)
-        # Skip files in tests directory
-        if not isdir(path):
-            continue
-        # Remove language suffixes from encoding if pressent
-        encoding = encoding.lower()
-        for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
-                        '-hebrew', '-hungarian', '-turkish']:
-            if encoding.endswith(postfix):
-                encoding = encoding.rpartition(postfix)[0]
-                break
-        # Skip directories for encodings we don't handle yet.
-        if encoding in MISSING_ENCODINGS:
-            continue
-        # Test encoding detection for each file we have of encoding for
-        for file_name in listdir(path):
-            ext = splitext(file_name)[1].lower()
-            if ext not in ['.html', '.txt', '.xml', '.srt']:
-                continue
-            full_path = join(path, file_name)
-            test_case = full_path, encoding
-            if join(basename(path), file_name) in EXPECTED_FAILURES:
-                test_case = pytest.param(*test_case, marks=pytest.mark.xfail)
-            yield test_case
-
-
-def get_test_name(args):
-    return join(basename(dirname(args)), basename(args))
-
-
-@pytest.mark.parametrize ('file_name, encoding', gen_test_params(), ids=get_test_name)
-def test_encoding_detection(file_name, encoding):
-    with open(file_name, 'rb') as f:
-        input_bytes = f.read()
-        result = chardet.detect(input_bytes)
-        try:
-            expected_unicode = input_bytes.decode(encoding)
-        except LookupError:
-            expected_unicode = ''
-        try:
-            detected_unicode = input_bytes.decode(result['encoding'])
-        except (LookupError, UnicodeDecodeError, TypeError):
-            detected_unicode = ''
-    if result:
-        encoding_match = (result['encoding'] or '').lower() == encoding
-    else:
-        encoding_match = False
-    # Only care about mismatches that would actually result in different
-    # behavior when decoding
-    if not encoding_match and expected_unicode != detected_unicode:
-        wrapped_expected = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n'
-        wrapped_detected = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n'
-        diff = ''.join(ndiff(wrapped_expected.splitlines(True),
-                             wrapped_detected.splitlines(True)))
-    else:
-        diff = ''
-        encoding_match = True
-    assert encoding_match, ("Expected %s, but got %s for %s.  Character "
-                            "differences: \n%s" % (encoding,
-                                                   result,
-                                                   file_name,
-                                                   diff))
-
-
-if HAVE_HYPOTHESIS:
-    class JustALengthIssue(Exception):
-        pass
-
-
-    @pytest.mark.xfail
-    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
-                                                 'utf-32', 'iso-8859-7',
-                                                 'iso-8859-8', 'windows-1255']),
-           st.randoms())
-    @settings(max_examples=200)
-    def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
-        try:
-            data = txt.encode(enc)
-        except UnicodeEncodeError:
-            assume(False)
-        detected = chardet.detect(data)['encoding']
-        if detected is None:
-            with pytest.raises(JustALengthIssue):
-                @given(st.text(), random=rnd)
-                @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
-                def string_poisons_following_text(suffix):
-                    try:
-                        extended = (txt + suffix).encode(enc)
-                    except UnicodeEncodeError:
-                        assume(False)
-                    result = chardet.detect(extended)
-                    if result and result['encoding'] is not None:
-                        raise JustALengthIssue()
-
-
-    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
-                                                 'utf-32', 'iso-8859-7',
-                                                 'iso-8859-8', 'windows-1255']),
-           st.randoms())
-    @settings(max_examples=200)
-    def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
-        try:
-            data = txt.encode(enc)
-        except UnicodeEncodeError:
-            assume(False)
-        try:
-            result = chardet.detect(data)
-            results = chardet.detect_all(data)
-            assert result['encoding'] == results[0]['encoding']
-        except Exception:
-            raise Exception('%s != %s' % (result, results))
author	robot-ydb-importer <[email protected]>	2024-05-14 12:38:30 +0300
committer	robot-ydb-importer <[email protected]>	2024-05-14 12:50:25 +0300
commit	18c097ee61446bfe3e7cf13d0838626e2ecae59c (patch)
tree	da486ca986d9e9a46ce57ac44439e619f0782263 /contrib/python/chardet/py2/test.py
parent	d7d36caff079ed14f6dada5814fa44fe3e65660d (diff)