path: root/contrib/python/chardet/py3/test.py
author    robot-ydb-importer <[email protected]>    2024-05-14 12:38:30 +0300
committer robot-ydb-importer <[email protected]>    2024-05-14 12:50:25 +0300
commit    18c097ee61446bfe3e7cf13d0838626e2ecae59c (patch)
tree      da486ca986d9e9a46ce57ac44439e619f0782263  /contrib/python/chardet/py3/test.py
parent    d7d36caff079ed14f6dada5814fa44fe3e65660d (diff)
YDB Import 597
2221b5c9d4887055279d8e5e336b944948a706cc
Diffstat (limited to 'contrib/python/chardet/py3/test.py')
-rw-r--r--  contrib/python/chardet/py3/test.py | 240
1 file changed, 0 insertions(+), 240 deletions(-)
diff --git a/contrib/python/chardet/py3/test.py b/contrib/python/chardet/py3/test.py
deleted file mode 100644
index 5be3ab3fa0f..00000000000
--- a/contrib/python/chardet/py3/test.py
+++ /dev/null
@@ -1,240 +0,0 @@
-"""
-Run chardet on a bunch of documents and see that we get the correct encodings.
-
-:author: Dan Blanchard
-:author: Ian Cordasco
-"""
-
-
-import textwrap
-from difflib import ndiff
-from os import listdir
-from os.path import dirname, isdir, join, realpath, relpath, splitext
-from pprint import pformat
-from unicodedata import normalize
-
-try:
- import hypothesis.strategies as st
- from hypothesis import Verbosity, assume, given, settings
-
- HAVE_HYPOTHESIS = True
-except ImportError:
- HAVE_HYPOTHESIS = False
-import pytest # pylint: disable=import-error
-
-import chardet
-from chardet.metadata.languages import LANGUAGES
-
-# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
-# retrain model.
-MISSING_ENCODINGS = {
- "iso-8859-2",
- "iso-8859-6",
- "windows-1250",
- "windows-1254",
- "windows-1256",
-}
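-# Known misdetections; gen_test_params() marks these as xfail below.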
-EXPECTED_FAILURES = {
- "tests/iso-8859-9-turkish/_ude_1.txt",
- "tests/iso-8859-9-turkish/_ude_2.txt",
- "tests/iso-8859-9-turkish/divxplanet.com.xml",
- "tests/iso-8859-9-turkish/subtitle.srt",
- "tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt",
-}
-
-
-def gen_test_params():
- """Yields tuples of paths and encodings to use for test_encoding_detection"""
- import yatest.common
- base_path = yatest.common.work_path('test_data/tests')
- for encoding in listdir(base_path):
- path = join(base_path, encoding)
-        # Skip anything that is not a per-encoding subdirectory
- if not isdir(path):
- continue
- # Remove language suffixes from encoding if present
- encoding = encoding.lower()
- for language in sorted(LANGUAGES.keys()):
- postfix = "-" + language.lower()
- if encoding.endswith(postfix):
- encoding = encoding.rpartition(postfix)[0]
- break
- # Skip directories for encodings we don't handle yet.
- if encoding in MISSING_ENCODINGS:
- continue
-        # Test encoding detection for each file we have for this encoding
- for file_name in listdir(path):
- ext = splitext(file_name)[1].lower()
- if ext not in [".html", ".txt", ".xml", ".srt"]:
- continue
- full_path = join(path, file_name)
- test_case = full_path, encoding
- name_test = full_path.split("/test_data/")[-1]
- if name_test in EXPECTED_FAILURES:
- test_case = pytest.param(*test_case, marks=pytest.mark.xfail, id=name_test)
- else:
- test_case = pytest.param(*test_case, id=name_test)
- yield test_case
-
-
[email protected]("file_name, encoding", gen_test_params())
-def test_encoding_detection(file_name, encoding):
- with open(file_name, "rb") as f:
- input_bytes = f.read()
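-    # chardet.detect returns a dict with "encoding", "confidence", and
-    # "language" keys; "encoding" may be None when detection fails.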
- result = chardet.detect(input_bytes)
- try:
- expected_unicode = input_bytes.decode(encoding)
- except LookupError:
- expected_unicode = ""
- try:
- detected_unicode = input_bytes.decode(result["encoding"])
- except (LookupError, UnicodeDecodeError, TypeError):
- detected_unicode = ""
- if result:
- encoding_match = (result["encoding"] or "").lower() == encoding
- else:
- encoding_match = False
- # Only care about mismatches that would actually result in different
- # behavior when decoding
- expected_unicode = normalize("NFKC", expected_unicode)
- detected_unicode = normalize("NFKC", detected_unicode)
- if not encoding_match and expected_unicode != detected_unicode:
- wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
- wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
- diff = "".join(
- [
- line
- for line in ndiff(
- wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
- )
- if not line.startswith(" ")
- ][:20]
- )
- all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
- else:
- diff = ""
- encoding_match = True
- all_encodings = [result]
- assert encoding_match, (
- f"Expected {encoding}, but got {result} for {file_name}. First 20 "
- f"lines with character differences: \n{diff}\n"
- f"All encodings: {pformat(all_encodings)}"
- )
-
-
[email protected]("file_name, encoding", gen_test_params())
-def test_encoding_detection_rename_legacy(file_name, encoding):
- with open(file_name, "rb") as f:
- input_bytes = f.read()
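-    # should_rename_legacy=True makes detect report modern superset names for
-    # legacy encodings (e.g. GB2312 is reported as GB18030).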
- result = chardet.detect(input_bytes, should_rename_legacy=True)
- try:
- expected_unicode = input_bytes.decode(encoding)
- except LookupError:
- expected_unicode = ""
- try:
- detected_unicode = input_bytes.decode(result["encoding"])
- except (LookupError, UnicodeDecodeError, TypeError):
- detected_unicode = ""
- if result:
- encoding_match = (result["encoding"] or "").lower() == encoding
- else:
- encoding_match = False
- # Only care about mismatches that would actually result in different
- # behavior when decoding
- expected_unicode = normalize("NFKD", expected_unicode)
- detected_unicode = normalize("NFKD", detected_unicode)
- if not encoding_match and expected_unicode != detected_unicode:
- wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
- wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
- diff = "".join(
- [
- line
- for line in ndiff(
- wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
- )
- if not line.startswith(" ")
- ][:20]
- )
- all_encodings = chardet.detect_all(
- input_bytes, ignore_threshold=True, should_rename_legacy=True
- )
- else:
- diff = ""
- encoding_match = True
- all_encodings = [result]
- assert encoding_match, (
- f"Expected {encoding}, but got {result} for {file_name}. First 20 "
- f"lines of character differences: \n{diff}\n"
- f"All encodings: {pformat(all_encodings)}"
- )
-
-
-if HAVE_HYPOTHESIS:
-
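-    # Raised when a detection failure is attributable to input length alone,
-    # i.e. appending more text makes detection succeed.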
- class JustALengthIssue(Exception):
- pass
-
- @pytest.mark.xfail
- @given(
- st.text(min_size=1),
- st.sampled_from(
- [
- "ascii",
- "utf-8",
- "utf-16",
- "utf-32",
- "iso-8859-7",
- "iso-8859-8",
- "windows-1255",
- ]
- ),
- st.randoms(),
- )
- @settings(max_examples=200)
- def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
- try:
- data = txt.encode(enc)
- except UnicodeEncodeError:
- assume(False)
- detected = chardet.detect(data)["encoding"]
- if detected is None:
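-            # If some suffix makes detection succeed, the original failure was
-            # merely a length issue, not a genuinely undetectable input.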
- with pytest.raises(JustALengthIssue):
-
- @given(st.text(), random=rnd)
- @settings(verbosity=Verbosity.quiet, max_examples=50)
- def string_poisons_following_text(suffix):
- try:
- extended = (txt + suffix).encode(enc)
- except UnicodeEncodeError:
- assume(False)
-                result = chardet.detect(extended)
-                if result and result["encoding"] is not None:
-                    raise JustALengthIssue()
-
-            string_poisons_following_text()
-
- @given(
- st.text(min_size=1),
- st.sampled_from(
- [
- "ascii",
- "utf-8",
- "utf-16",
- "utf-32",
- "iso-8859-7",
- "iso-8859-8",
- "windows-1255",
- ]
- ),
- st.randoms(),
- )
- @settings(max_examples=200)
- def test_detect_all_and_detect_one_should_agree(txt, enc, _):
- try:
- data = txt.encode(enc)
- except UnicodeEncodeError:
- assume(False)
-        # Detect outside the try block so the error message below cannot hit
-        # unbound names if detect() itself raises.
-        result = chardet.detect(data)
-        results = chardet.detect_all(data)
-        try:
-            assert result["encoding"] == results[0]["encoding"]
-        except Exception as exc:
-            raise RuntimeError(f"{result} != {results}") from exc
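
For reference, a minimal sketch of the chardet calls the deleted tests exercised; this is an illustration with made-up sample text, and it assumes chardet >= 5.1, where detect_all() and the should_rename_legacy flag are available:

    import chardet

    # Hypothetical sample: Turkish text encoded as ISO-8859-9.
    data = "önsöz ve giriş".encode("iso-8859-9")

    # Single best guess: {"encoding": ..., "confidence": ..., "language": ...}.
    result = chardet.detect(data)

    # Every prober's guess, even low-confidence ones, best guess first.
    candidates = chardet.detect_all(data, ignore_threshold=True)

    # Same detection, but legacy encodings renamed to modern supersets.
    renamed = chardet.detect(data, should_rename_legacy=True)

    print(result["encoding"], candidates[0]["encoding"], renamed["encoding"])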