author     robot-ydb-importer <[email protected]>    2024-06-21 19:38:50 +0300
committer  robot-ydb-importer <[email protected]>    2024-06-21 19:51:10 +0300
commit     3cdeda6fa6a035965e143abf6c6972c912707bef (patch)
tree       986be7f2e99f183f5c8babc34f452e49f1b4a138 /contrib/python/chardet/py3/test.py
parent     4205a925c8efc7e3c87c27a0c6d697e54cd41beb (diff)
YDB Import 603
733cd37277ee72b54fc223cbea2c1d141412ee3a
Diffstat (limited to 'contrib/python/chardet/py3/test.py')
-rw-r--r--    contrib/python/chardet/py3/test.py    240
1 file changed, 240 insertions(+), 0 deletions(-)
diff --git a/contrib/python/chardet/py3/test.py b/contrib/python/chardet/py3/test.py
new file mode 100644
index 00000000000..5be3ab3fa0f
--- /dev/null
+++ b/contrib/python/chardet/py3/test.py
@@ -0,0 +1,240 @@
+"""
+Run chardet on a bunch of documents and see that we get the correct encodings.
+
+:author: Dan Blanchard
+:author: Ian Cordasco
+"""
+
+
+import textwrap
+from difflib import ndiff
+from os import listdir
+from os.path import dirname, isdir, join, realpath, relpath, splitext
+from pprint import pformat
+from unicodedata import normalize
+
+try:
+ import hypothesis.strategies as st
+ from hypothesis import Verbosity, assume, given, settings
+
+ HAVE_HYPOTHESIS = True
+except ImportError:
+ HAVE_HYPOTHESIS = False
+import pytest # pylint: disable=import-error
+
+import chardet
+from chardet.metadata.languages import LANGUAGES
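+# LANGUAGES maps language names to model metadata; only its keys are used
+# here, to strip language suffixes such as "-turkish" from directory names.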
+
+# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
+# retrain model.
+MISSING_ENCODINGS = {
+ "iso-8859-2",
+ "iso-8859-6",
+ "windows-1250",
+ "windows-1254",
+ "windows-1256",
+}
+EXPECTED_FAILURES = {
+ "tests/iso-8859-9-turkish/_ude_1.txt",
+ "tests/iso-8859-9-turkish/_ude_2.txt",
+ "tests/iso-8859-9-turkish/divxplanet.com.xml",
+ "tests/iso-8859-9-turkish/subtitle.srt",
+ "tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt",
+}
+
+
+def gen_test_params():
+ """Yields tuples of paths and encodings to use for test_encoding_detection"""
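+    # yatest.common is only available under the Yandex/Arcadia test runner
+    # (hence the function-local import); work_path() resolves a path inside
+    # the test's working directory.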
+ import yatest.common
+ base_path = yatest.common.work_path('test_data/tests')
+ for encoding in listdir(base_path):
+ path = join(base_path, encoding)
+ # Skip files in tests directory
+ if not isdir(path):
+ continue
+ # Remove language suffixes from encoding if present
+ encoding = encoding.lower()
+ for language in sorted(LANGUAGES.keys()):
+ postfix = "-" + language.lower()
+ if encoding.endswith(postfix):
+ encoding = encoding.rpartition(postfix)[0]
+ break
+ # Skip directories for encodings we don't handle yet.
+ if encoding in MISSING_ENCODINGS:
+ continue
+        # Test encoding detection for each file we have in this encoding
+ for file_name in listdir(path):
+ ext = splitext(file_name)[1].lower()
+ if ext not in [".html", ".txt", ".xml", ".srt"]:
+ continue
+ full_path = join(path, file_name)
+ test_case = full_path, encoding
+ name_test = full_path.split("/test_data/")[-1]
+ if name_test in EXPECTED_FAILURES:
+ test_case = pytest.param(*test_case, marks=pytest.mark.xfail, id=name_test)
+ else:
+ test_case = pytest.param(*test_case, id=name_test)
+ yield test_case
+
+
[email protected]("file_name, encoding", gen_test_params())
+def test_encoding_detection(file_name, encoding):
+ with open(file_name, "rb") as f:
+ input_bytes = f.read()
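+        # detect() returns a dict of the form
+        # {"encoding": str or None, "confidence": float, "language": str}.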
+ result = chardet.detect(input_bytes)
+ try:
+ expected_unicode = input_bytes.decode(encoding)
+ except LookupError:
+ expected_unicode = ""
+ try:
+ detected_unicode = input_bytes.decode(result["encoding"])
+ except (LookupError, UnicodeDecodeError, TypeError):
+ detected_unicode = ""
+ if result:
+ encoding_match = (result["encoding"] or "").lower() == encoding
+ else:
+ encoding_match = False
+ # Only care about mismatches that would actually result in different
+ # behavior when decoding
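+    # (NFKC normalization folds compatibility-equivalent characters so that
+    # cosmetically identical decodings compare equal.)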
+ expected_unicode = normalize("NFKC", expected_unicode)
+ detected_unicode = normalize("NFKC", detected_unicode)
+ if not encoding_match and expected_unicode != detected_unicode:
+ wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
+ wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
+ diff = "".join(
+ [
+ line
+ for line in ndiff(
+ wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
+ )
+ if not line.startswith(" ")
+ ][:20]
+ )
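+        # ignore_threshold=True also returns candidates whose confidence is
+        # below chardet's minimum threshold, which helps debug failures.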
+ all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
+ else:
+ diff = ""
+ encoding_match = True
+ all_encodings = [result]
+ assert encoding_match, (
+ f"Expected {encoding}, but got {result} for {file_name}. First 20 "
+        f"lines of character differences: \n{diff}\n"
+ f"All encodings: {pformat(all_encodings)}"
+ )
+
+
[email protected]("file_name, encoding", gen_test_params())
+def test_encoding_detection_rename_legacy(file_name, encoding):
+ with open(file_name, "rb") as f:
+ input_bytes = f.read()
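+        # should_rename_legacy=True makes chardet report modern superset
+        # names for some legacy encodings (e.g. Windows-1252 in place of
+        # ISO-8859-1).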
+ result = chardet.detect(input_bytes, should_rename_legacy=True)
+ try:
+ expected_unicode = input_bytes.decode(encoding)
+ except LookupError:
+ expected_unicode = ""
+ try:
+ detected_unicode = input_bytes.decode(result["encoding"])
+ except (LookupError, UnicodeDecodeError, TypeError):
+ detected_unicode = ""
+ if result:
+ encoding_match = (result["encoding"] or "").lower() == encoding
+ else:
+ encoding_match = False
+ # Only care about mismatches that would actually result in different
+ # behavior when decoding
+ expected_unicode = normalize("NFKD", expected_unicode)
+ detected_unicode = normalize("NFKD", detected_unicode)
+ if not encoding_match and expected_unicode != detected_unicode:
+ wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
+ wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
+ diff = "".join(
+ [
+ line
+ for line in ndiff(
+ wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
+ )
+ if not line.startswith(" ")
+ ][:20]
+ )
+ all_encodings = chardet.detect_all(
+ input_bytes, ignore_threshold=True, should_rename_legacy=True
+ )
+ else:
+ diff = ""
+ encoding_match = True
+ all_encodings = [result]
+ assert encoding_match, (
+ f"Expected {encoding}, but got {result} for {file_name}. First 20 "
+ f"lines of character differences: \n{diff}\n"
+ f"All encodings: {pformat(all_encodings)}"
+ )
+
+
+if HAVE_HYPOTHESIS:
+
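+    # Raised by the nested check in the test below when detection fails only
+    # because the input is too short, not because no encoding fits.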
+ class JustALengthIssue(Exception):
+ pass
+
+ @pytest.mark.xfail
+ @given(
+ st.text(min_size=1),
+ st.sampled_from(
+ [
+ "ascii",
+ "utf-8",
+ "utf-16",
+ "utf-32",
+ "iso-8859-7",
+ "iso-8859-8",
+ "windows-1255",
+ ]
+ ),
+ st.randoms(),
+ )
+ @settings(max_examples=200)
+ def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
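+        # If detection fails on txt alone, Hypothesis appends random
+        # suffixes; if any extended string is detected, the original
+        # failure was merely a length issue.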
+ try:
+ data = txt.encode(enc)
+ except UnicodeEncodeError:
+ assume(False)
+ detected = chardet.detect(data)["encoding"]
+ if detected is None:
+ with pytest.raises(JustALengthIssue):
+
+ @given(st.text(), random=rnd)
+ @settings(verbosity=Verbosity.quiet, max_examples=50)
+ def string_poisons_following_text(suffix):
+ try:
+ extended = (txt + suffix).encode(enc)
+ except UnicodeEncodeError:
+ assume(False)
+ result = chardet.detect(extended)
+ if result and result["encoding"] is not None:
+ raise JustALengthIssue()
+
+                # Run the nested check; without this call nothing inside the
+                # pytest.raises block would execute, so JustALengthIssue
+                # could never be raised.
+                string_poisons_following_text()
+
+ @given(
+ st.text(min_size=1),
+ st.sampled_from(
+ [
+ "ascii",
+ "utf-8",
+ "utf-16",
+ "utf-32",
+ "iso-8859-7",
+ "iso-8859-8",
+ "windows-1255",
+ ]
+ ),
+ st.randoms(),
+ )
+ @settings(max_examples=200)
+ def test_detect_all_and_detect_one_should_agree(txt, enc, _):
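+        # detect() must agree with the top-ranked result of detect_all().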
+ try:
+ data = txt.encode(enc)
+ except UnicodeEncodeError:
+ assume(False)
+        # Pre-initialize so the diagnostic message below cannot raise a
+        # NameError of its own if detect() fails before assignment.
+        result = results = None
+        try:
+            result = chardet.detect(data)
+            results = chardet.detect_all(data)
+            assert result["encoding"] == results[0]["encoding"]
+        except Exception as exc:
+            raise RuntimeError(f"{result} != {results}") from exc