summary | refs | log | tree | commit | diff | stats
path: root/contrib/python/chardet/py2/test.py
diff options
context:
space:
mode:
author robot-ydb-importer <[email protected]> 2024-05-14 12:38:30 +0300
committer robot-ydb-importer <[email protected]> 2024-05-14 12:50:25 +0300
commit 18c097ee61446bfe3e7cf13d0838626e2ecae59c (patch)
tree da486ca986d9e9a46ce57ac44439e619f0782263 /contrib/python/chardet/py2/test.py
parent d7d36caff079ed14f6dada5814fa44fe3e65660d (diff)
YDB Import 597
2221b5c9d4887055279d8e5e336b944948a706cc
Diffstat (limited to 'contrib/python/chardet/py2/test.py')
-rw-r--r-- contrib/python/chardet/py2/test.py 151
1 files changed, 0 insertions, 151 deletions
diff --git a/contrib/python/chardet/py2/test.py b/contrib/python/chardet/py2/test.py
deleted file mode 100644
index 4235e1f49de..00000000000
--- a/contrib/python/chardet/py2/test.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""
-Run chardet on a bunch of documents and see that we get the correct encodings.
-
-:author: Dan Blanchard
-:author: Ian Cordasco
-"""
-
-from __future__ import with_statement
-
-import textwrap
-from difflib import ndiff
-from io import open
-from os import listdir
-from os.path import dirname, isdir, join, splitext, basename
-
-try:
- import hypothesis.strategies as st
- from hypothesis import given, assume, settings, Verbosity
- HAVE_HYPOTHESIS = True
-except ImportError:
- HAVE_HYPOTHESIS = False
-import pytest
-
-import chardet
-import yatest.common
-
-
-# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
-# retrain model.
-MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
- 'windows-1254', 'windows-1256'}
-EXPECTED_FAILURES = {'iso-8859-7-greek/disabled.gr.xml',
- 'iso-8859-9-turkish/divxplanet.com.xml',
- 'iso-8859-9-turkish/subtitle.srt',
- 'iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
-
-def gen_test_params():
- """Yields tuples of paths and encodings to use for test_encoding_detection"""
- base_path = yatest.common.work_path('test_data')
- for encoding in listdir(base_path):
- path = join(base_path, encoding)
- # Skip files in tests directory
- if not isdir(path):
- continue
- # Remove language suffixes from encoding if pressent
- encoding = encoding.lower()
- for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
- '-hebrew', '-hungarian', '-turkish']:
- if encoding.endswith(postfix):
- encoding = encoding.rpartition(postfix)[0]
- break
- # Skip directories for encodings we don't handle yet.
- if encoding in MISSING_ENCODINGS:
- continue
- # Test encoding detection for each file we have of encoding for
- for file_name in listdir(path):
- ext = splitext(file_name)[1].lower()
- if ext not in ['.html', '.txt', '.xml', '.srt']:
- continue
- full_path = join(path, file_name)
- test_case = full_path, encoding
- if join(basename(path), file_name) in EXPECTED_FAILURES:
- test_case = pytest.param(*test_case, marks=pytest.mark.xfail)
- yield test_case
-
-
-def get_test_name(args):
- return join(basename(dirname(args)), basename(args))
-
-
-@pytest.mark.parametrize ('file_name, encoding', gen_test_params(), ids=get_test_name)
-def test_encoding_detection(file_name, encoding):
- with open(file_name, 'rb') as f:
- input_bytes = f.read()
- result = chardet.detect(input_bytes)
- try:
- expected_unicode = input_bytes.decode(encoding)
- except LookupError:
- expected_unicode = ''
- try:
- detected_unicode = input_bytes.decode(result['encoding'])
- except (LookupError, UnicodeDecodeError, TypeError):
- detected_unicode = ''
- if result:
- encoding_match = (result['encoding'] or '').lower() == encoding
- else:
- encoding_match = False
- # Only care about mismatches that would actually result in different
- # behavior when decoding
- if not encoding_match and expected_unicode != detected_unicode:
- wrapped_expected = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n'
- wrapped_detected = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n'
- diff = ''.join(ndiff(wrapped_expected.splitlines(True),
- wrapped_detected.splitlines(True)))
- else:
- diff = ''
- encoding_match = True
- assert encoding_match, ("Expected %s, but got %s for %s. Character "
- "differences: \n%s" % (encoding,
- result,
- file_name,
- diff))
-
-
-if HAVE_HYPOTHESIS:
- class JustALengthIssue(Exception):
- pass
-
-
- @pytest.mark.xfail
- @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
- 'utf-32', 'iso-8859-7',
- 'iso-8859-8', 'windows-1255']),
- st.randoms())
- @settings(max_examples=200)
- def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
- try:
- data = txt.encode(enc)
- except UnicodeEncodeError:
- assume(False)
- detected = chardet.detect(data)['encoding']
- if detected is None:
- with pytest.raises(JustALengthIssue):
- @given(st.text(), random=rnd)
- @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
- def string_poisons_following_text(suffix):
- try:
- extended = (txt + suffix).encode(enc)
- except UnicodeEncodeError:
- assume(False)
- result = chardet.detect(extended)
- if result and result['encoding'] is not None:
- raise JustALengthIssue()
-
-
- @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
- 'utf-32', 'iso-8859-7',
- 'iso-8859-8', 'windows-1255']),
- st.randoms())
- @settings(max_examples=200)
- def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
- try:
- data = txt.encode(enc)
- except UnicodeEncodeError:
- assume(False)
- try:
- result = chardet.detect(data)
- results = chardet.detect_all(data)
- assert result['encoding'] == results[0]['encoding']
- except Exception:
- raise Exception('%s != %s' % (result, results))