author     arcadia-devtools <arcadia-devtools@yandex-team.ru>  2022-06-20 18:39:30 +0300
committer  arcadia-devtools <arcadia-devtools@yandex-team.ru>  2022-06-20 18:39:30 +0300
commit     798d25a291578fceb2223382b508fba1723fef4a (patch)
tree       227b9a24000c40ae3354f4321ff9fe19143423f7 /contrib
parent     d934aec555f13784eabe2d7682211050918e6cf5 (diff)
download   ydb-798d25a291578fceb2223382b508fba1723fef4a.tar.gz
intermediate changes
ref:ac842eacda5e614f20cf9d3985d932732f92beab
Diffstat (limited to 'contrib')
11 files changed, 1451 insertions, 1541 deletions
diff --git a/contrib/python/charset-normalizer/.dist-info/METADATA b/contrib/python/charset-normalizer/.dist-info/METADATA index 1b04ed4c4e..0ba0f9d513 100644 --- a/contrib/python/charset-normalizer/.dist-info/METADATA +++ b/contrib/python/charset-normalizer/.dist-info/METADATA @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: charset-normalizer -Version: 2.0.12 +Version: 2.1.0 Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet. Home-page: https://github.com/ousret/charset_normalizer Author: Ahmed TAHRI @Ousret @@ -10,13 +10,13 @@ Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest Keywords: encoding,i18n,txt,text,charset,charset-detector,normalization,unicode,chardet Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable Classifier: License :: OSI Approved :: MIT License Classifier: Intended Audience :: Developers Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 3 -Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: 3.8 @@ -27,7 +27,7 @@ Classifier: Topic :: Text Processing :: Linguistic Classifier: Topic :: Utilities Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Typing :: Typed -Requires-Python: >=3.5.0 +Requires-Python: >=3.6.0 Description-Content-Type: text/markdown License-File: LICENSE Provides-Extra: unicode_backport @@ -87,13 +87,13 @@ This package offer better performance than its counterpart Chardet. Here are som | Package | Accuracy | Mean per file (ms) | File per sec (est) | | ------------- | :-------------: | :------------------: | :------------------: | -| [chardet](https://github.com/chardet/chardet) | 92 % | 220 ms | 5 file/sec | -| charset-normalizer | **98 %** | **40 ms** | 25 file/sec | +| [chardet](https://github.com/chardet/chardet) | 92 % | 200 ms | 5 file/sec | +| charset-normalizer | **98 %** | **39 ms** | 26 file/sec | | Package | 99th percentile | 95th percentile | 50th percentile | | ------------- | :-------------: | :------------------: | :------------------: | -| [chardet](https://github.com/chardet/chardet) | 1115 ms | 300 ms | 27 ms | -| charset-normalizer | 460 ms | 240 ms | 18 ms | +| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms | +| charset-normalizer | 400 ms | 200 ms | 15 ms | Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload. diff --git a/contrib/python/charset-normalizer/README.md b/contrib/python/charset-normalizer/README.md index b4c957a63c..904b60ea22 100644 --- a/contrib/python/charset-normalizer/README.md +++ b/contrib/python/charset-normalizer/README.md @@ -51,13 +51,13 @@ This package offer better performance than its counterpart Chardet. 
Here are som | Package | Accuracy | Mean per file (ms) | File per sec (est) | | ------------- | :-------------: | :------------------: | :------------------: | -| [chardet](https://github.com/chardet/chardet) | 92 % | 220 ms | 5 file/sec | -| charset-normalizer | **98 %** | **40 ms** | 25 file/sec | +| [chardet](https://github.com/chardet/chardet) | 92 % | 200 ms | 5 file/sec | +| charset-normalizer | **98 %** | **39 ms** | 26 file/sec | | Package | 99th percentile | 95th percentile | 50th percentile | | ------------- | :-------------: | :------------------: | :------------------: | -| [chardet](https://github.com/chardet/chardet) | 1115 ms | 300 ms | 27 ms | -| charset-normalizer | 460 ms | 240 ms | 18 ms | +| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms | +| charset-normalizer | 400 ms | 200 ms | 15 ms | Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload. diff --git a/contrib/python/charset-normalizer/charset_normalizer/api.py b/contrib/python/charset-normalizer/charset_normalizer/api.py index bdc8ed9893..ae08361bb4 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/api.py +++ b/contrib/python/charset-normalizer/charset_normalizer/api.py @@ -1,12 +1,8 @@ import logging +from os import PathLike from os.path import basename, splitext from typing import BinaryIO, List, Optional, Set -try: - from os import PathLike -except ImportError: # pragma: no cover - PathLike = str # type: ignore - from .cd import ( coherence_ratio, encoding_languages, @@ -18,6 +14,7 @@ from .md import mess_ratio from .models import CharsetMatch, CharsetMatches from .utils import ( any_specified_encoding, + cut_sequence_chunks, iana_name, identify_sig_or_bom, is_cp_similar, @@ -70,11 +67,11 @@ def from_bytes( ) if explain: - previous_logger_level = logger.level # type: int + previous_logger_level: int = logger.level logger.addHandler(explain_handler) logger.setLevel(TRACE) - length = len(sequences) # type: int + length: int = len(sequences) if length == 0: logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.") @@ -119,8 +116,8 @@ def from_bytes( if steps > 1 and length / steps < chunk_size: chunk_size = int(length / steps) - is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE # type: bool - is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool + is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE + is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE if is_too_small_sequence: logger.log( @@ -137,11 +134,11 @@ def from_bytes( ), ) - prioritized_encodings = [] # type: List[str] + prioritized_encodings: List[str] = [] - specified_encoding = ( + specified_encoding: Optional[str] = ( any_specified_encoding(sequences) if preemptive_behaviour else None - ) # type: Optional[str] + ) if specified_encoding is not None: prioritized_encodings.append(specified_encoding) @@ -151,15 +148,15 @@ def from_bytes( specified_encoding, ) - tested = set() # type: Set[str] - tested_but_hard_failure = [] # type: List[str] - tested_but_soft_failure = [] # type: List[str] + tested: Set[str] = set() + tested_but_hard_failure: List[str] = [] + tested_but_soft_failure: List[str] = [] - fallback_ascii = None # type: Optional[CharsetMatch] - fallback_u8 = None # type: Optional[CharsetMatch] - fallback_specified = None # type: Optional[CharsetMatch] + fallback_ascii: Optional[CharsetMatch] = None + fallback_u8: Optional[CharsetMatch] = None + fallback_specified: Optional[CharsetMatch] = 
None - results = CharsetMatches() # type: CharsetMatches + results: CharsetMatches = CharsetMatches() sig_encoding, sig_payload = identify_sig_or_bom(sequences) @@ -190,11 +187,11 @@ def from_bytes( tested.add(encoding_iana) - decoded_payload = None # type: Optional[str] - bom_or_sig_available = sig_encoding == encoding_iana # type: bool - strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom( + decoded_payload: Optional[str] = None + bom_or_sig_available: bool = sig_encoding == encoding_iana + strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom( encoding_iana - ) # type: bool + ) if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available: logger.log( @@ -205,7 +202,7 @@ def from_bytes( continue try: - is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool + is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana) except (ModuleNotFoundError, ImportError): logger.log( TRACE, @@ -240,7 +237,7 @@ def from_bytes( tested_but_hard_failure.append(encoding_iana) continue - similar_soft_failure_test = False # type: bool + similar_soft_failure_test: bool = False for encoding_soft_failed in tested_but_soft_failure: if is_cp_similar(encoding_iana, encoding_soft_failed): @@ -262,11 +259,11 @@ def from_bytes( int(length / steps), ) - multi_byte_bonus = ( + multi_byte_bonus: bool = ( is_multi_byte_decoder and decoded_payload is not None and len(decoded_payload) < length - ) # type: bool + ) if multi_byte_bonus: logger.log( @@ -276,72 +273,47 @@ def from_bytes( encoding_iana, ) - max_chunk_gave_up = int(len(r_) / 4) # type: int + max_chunk_gave_up: int = int(len(r_) / 4) max_chunk_gave_up = max(max_chunk_gave_up, 2) - early_stop_count = 0 # type: int + early_stop_count: int = 0 lazy_str_hard_failure = False - md_chunks = [] # type: List[str] + md_chunks: List[str] = [] md_ratios = [] - for i in r_: - if i + chunk_size > length + 8: - continue - - cut_sequence = sequences[i : i + chunk_size] - - if bom_or_sig_available and strip_sig_or_bom is False: - cut_sequence = sig_payload + cut_sequence - - try: - chunk = cut_sequence.decode( - encoding_iana, - errors="ignore" if is_multi_byte_decoder else "strict", - ) # type: str - except UnicodeDecodeError as e: # Lazy str loading may have missed something there - logger.log( - TRACE, - "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s", - encoding_iana, - str(e), - ) - early_stop_count = max_chunk_gave_up - lazy_str_hard_failure = True - break + try: + for chunk in cut_sequence_chunks( + sequences, + encoding_iana, + r_, + chunk_size, + bom_or_sig_available, + strip_sig_or_bom, + sig_payload, + is_multi_byte_decoder, + decoded_payload, + ): + md_chunks.append(chunk) - # multi-byte bad cutting detector and adjustment - # not the cleanest way to perform that fix but clever enough for now. 
- if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80: + md_ratios.append(mess_ratio(chunk, threshold)) - chunk_partial_size_chk = min(chunk_size, 16) # type: int + if md_ratios[-1] >= threshold: + early_stop_count += 1 - if ( - decoded_payload - and chunk[:chunk_partial_size_chk] not in decoded_payload + if (early_stop_count >= max_chunk_gave_up) or ( + bom_or_sig_available and strip_sig_or_bom is False ): - for j in range(i, i - 4, -1): - cut_sequence = sequences[j : i + chunk_size] - - if bom_or_sig_available and strip_sig_or_bom is False: - cut_sequence = sig_payload + cut_sequence - - chunk = cut_sequence.decode(encoding_iana, errors="ignore") - - if chunk[:chunk_partial_size_chk] in decoded_payload: - break - - md_chunks.append(chunk) - - md_ratios.append(mess_ratio(chunk, threshold)) - - if md_ratios[-1] >= threshold: - early_stop_count += 1 - - if (early_stop_count >= max_chunk_gave_up) or ( - bom_or_sig_available and strip_sig_or_bom is False - ): - break + break + except UnicodeDecodeError as e: # Lazy str loading may have missed something there + logger.log( + TRACE, + "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s", + encoding_iana, + str(e), + ) + early_stop_count = max_chunk_gave_up + lazy_str_hard_failure = True # We might want to check the sequence again with the whole content # Only if initial MD tests passes @@ -362,9 +334,7 @@ def from_bytes( tested_but_hard_failure.append(encoding_iana) continue - mean_mess_ratio = ( - sum(md_ratios) / len(md_ratios) if md_ratios else 0.0 - ) # type: float + mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0 if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up: tested_but_soft_failure.append(encoding_iana) logger.log( @@ -399,7 +369,7 @@ def from_bytes( ) if not is_multi_byte_decoder: - target_languages = encoding_languages(encoding_iana) # type: List[str] + target_languages: List[str] = encoding_languages(encoding_iana) else: target_languages = mb_encoding_languages(encoding_iana) diff --git a/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py b/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py index b2e56ff398..b9a3700f79 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py +++ b/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py @@ -1,1244 +1,1122 @@ # -*- coding: utf_8 -*- -from collections import OrderedDict +from typing import Dict, List -FREQUENCIES = OrderedDict( - [ - ( - "English", - [ - "e", - "a", - "t", - "i", - "o", - "n", - "s", - "r", - "h", - "l", - "d", - "c", - "u", - "m", - "f", - "p", - "g", - "w", - "y", - "b", - "v", - "k", - "x", - "j", - "z", - "q", - ], - ), - ( - "German", - [ - "e", - "n", - "i", - "r", - "s", - "t", - "a", - "d", - "h", - "u", - "l", - "g", - "o", - "c", - "m", - "b", - "f", - "k", - "w", - "z", - "p", - "v", - "ü", - "ä", - "ö", - "j", - ], - ), - ( - "French", - [ - "e", - "a", - "s", - "n", - "i", - "t", - "r", - "l", - "u", - "o", - "d", - "c", - "p", - "m", - "é", - "v", - "g", - "f", - "b", - "h", - "q", - "à", - "x", - "è", - "y", - "j", - ], - ), - ( - "Dutch", - [ - "e", - "n", - "a", - "i", - "r", - "t", - "o", - "d", - "s", - "l", - "g", - "h", - "v", - "m", - "u", - "k", - "c", - "p", - "b", - "w", - "j", - "z", - "f", - "y", - "x", - "ë", - ], - ), - ( - "Italian", - [ - "e", - "i", - "a", - "o", - "n", - "l", - "t", - "r", - "s", - "c", - "d", - "u", - "p", - "m", - "g", - "v", 
- "f", - "b", - "z", - "h", - "q", - "è", - "à", - "k", - "y", - "ò", - ], - ), - ( - "Polish", - [ - "a", - "i", - "o", - "e", - "n", - "r", - "z", - "w", - "s", - "c", - "t", - "k", - "y", - "d", - "p", - "m", - "u", - "l", - "j", - "ł", - "g", - "b", - "h", - "ą", - "ę", - "ó", - ], - ), - ( - "Spanish", - [ - "e", - "a", - "o", - "n", - "s", - "r", - "i", - "l", - "d", - "t", - "c", - "u", - "m", - "p", - "b", - "g", - "v", - "f", - "y", - "ó", - "h", - "q", - "í", - "j", - "z", - "á", - ], - ), - ( - "Russian", - [ - "о", - "а", - "е", - "и", - "н", - "с", - "т", - "р", - "в", - "л", - "к", - "м", - "д", - "п", - "у", - "г", - "я", - "ы", - "з", - "б", - "й", - "ь", - "ч", - "х", - "ж", - "ц", - ], - ), - ( - "Japanese", - [ - "の", - "に", - "る", - "た", - "は", - "ー", - "と", - "し", - "を", - "で", - "て", - "が", - "い", - "ン", - "れ", - "な", - "年", - "ス", - "っ", - "ル", - "か", - "ら", - "あ", - "さ", - "も", - "り", - ], - ), - ( - "Portuguese", - [ - "a", - "e", - "o", - "s", - "i", - "r", - "d", - "n", - "t", - "m", - "u", - "c", - "l", - "p", - "g", - "v", - "b", - "f", - "h", - "ã", - "q", - "é", - "ç", - "á", - "z", - "í", - ], - ), - ( - "Swedish", - [ - "e", - "a", - "n", - "r", - "t", - "s", - "i", - "l", - "d", - "o", - "m", - "k", - "g", - "v", - "h", - "f", - "u", - "p", - "ä", - "c", - "b", - "ö", - "å", - "y", - "j", - "x", - ], - ), - ( - "Chinese", - [ - "的", - "一", - "是", - "不", - "了", - "在", - "人", - "有", - "我", - "他", - "这", - "个", - "们", - "中", - "来", - "上", - "大", - "为", - "和", - "国", - "地", - "到", - "以", - "说", - "时", - "要", - "就", - "出", - "会", - ], - ), - ( - "Ukrainian", - [ - "о", - "а", - "н", - "і", - "и", - "р", - "в", - "т", - "е", - "с", - "к", - "л", - "у", - "д", - "м", - "п", - "з", - "я", - "ь", - "б", - "г", - "й", - "ч", - "х", - "ц", - "ї", - ], - ), - ( - "Norwegian", - [ - "e", - "r", - "n", - "t", - "a", - "s", - "i", - "o", - "l", - "d", - "g", - "k", - "m", - "v", - "f", - "p", - "u", - "b", - "h", - "å", - "y", - "j", - "ø", - "c", - "æ", - "w", - ], - ), - ( - "Finnish", - [ - "a", - "i", - "n", - "t", - "e", - "s", - "l", - "o", - "u", - "k", - "ä", - "m", - "r", - "v", - "j", - "h", - "p", - "y", - "d", - "ö", - "g", - "c", - "b", - "f", - "w", - "z", - ], - ), - ( - "Vietnamese", - [ - "n", - "h", - "t", - "i", - "c", - "g", - "a", - "o", - "u", - "m", - "l", - "r", - "à", - "đ", - "s", - "e", - "v", - "p", - "b", - "y", - "ư", - "d", - "á", - "k", - "ộ", - "ế", - ], - ), - ( - "Czech", - [ - "o", - "e", - "a", - "n", - "t", - "s", - "i", - "l", - "v", - "r", - "k", - "d", - "u", - "m", - "p", - "í", - "c", - "h", - "z", - "á", - "y", - "j", - "b", - "ě", - "é", - "ř", - ], - ), - ( - "Hungarian", - [ - "e", - "a", - "t", - "l", - "s", - "n", - "k", - "r", - "i", - "o", - "z", - "á", - "é", - "g", - "m", - "b", - "y", - "v", - "d", - "h", - "u", - "p", - "j", - "ö", - "f", - "c", - ], - ), - ( - "Korean", - [ - "이", - "다", - "에", - "의", - "는", - "로", - "하", - "을", - "가", - "고", - "지", - "서", - "한", - "은", - "기", - "으", - "년", - "대", - "사", - "시", - "를", - "리", - "도", - "인", - "스", - "일", - ], - ), - ( - "Indonesian", - [ - "a", - "n", - "e", - "i", - "r", - "t", - "u", - "s", - "d", - "k", - "m", - "l", - "g", - "p", - "b", - "o", - "h", - "y", - "j", - "c", - "w", - "f", - "v", - "z", - "x", - "q", - ], - ), - ( - "Turkish", - [ - "a", - "e", - "i", - "n", - "r", - "l", - "ı", - "k", - "d", - "t", - "s", - "m", - "y", - "u", - "o", - "b", - "ü", - "ş", - "v", - "g", - "z", - "h", - "c", - "p", - "ç", - "ğ", - ], - ), - ( - "Romanian", - [ - "e", - 
"i", - "a", - "r", - "n", - "t", - "u", - "l", - "o", - "c", - "s", - "d", - "p", - "m", - "ă", - "f", - "v", - "î", - "g", - "b", - "ș", - "ț", - "z", - "h", - "â", - "j", - ], - ), - ( - "Farsi", - [ - "ا", - "ی", - "ر", - "د", - "ن", - "ه", - "و", - "م", - "ت", - "ب", - "س", - "ل", - "ک", - "ش", - "ز", - "ف", - "گ", - "ع", - "خ", - "ق", - "ج", - "آ", - "پ", - "ح", - "ط", - "ص", - ], - ), - ( - "Arabic", - [ - "ا", - "ل", - "ي", - "م", - "و", - "ن", - "ر", - "ت", - "ب", - "ة", - "ع", - "د", - "س", - "ف", - "ه", - "ك", - "ق", - "أ", - "ح", - "ج", - "ش", - "ط", - "ص", - "ى", - "خ", - "إ", - ], - ), - ( - "Danish", - [ - "e", - "r", - "n", - "t", - "a", - "i", - "s", - "d", - "l", - "o", - "g", - "m", - "k", - "f", - "v", - "u", - "b", - "h", - "p", - "å", - "y", - "ø", - "æ", - "c", - "j", - "w", - ], - ), - ( - "Serbian", - [ - "а", - "и", - "о", - "е", - "н", - "р", - "с", - "у", - "т", - "к", - "ј", - "в", - "д", - "м", - "п", - "л", - "г", - "з", - "б", - "a", - "i", - "e", - "o", - "n", - "ц", - "ш", - ], - ), - ( - "Lithuanian", - [ - "i", - "a", - "s", - "o", - "r", - "e", - "t", - "n", - "u", - "k", - "m", - "l", - "p", - "v", - "d", - "j", - "g", - "ė", - "b", - "y", - "ų", - "š", - "ž", - "c", - "ą", - "į", - ], - ), - ( - "Slovene", - [ - "e", - "a", - "i", - "o", - "n", - "r", - "s", - "l", - "t", - "j", - "v", - "k", - "d", - "p", - "m", - "u", - "z", - "b", - "g", - "h", - "č", - "c", - "š", - "ž", - "f", - "y", - ], - ), - ( - "Slovak", - [ - "o", - "a", - "e", - "n", - "i", - "r", - "v", - "t", - "s", - "l", - "k", - "d", - "m", - "p", - "u", - "c", - "h", - "j", - "b", - "z", - "á", - "y", - "ý", - "í", - "č", - "é", - ], - ), - ( - "Hebrew", - [ - "י", - "ו", - "ה", - "ל", - "ר", - "ב", - "ת", - "מ", - "א", - "ש", - "נ", - "ע", - "ם", - "ד", - "ק", - "ח", - "פ", - "ס", - "כ", - "ג", - "ט", - "צ", - "ן", - "ז", - "ך", - ], - ), - ( - "Bulgarian", - [ - "а", - "и", - "о", - "е", - "н", - "т", - "р", - "с", - "в", - "л", - "к", - "д", - "п", - "м", - "з", - "г", - "я", - "ъ", - "у", - "б", - "ч", - "ц", - "й", - "ж", - "щ", - "х", - ], - ), - ( - "Croatian", - [ - "a", - "i", - "o", - "e", - "n", - "r", - "j", - "s", - "t", - "u", - "k", - "l", - "v", - "d", - "m", - "p", - "g", - "z", - "b", - "c", - "č", - "h", - "š", - "ž", - "ć", - "f", - ], - ), - ( - "Hindi", - [ - "क", - "र", - "स", - "न", - "त", - "म", - "ह", - "प", - "य", - "ल", - "व", - "ज", - "द", - "ग", - "ब", - "श", - "ट", - "अ", - "ए", - "थ", - "भ", - "ड", - "च", - "ध", - "ष", - "इ", - ], - ), - ( - "Estonian", - [ - "a", - "i", - "e", - "s", - "t", - "l", - "u", - "n", - "o", - "k", - "r", - "d", - "m", - "v", - "g", - "p", - "j", - "h", - "ä", - "b", - "õ", - "ü", - "f", - "c", - "ö", - "y", - ], - ), - ( - "Simple English", - [ - "e", - "a", - "t", - "i", - "o", - "n", - "s", - "r", - "h", - "l", - "d", - "c", - "m", - "u", - "f", - "p", - "g", - "w", - "b", - "y", - "v", - "k", - "j", - "x", - "z", - "q", - ], - ), - ( - "Thai", - [ - "า", - "น", - "ร", - "อ", - "ก", - "เ", - "ง", - "ม", - "ย", - "ล", - "ว", - "ด", - "ท", - "ส", - "ต", - "ะ", - "ป", - "บ", - "ค", - "ห", - "แ", - "จ", - "พ", - "ช", - "ข", - "ใ", - ], - ), - ( - "Greek", - [ - "α", - "τ", - "ο", - "ι", - "ε", - "ν", - "ρ", - "σ", - "κ", - "η", - "π", - "ς", - "υ", - "μ", - "λ", - "ί", - "ό", - "ά", - "γ", - "έ", - "δ", - "ή", - "ω", - "χ", - "θ", - "ύ", - ], - ), - ( - "Tamil", - [ - "க", - "த", - "ப", - "ட", - "ர", - "ம", - "ல", - "ன", - "வ", - "ற", - "ய", - "ள", - "ச", - "ந", - "இ", - "ண", - "அ", - "ஆ", - "ழ", - "ங", - "எ", - "உ", - "ஒ", - 
"ஸ", - ], - ), - ( - "Classical Chinese", - [ - "之", - "年", - "為", - "也", - "以", - "一", - "人", - "其", - "者", - "國", - "有", - "二", - "十", - "於", - "曰", - "三", - "不", - "大", - "而", - "子", - "中", - "五", - "四", - ], - ), - ( - "Kazakh", - [ - "а", - "ы", - "е", - "н", - "т", - "р", - "л", - "і", - "д", - "с", - "м", - "қ", - "к", - "о", - "б", - "и", - "у", - "ғ", - "ж", - "ң", - "з", - "ш", - "й", - "п", - "г", - "ө", - ], - ), - ] -) +FREQUENCIES: Dict[str, List[str]] = { + "English": [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "u", + "m", + "f", + "p", + "g", + "w", + "y", + "b", + "v", + "k", + "x", + "j", + "z", + "q", + ], + "German": [ + "e", + "n", + "i", + "r", + "s", + "t", + "a", + "d", + "h", + "u", + "l", + "g", + "o", + "c", + "m", + "b", + "f", + "k", + "w", + "z", + "p", + "v", + "ü", + "ä", + "ö", + "j", + ], + "French": [ + "e", + "a", + "s", + "n", + "i", + "t", + "r", + "l", + "u", + "o", + "d", + "c", + "p", + "m", + "é", + "v", + "g", + "f", + "b", + "h", + "q", + "à", + "x", + "è", + "y", + "j", + ], + "Dutch": [ + "e", + "n", + "a", + "i", + "r", + "t", + "o", + "d", + "s", + "l", + "g", + "h", + "v", + "m", + "u", + "k", + "c", + "p", + "b", + "w", + "j", + "z", + "f", + "y", + "x", + "ë", + ], + "Italian": [ + "e", + "i", + "a", + "o", + "n", + "l", + "t", + "r", + "s", + "c", + "d", + "u", + "p", + "m", + "g", + "v", + "f", + "b", + "z", + "h", + "q", + "è", + "à", + "k", + "y", + "ò", + ], + "Polish": [ + "a", + "i", + "o", + "e", + "n", + "r", + "z", + "w", + "s", + "c", + "t", + "k", + "y", + "d", + "p", + "m", + "u", + "l", + "j", + "ł", + "g", + "b", + "h", + "ą", + "ę", + "ó", + ], + "Spanish": [ + "e", + "a", + "o", + "n", + "s", + "r", + "i", + "l", + "d", + "t", + "c", + "u", + "m", + "p", + "b", + "g", + "v", + "f", + "y", + "ó", + "h", + "q", + "í", + "j", + "z", + "á", + ], + "Russian": [ + "о", + "а", + "е", + "и", + "н", + "с", + "т", + "р", + "в", + "л", + "к", + "м", + "д", + "п", + "у", + "г", + "я", + "ы", + "з", + "б", + "й", + "ь", + "ч", + "х", + "ж", + "ц", + ], + "Japanese": [ + "の", + "に", + "る", + "た", + "は", + "ー", + "と", + "し", + "を", + "で", + "て", + "が", + "い", + "ン", + "れ", + "な", + "年", + "ス", + "っ", + "ル", + "か", + "ら", + "あ", + "さ", + "も", + "り", + ], + "Portuguese": [ + "a", + "e", + "o", + "s", + "i", + "r", + "d", + "n", + "t", + "m", + "u", + "c", + "l", + "p", + "g", + "v", + "b", + "f", + "h", + "ã", + "q", + "é", + "ç", + "á", + "z", + "í", + ], + "Swedish": [ + "e", + "a", + "n", + "r", + "t", + "s", + "i", + "l", + "d", + "o", + "m", + "k", + "g", + "v", + "h", + "f", + "u", + "p", + "ä", + "c", + "b", + "ö", + "å", + "y", + "j", + "x", + ], + "Chinese": [ + "的", + "一", + "是", + "不", + "了", + "在", + "人", + "有", + "我", + "他", + "这", + "个", + "们", + "中", + "来", + "上", + "大", + "为", + "和", + "国", + "地", + "到", + "以", + "说", + "时", + "要", + "就", + "出", + "会", + ], + "Ukrainian": [ + "о", + "а", + "н", + "і", + "и", + "р", + "в", + "т", + "е", + "с", + "к", + "л", + "у", + "д", + "м", + "п", + "з", + "я", + "ь", + "б", + "г", + "й", + "ч", + "х", + "ц", + "ї", + ], + "Norwegian": [ + "e", + "r", + "n", + "t", + "a", + "s", + "i", + "o", + "l", + "d", + "g", + "k", + "m", + "v", + "f", + "p", + "u", + "b", + "h", + "å", + "y", + "j", + "ø", + "c", + "æ", + "w", + ], + "Finnish": [ + "a", + "i", + "n", + "t", + "e", + "s", + "l", + "o", + "u", + "k", + "ä", + "m", + "r", + "v", + "j", + "h", + "p", + "y", + "d", + "ö", + "g", + "c", + "b", + "f", + "w", + "z", + ], + "Vietnamese": [ + "n", 
+ "h", + "t", + "i", + "c", + "g", + "a", + "o", + "u", + "m", + "l", + "r", + "à", + "đ", + "s", + "e", + "v", + "p", + "b", + "y", + "ư", + "d", + "á", + "k", + "ộ", + "ế", + ], + "Czech": [ + "o", + "e", + "a", + "n", + "t", + "s", + "i", + "l", + "v", + "r", + "k", + "d", + "u", + "m", + "p", + "í", + "c", + "h", + "z", + "á", + "y", + "j", + "b", + "ě", + "é", + "ř", + ], + "Hungarian": [ + "e", + "a", + "t", + "l", + "s", + "n", + "k", + "r", + "i", + "o", + "z", + "á", + "é", + "g", + "m", + "b", + "y", + "v", + "d", + "h", + "u", + "p", + "j", + "ö", + "f", + "c", + ], + "Korean": [ + "이", + "다", + "에", + "의", + "는", + "로", + "하", + "을", + "가", + "고", + "지", + "서", + "한", + "은", + "기", + "으", + "년", + "대", + "사", + "시", + "를", + "리", + "도", + "인", + "스", + "일", + ], + "Indonesian": [ + "a", + "n", + "e", + "i", + "r", + "t", + "u", + "s", + "d", + "k", + "m", + "l", + "g", + "p", + "b", + "o", + "h", + "y", + "j", + "c", + "w", + "f", + "v", + "z", + "x", + "q", + ], + "Turkish": [ + "a", + "e", + "i", + "n", + "r", + "l", + "ı", + "k", + "d", + "t", + "s", + "m", + "y", + "u", + "o", + "b", + "ü", + "ş", + "v", + "g", + "z", + "h", + "c", + "p", + "ç", + "ğ", + ], + "Romanian": [ + "e", + "i", + "a", + "r", + "n", + "t", + "u", + "l", + "o", + "c", + "s", + "d", + "p", + "m", + "ă", + "f", + "v", + "î", + "g", + "b", + "ș", + "ț", + "z", + "h", + "â", + "j", + ], + "Farsi": [ + "ا", + "ی", + "ر", + "د", + "ن", + "ه", + "و", + "م", + "ت", + "ب", + "س", + "ل", + "ک", + "ش", + "ز", + "ف", + "گ", + "ع", + "خ", + "ق", + "ج", + "آ", + "پ", + "ح", + "ط", + "ص", + ], + "Arabic": [ + "ا", + "ل", + "ي", + "م", + "و", + "ن", + "ر", + "ت", + "ب", + "ة", + "ع", + "د", + "س", + "ف", + "ه", + "ك", + "ق", + "أ", + "ح", + "ج", + "ش", + "ط", + "ص", + "ى", + "خ", + "إ", + ], + "Danish": [ + "e", + "r", + "n", + "t", + "a", + "i", + "s", + "d", + "l", + "o", + "g", + "m", + "k", + "f", + "v", + "u", + "b", + "h", + "p", + "å", + "y", + "ø", + "æ", + "c", + "j", + "w", + ], + "Serbian": [ + "а", + "и", + "о", + "е", + "н", + "р", + "с", + "у", + "т", + "к", + "ј", + "в", + "д", + "м", + "п", + "л", + "г", + "з", + "б", + "a", + "i", + "e", + "o", + "n", + "ц", + "ш", + ], + "Lithuanian": [ + "i", + "a", + "s", + "o", + "r", + "e", + "t", + "n", + "u", + "k", + "m", + "l", + "p", + "v", + "d", + "j", + "g", + "ė", + "b", + "y", + "ų", + "š", + "ž", + "c", + "ą", + "į", + ], + "Slovene": [ + "e", + "a", + "i", + "o", + "n", + "r", + "s", + "l", + "t", + "j", + "v", + "k", + "d", + "p", + "m", + "u", + "z", + "b", + "g", + "h", + "č", + "c", + "š", + "ž", + "f", + "y", + ], + "Slovak": [ + "o", + "a", + "e", + "n", + "i", + "r", + "v", + "t", + "s", + "l", + "k", + "d", + "m", + "p", + "u", + "c", + "h", + "j", + "b", + "z", + "á", + "y", + "ý", + "í", + "č", + "é", + ], + "Hebrew": [ + "י", + "ו", + "ה", + "ל", + "ר", + "ב", + "ת", + "מ", + "א", + "ש", + "נ", + "ע", + "ם", + "ד", + "ק", + "ח", + "פ", + "ס", + "כ", + "ג", + "ט", + "צ", + "ן", + "ז", + "ך", + ], + "Bulgarian": [ + "а", + "и", + "о", + "е", + "н", + "т", + "р", + "с", + "в", + "л", + "к", + "д", + "п", + "м", + "з", + "г", + "я", + "ъ", + "у", + "б", + "ч", + "ц", + "й", + "ж", + "щ", + "х", + ], + "Croatian": [ + "a", + "i", + "o", + "e", + "n", + "r", + "j", + "s", + "t", + "u", + "k", + "l", + "v", + "d", + "m", + "p", + "g", + "z", + "b", + "c", + "č", + "h", + "š", + "ž", + "ć", + "f", + ], + "Hindi": [ + "क", + "र", + "स", + "न", + "त", + "म", + "ह", + "प", + "य", + "ल", + "व", + "ज", + "द", + "ग", + "ब", + "श", + "ट", + "अ", + "ए", 
+ "थ", + "भ", + "ड", + "च", + "ध", + "ष", + "इ", + ], + "Estonian": [ + "a", + "i", + "e", + "s", + "t", + "l", + "u", + "n", + "o", + "k", + "r", + "d", + "m", + "v", + "g", + "p", + "j", + "h", + "ä", + "b", + "õ", + "ü", + "f", + "c", + "ö", + "y", + ], + "Simple English": [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "m", + "u", + "f", + "p", + "g", + "w", + "b", + "y", + "v", + "k", + "j", + "x", + "z", + "q", + ], + "Thai": [ + "า", + "น", + "ร", + "อ", + "ก", + "เ", + "ง", + "ม", + "ย", + "ล", + "ว", + "ด", + "ท", + "ส", + "ต", + "ะ", + "ป", + "บ", + "ค", + "ห", + "แ", + "จ", + "พ", + "ช", + "ข", + "ใ", + ], + "Greek": [ + "α", + "τ", + "ο", + "ι", + "ε", + "ν", + "ρ", + "σ", + "κ", + "η", + "π", + "ς", + "υ", + "μ", + "λ", + "ί", + "ό", + "ά", + "γ", + "έ", + "δ", + "ή", + "ω", + "χ", + "θ", + "ύ", + ], + "Tamil": [ + "க", + "த", + "ப", + "ட", + "ர", + "ம", + "ல", + "ன", + "வ", + "ற", + "ய", + "ள", + "ச", + "ந", + "இ", + "ண", + "அ", + "ஆ", + "ழ", + "ங", + "எ", + "உ", + "ஒ", + "ஸ", + ], + "Classical Chinese": [ + "之", + "年", + "為", + "也", + "以", + "一", + "人", + "其", + "者", + "國", + "有", + "二", + "十", + "於", + "曰", + "三", + "不", + "大", + "而", + "子", + "中", + "五", + "四", + ], + "Kazakh": [ + "а", + "ы", + "е", + "н", + "т", + "р", + "л", + "і", + "д", + "с", + "м", + "қ", + "к", + "о", + "б", + "и", + "у", + "ғ", + "ж", + "ң", + "з", + "ш", + "й", + "п", + "г", + "ө", + ], +} diff --git a/contrib/python/charset-normalizer/charset_normalizer/cd.py b/contrib/python/charset-normalizer/charset_normalizer/cd.py index 8429a0eb20..8998bb545c 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/cd.py +++ b/contrib/python/charset-normalizer/charset_normalizer/cd.py @@ -1,6 +1,6 @@ import importlib from codecs import IncrementalDecoder -from collections import Counter, OrderedDict +from collections import Counter from functools import lru_cache from typing import Dict, List, Optional, Tuple @@ -26,15 +26,15 @@ def encoding_unicode_range(iana_name: str) -> List[str]: decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore - p = decoder(errors="ignore") # type: IncrementalDecoder - seen_ranges = {} # type: Dict[str, int] - character_count = 0 # type: int + p: IncrementalDecoder = decoder(errors="ignore") + seen_ranges: Dict[str, int] = {} + character_count: int = 0 for i in range(0x40, 0xFF): - chunk = p.decode(bytes([i])) # type: str + chunk: str = p.decode(bytes([i])) if chunk: - character_range = unicode_range(chunk) # type: Optional[str] + character_range: Optional[str] = unicode_range(chunk) if character_range is None: continue @@ -58,7 +58,7 @@ def unicode_range_languages(primary_range: str) -> List[str]: """ Return inferred languages used with a unicode range. """ - languages = [] # type: List[str] + languages: List[str] = [] for language, characters in FREQUENCIES.items(): for character in characters: @@ -75,8 +75,8 @@ def encoding_languages(iana_name: str) -> List[str]: Single-byte encoding language association. Some code page are heavily linked to particular language(s). This function does the correspondence. 
""" - unicode_ranges = encoding_unicode_range(iana_name) # type: List[str] - primary_range = None # type: Optional[str] + unicode_ranges: List[str] = encoding_unicode_range(iana_name) + primary_range: Optional[str] = None for specified_range in unicode_ranges: if "Latin" not in specified_range: @@ -115,8 +115,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]: """ Determine main aspects from a supported language if it contains accents and if is pure Latin. """ - target_have_accents = False # type: bool - target_pure_latin = True # type: bool + target_have_accents: bool = False + target_pure_latin: bool = True for character in FREQUENCIES[language]: if not target_have_accents and is_accentuated(character): @@ -133,7 +133,7 @@ def alphabet_languages( """ Return associated languages associated to given characters. """ - languages = [] # type: List[Tuple[str, float]] + languages: List[Tuple[str, float]] = [] source_have_accents = any(is_accentuated(character) for character in characters) @@ -147,13 +147,13 @@ def alphabet_languages( if target_have_accents is False and source_have_accents: continue - character_count = len(language_characters) # type: int + character_count: int = len(language_characters) - character_match_count = len( + character_match_count: int = len( [c for c in language_characters if c in characters] - ) # type: int + ) - ratio = character_match_count / character_count # type: float + ratio: float = character_match_count / character_count if ratio >= 0.2: languages.append((language, ratio)) @@ -174,36 +174,33 @@ def characters_popularity_compare( if language not in FREQUENCIES: raise ValueError("{} not available".format(language)) - character_approved_count = 0 # type: int + character_approved_count: int = 0 + FREQUENCIES_language_set = set(FREQUENCIES[language]) for character in ordered_characters: - if character not in FREQUENCIES[language]: + if character not in FREQUENCIES_language_set: continue - characters_before_source = FREQUENCIES[language][ + characters_before_source: List[str] = FREQUENCIES[language][ 0 : FREQUENCIES[language].index(character) - ] # type: List[str] - characters_after_source = FREQUENCIES[language][ + ] + characters_after_source: List[str] = FREQUENCIES[language][ FREQUENCIES[language].index(character) : - ] # type: List[str] - - characters_before = ordered_characters[ + ] + characters_before: List[str] = ordered_characters[ 0 : ordered_characters.index(character) - ] # type: List[str] - characters_after = ordered_characters[ + ] + characters_after: List[str] = ordered_characters[ ordered_characters.index(character) : - ] # type: List[str] - - before_match_count = [ - e in characters_before for e in characters_before_source - ].count( - True - ) # type: int - after_match_count = [ - e in characters_after for e in characters_after_source - ].count( - True - ) # type: int + ] + + before_match_count: int = len( + set(characters_before) & set(characters_before_source) + ) + + after_match_count: int = len( + set(characters_after) & set(characters_after_source) + ) if len(characters_before_source) == 0 and before_match_count <= 4: character_approved_count += 1 @@ -229,18 +226,18 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]: Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list; One containing the latin letters and the other hebrew. 
""" - layers = OrderedDict() # type: Dict[str, str] + layers: Dict[str, str] = {} for character in decoded_sequence: if character.isalpha() is False: continue - character_range = unicode_range(character) # type: Optional[str] + character_range: Optional[str] = unicode_range(character) if character_range is None: continue - layer_target_range = None # type: Optional[str] + layer_target_range: Optional[str] = None for discovered_range in layers: if ( @@ -267,7 +264,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: This function merge results previously given by the function coherence_ratio. The return type is the same as coherence_ratio. """ - per_language_ratios = OrderedDict() # type: Dict[str, List[float]] + per_language_ratios: Dict[str, List[float]] = {} for result in results: for sub_result in result: language, ratio = sub_result @@ -299,10 +296,10 @@ def coherence_ratio( A layer = Character extraction by alphabets/ranges. """ - results = [] # type: List[Tuple[str, float]] - ignore_non_latin = False # type: bool + results: List[Tuple[str, float]] = [] + ignore_non_latin: bool = False - sufficient_match_count = 0 # type: int + sufficient_match_count: int = 0 lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else [] if "Latin Based" in lg_inclusion_list: @@ -310,22 +307,22 @@ def coherence_ratio( lg_inclusion_list.remove("Latin Based") for layer in alpha_unicode_split(decoded_sequence): - sequence_frequencies = Counter(layer) # type: Counter + sequence_frequencies: Counter = Counter(layer) most_common = sequence_frequencies.most_common() - character_count = sum(o for c, o in most_common) # type: int + character_count: int = sum(o for c, o in most_common) if character_count <= TOO_SMALL_SEQUENCE: continue - popular_character_ordered = [c for c, o in most_common] # type: List[str] + popular_character_ordered: List[str] = [c for c, o in most_common] for language in lg_inclusion_list or alphabet_languages( popular_character_ordered, ignore_non_latin ): - ratio = characters_popularity_compare( + ratio: float = characters_popularity_compare( language, popular_character_ordered - ) # type: float + ) if ratio < threshold: continue diff --git a/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py b/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py index 5f912c923b..540e5e2a1a 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py +++ b/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py @@ -5,6 +5,11 @@ from os.path import abspath from platform import python_version from typing import List +try: + from unicodedata2 import unidata_version +except ImportError: + from unicodedata import unidata_version + from charset_normalizer import from_fp from charset_normalizer.models import CliDetectionResult from charset_normalizer.version import __version__ @@ -111,7 +116,7 @@ def cli_detect(argv: List[str] = None) -> int: "-t", "--threshold", action="store", - default=0.1, + default=0.2, type=float, dest="threshold", help="Define a custom maximum amount of chaos allowed in decoded content. 0. 
<= chaos <= 1.", @@ -119,8 +124,8 @@ def cli_detect(argv: List[str] = None) -> int: parser.add_argument( "--version", action="version", - version="Charset-Normalizer {} - Python {}".format( - __version__, python_version() + version="Charset-Normalizer {} - Python {} - Unicode {}".format( + __version__, python_version(), unidata_version ), help="Show version information and exit.", ) @@ -229,7 +234,7 @@ def cli_detect(argv: List[str] = None) -> int: my_file.close() continue - o_ = my_file.name.split(".") # type: List[str] + o_: List[str] = my_file.name.split(".") if args.replace is False: o_.insert(-1, best_guess.encoding) diff --git a/contrib/python/charset-normalizer/charset_normalizer/constant.py b/contrib/python/charset-normalizer/charset_normalizer/constant.py index c32f5cf2d6..ac840c461f 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/constant.py +++ b/contrib/python/charset-normalizer/charset_normalizer/constant.py @@ -1,5 +1,4 @@ from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE -from collections import OrderedDict from encodings.aliases import aliases from re import IGNORECASE, compile as re_compile from typing import Dict, List, Set, Union @@ -7,31 +6,26 @@ from typing import Dict, List, Set, Union from .assets import FREQUENCIES # Contain for each eligible encoding a list of/item bytes SIG/BOM -ENCODING_MARKS = OrderedDict( - [ - ("utf_8", BOM_UTF8), - ( - "utf_7", - [ - b"\x2b\x2f\x76\x38", - b"\x2b\x2f\x76\x39", - b"\x2b\x2f\x76\x2b", - b"\x2b\x2f\x76\x2f", - b"\x2b\x2f\x76\x38\x2d", - ], - ), - ("gb18030", b"\x84\x31\x95\x33"), - ("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]), - ("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]), - ] -) # type: Dict[str, Union[bytes, List[bytes]]] +ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = { + "utf_8": BOM_UTF8, + "utf_7": [ + b"\x2b\x2f\x76\x38", + b"\x2b\x2f\x76\x39", + b"\x2b\x2f\x76\x2b", + b"\x2b\x2f\x76\x2f", + b"\x2b\x2f\x76\x38\x2d", + ], + "gb18030": b"\x84\x31\x95\x33", + "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE], + "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE], +} -TOO_SMALL_SEQUENCE = 32 # type: int -TOO_BIG_SEQUENCE = int(10e6) # type: int +TOO_SMALL_SEQUENCE: int = 32 +TOO_BIG_SEQUENCE: int = int(10e6) -UTF8_MAXIMAL_ALLOCATION = 1112064 # type: int +UTF8_MAXIMAL_ALLOCATION: int = 1112064 -UNICODE_RANGES_COMBINED = { +UNICODE_RANGES_COMBINED: Dict[str, range] = { "Control character": range(31 + 1), "Basic Latin": range(32, 127 + 1), "Latin-1 Supplement": range(128, 255 + 1), @@ -311,10 +305,10 @@ UNICODE_RANGES_COMBINED = { "CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1), "Tags": range(917504, 917631 + 1), "Variation Selectors Supplement": range(917760, 917999 + 1), -} # type: Dict[str, range] +} -UNICODE_SECONDARY_RANGE_KEYWORD = [ +UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [ "Supplement", "Extended", "Extensions", @@ -330,25 +324,25 @@ UNICODE_SECONDARY_RANGE_KEYWORD = [ "Shapes", "Supplemental", "Tags", -] # type: List[str] +] RE_POSSIBLE_ENCODING_INDICATION = re_compile( r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)", IGNORECASE, ) -IANA_SUPPORTED = sorted( +IANA_SUPPORTED: List[str] = sorted( filter( lambda x: x.endswith("_codec") is False and x not in {"rot_13", "tactis", "mbcs"}, list(set(aliases.values())), ) -) # type: List[str] +) -IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int +IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED) # pre-computed code page that are similar using the function cp_similarity. 
-IANA_SUPPORTED_SIMILAR = { +IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = { "cp037": ["cp1026", "cp1140", "cp273", "cp500"], "cp1026": ["cp037", "cp1140", "cp273", "cp500"], "cp1125": ["cp866"], @@ -434,10 +428,10 @@ IANA_SUPPORTED_SIMILAR = { "mac_turkish": ["mac_iceland", "mac_roman"], "ptcp154": ["cp1251", "kz1048"], "tis_620": ["iso8859_11"], -} # type: Dict[str, List[str]] +} -CHARDET_CORRESPONDENCE = { +CHARDET_CORRESPONDENCE: Dict[str, str] = { "iso2022_kr": "ISO-2022-KR", "iso2022_jp": "ISO-2022-JP", "euc_kr": "EUC-KR", @@ -470,10 +464,10 @@ CHARDET_CORRESPONDENCE = { "cp1256": "windows-1256", "cp1254": "Windows-1254", "cp949": "CP949", -} # type: Dict[str, str] +} -COMMON_SAFE_ASCII_CHARACTERS = { +COMMON_SAFE_ASCII_CHARACTERS: Set[str] = { "<", ">", "=", @@ -489,15 +483,15 @@ COMMON_SAFE_ASCII_CHARACTERS = { "|", '"', "-", -} # type: Set[str] +} -KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str] -ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str] +KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"} +ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"} NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+") -LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int +LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES) # Logging LEVEL bellow DEBUG -TRACE = 5 # type: int +TRACE: int = 5 diff --git a/contrib/python/charset-normalizer/charset_normalizer/md.py b/contrib/python/charset-normalizer/charset_normalizer/md.py index f3d6505cf0..31808af84c 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/md.py +++ b/contrib/python/charset-normalizer/charset_normalizer/md.py @@ -16,6 +16,7 @@ from .utils import ( is_separator, is_symbol, is_thai, + is_unprintable, remove_accent, unicode_range, ) @@ -57,12 +58,12 @@ class MessDetectorPlugin: class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin): def __init__(self) -> None: - self._punctuation_count = 0 # type: int - self._symbol_count = 0 # type: int - self._character_count = 0 # type: int + self._punctuation_count: int = 0 + self._symbol_count: int = 0 + self._character_count: int = 0 - self._last_printable_char = None # type: Optional[str] - self._frenzy_symbol_in_word = False # type: bool + self._last_printable_char: Optional[str] = None + self._frenzy_symbol_in_word: bool = False def eligible(self, character: str) -> bool: return character.isprintable() @@ -95,17 +96,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin): if self._character_count == 0: return 0.0 - ratio_of_punctuation = ( + ratio_of_punctuation: float = ( self._punctuation_count + self._symbol_count - ) / self._character_count # type: float + ) / self._character_count return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0 class TooManyAccentuatedPlugin(MessDetectorPlugin): def __init__(self) -> None: - self._character_count = 0 # type: int - self._accentuated_count = 0 # type: int + self._character_count: int = 0 + self._accentuated_count: int = 0 def eligible(self, character: str) -> bool: return character.isalpha() @@ -124,26 +125,20 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin): def ratio(self) -> float: if self._character_count == 0: return 0.0 - ratio_of_accentuation = ( - self._accentuated_count / self._character_count - ) # type: float + ratio_of_accentuation: float = self._accentuated_count / self._character_count return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0 class UnprintablePlugin(MessDetectorPlugin): def __init__(self) -> None: - self._unprintable_count = 0 # type: int 
- self._character_count = 0 # type: int + self._unprintable_count: int = 0 + self._character_count: int = 0 def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: - if ( - character.isspace() is False # includes \n \t \r \v - and character.isprintable() is False - and character != "\x1A" # Why? Its the ASCII substitute character. - ): + if is_unprintable(character): self._unprintable_count += 1 self._character_count += 1 @@ -160,10 +155,10 @@ class UnprintablePlugin(MessDetectorPlugin): class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin): def __init__(self) -> None: - self._successive_count = 0 # type: int - self._character_count = 0 # type: int + self._successive_count: int = 0 + self._character_count: int = 0 - self._last_latin_character = None # type: Optional[str] + self._last_latin_character: Optional[str] = None def eligible(self, character: str) -> bool: return character.isalpha() and is_latin(character) @@ -197,9 +192,9 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin): class SuspiciousRange(MessDetectorPlugin): def __init__(self) -> None: - self._suspicious_successive_range_count = 0 # type: int - self._character_count = 0 # type: int - self._last_printable_seen = None # type: Optional[str] + self._suspicious_successive_range_count: int = 0 + self._character_count: int = 0 + self._last_printable_seen: Optional[str] = None def eligible(self, character: str) -> bool: return character.isprintable() @@ -219,10 +214,8 @@ class SuspiciousRange(MessDetectorPlugin): self._last_printable_seen = character return - unicode_range_a = unicode_range( - self._last_printable_seen - ) # type: Optional[str] - unicode_range_b = unicode_range(character) # type: Optional[str] + unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen) + unicode_range_b: Optional[str] = unicode_range(character) if is_suspiciously_successive_range(unicode_range_a, unicode_range_b): self._suspicious_successive_range_count += 1 @@ -239,9 +232,9 @@ class SuspiciousRange(MessDetectorPlugin): if self._character_count == 0: return 0.0 - ratio_of_suspicious_range_usage = ( + ratio_of_suspicious_range_usage: float = ( self._suspicious_successive_range_count * 2 - ) / self._character_count # type: float + ) / self._character_count if ratio_of_suspicious_range_usage < 0.1: return 0.0 @@ -251,25 +244,25 @@ class SuspiciousRange(MessDetectorPlugin): class SuperWeirdWordPlugin(MessDetectorPlugin): def __init__(self) -> None: - self._word_count = 0 # type: int - self._bad_word_count = 0 # type: int - self._foreign_long_count = 0 # type: int + self._word_count: int = 0 + self._bad_word_count: int = 0 + self._foreign_long_count: int = 0 - self._is_current_word_bad = False # type: bool - self._foreign_long_watch = False # type: bool + self._is_current_word_bad: bool = False + self._foreign_long_watch: bool = False - self._character_count = 0 # type: int - self._bad_character_count = 0 # type: int + self._character_count: int = 0 + self._bad_character_count: int = 0 - self._buffer = "" # type: str - self._buffer_accent_count = 0 # type: int + self._buffer: str = "" + self._buffer_accent_count: int = 0 def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: if character.isalpha(): - self._buffer = "".join([self._buffer, character]) + self._buffer += character if is_accentuated(character): self._buffer_accent_count += 1 if ( @@ -289,7 +282,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin): character.isspace() or 
is_punctuation(character) or is_separator(character) ) and self._buffer: self._word_count += 1 - buffer_length = len(self._buffer) # type: int + buffer_length: int = len(self._buffer) self._character_count += buffer_length @@ -346,8 +339,8 @@ class CjkInvalidStopPlugin(MessDetectorPlugin): """ def __init__(self) -> None: - self._wrong_stop_count = 0 # type: int - self._cjk_character_count = 0 # type: int + self._wrong_stop_count: int = 0 + self._cjk_character_count: int = 0 def eligible(self, character: str) -> bool: return True @@ -372,17 +365,17 @@ class CjkInvalidStopPlugin(MessDetectorPlugin): class ArchaicUpperLowerPlugin(MessDetectorPlugin): def __init__(self) -> None: - self._buf = False # type: bool + self._buf: bool = False - self._character_count_since_last_sep = 0 # type: int + self._character_count_since_last_sep: int = 0 - self._successive_upper_lower_count = 0 # type: int - self._successive_upper_lower_count_final = 0 # type: int + self._successive_upper_lower_count: int = 0 + self._successive_upper_lower_count_final: int = 0 - self._character_count = 0 # type: int + self._character_count: int = 0 - self._last_alpha_seen = None # type: Optional[str] - self._current_ascii_only = True # type: bool + self._last_alpha_seen: Optional[str] = None + self._current_ascii_only: bool = True def eligible(self, character: str) -> bool: return True @@ -446,6 +439,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin): return self._successive_upper_lower_count_final / self._character_count +@lru_cache(maxsize=1024) def is_suspiciously_successive_range( unicode_range_a: Optional[str], unicode_range_b: Optional[str] ) -> bool: @@ -524,16 +518,16 @@ def mess_ratio( Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. 
""" - detectors = [ + detectors: List[MessDetectorPlugin] = [ md_class() for md_class in MessDetectorPlugin.__subclasses__() - ] # type: List[MessDetectorPlugin] + ] - length = len(decoded_sequence) + 1 # type: int + length: int = len(decoded_sequence) + 1 - mean_mess_ratio = 0.0 # type: float + mean_mess_ratio: float = 0.0 if length < 512: - intermediary_mean_mess_ratio_calc = 32 # type: int + intermediary_mean_mess_ratio_calc: int = 32 elif length <= 1024: intermediary_mean_mess_ratio_calc = 64 else: diff --git a/contrib/python/charset-normalizer/charset_normalizer/models.py b/contrib/python/charset-normalizer/charset_normalizer/models.py index c38da31fa5..b9d71eb4fd 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/models.py +++ b/contrib/python/charset-normalizer/charset_normalizer/models.py @@ -21,21 +21,21 @@ class CharsetMatch: languages: "CoherenceMatches", decoded_payload: Optional[str] = None, ): - self._payload = payload # type: bytes + self._payload: bytes = payload - self._encoding = guessed_encoding # type: str - self._mean_mess_ratio = mean_mess_ratio # type: float - self._languages = languages # type: CoherenceMatches - self._has_sig_or_bom = has_sig_or_bom # type: bool - self._unicode_ranges = None # type: Optional[List[str]] + self._encoding: str = guessed_encoding + self._mean_mess_ratio: float = mean_mess_ratio + self._languages: CoherenceMatches = languages + self._has_sig_or_bom: bool = has_sig_or_bom + self._unicode_ranges: Optional[List[str]] = None - self._leaves = [] # type: List[CharsetMatch] - self._mean_coherence_ratio = 0.0 # type: float + self._leaves: List[CharsetMatch] = [] + self._mean_coherence_ratio: float = 0.0 - self._output_payload = None # type: Optional[bytes] - self._output_encoding = None # type: Optional[str] + self._output_payload: Optional[bytes] = None + self._output_encoding: Optional[str] = None - self._string = decoded_payload # type: Optional[str] + self._string: Optional[str] = decoded_payload def __eq__(self, other: object) -> bool: if not isinstance(other, CharsetMatch): @@ -53,8 +53,8 @@ class CharsetMatch: if not isinstance(other, CharsetMatch): raise ValueError - chaos_difference = abs(self.chaos - other.chaos) # type: float - coherence_difference = abs(self.coherence - other.coherence) # type: float + chaos_difference: float = abs(self.chaos - other.chaos) + coherence_difference: float = abs(self.coherence - other.coherence) # Bellow 1% difference --> Use Coherence if chaos_difference < 0.01 and coherence_difference > 0.02: @@ -137,7 +137,7 @@ class CharsetMatch: """ Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855. 
""" - also_known_as = [] # type: List[str] + also_known_as: List[str] = [] for u, p in aliases.items(): if self.encoding == u: also_known_as.append(p) @@ -227,9 +227,9 @@ class CharsetMatch: if self._unicode_ranges is not None: return self._unicode_ranges # list detected ranges - detected_ranges = [ + detected_ranges: List[Optional[str]] = [ unicode_range(char) for char in str(self) - ] # type: List[Optional[str]] + ] # filter and sort self._unicode_ranges = sorted(list({r for r in detected_ranges if r})) return self._unicode_ranges @@ -281,7 +281,7 @@ class CharsetMatches: """ def __init__(self, results: List[CharsetMatch] = None): - self._results = sorted(results) if results else [] # type: List[CharsetMatch] + self._results: List[CharsetMatch] = sorted(results) if results else [] def __iter__(self) -> Iterator[CharsetMatch]: yield from self._results @@ -360,17 +360,17 @@ class CliDetectionResult: unicode_path: Optional[str], is_preferred: bool, ): - self.path = path # type: str - self.unicode_path = unicode_path # type: Optional[str] - self.encoding = encoding # type: Optional[str] - self.encoding_aliases = encoding_aliases # type: List[str] - self.alternative_encodings = alternative_encodings # type: List[str] - self.language = language # type: str - self.alphabets = alphabets # type: List[str] - self.has_sig_or_bom = has_sig_or_bom # type: bool - self.chaos = chaos # type: float - self.coherence = coherence # type: float - self.is_preferred = is_preferred # type: bool + self.path: str = path + self.unicode_path: Optional[str] = unicode_path + self.encoding: Optional[str] = encoding + self.encoding_aliases: List[str] = encoding_aliases + self.alternative_encodings: List[str] = alternative_encodings + self.language: str = language + self.alphabets: List[str] = alphabets + self.has_sig_or_bom: bool = has_sig_or_bom + self.chaos: float = chaos + self.coherence: float = coherence + self.is_preferred: bool = is_preferred @property def __dict__(self) -> Dict[str, Any]: # type: ignore diff --git a/contrib/python/charset-normalizer/charset_normalizer/utils.py b/contrib/python/charset-normalizer/charset_normalizer/utils.py index dcb14dfee1..17eaee0408 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/utils.py +++ b/contrib/python/charset-normalizer/charset_normalizer/utils.py @@ -1,4 +1,6 @@ try: + # WARNING: unicodedata2 support is going to be removed in 3.0 + # Python is quickly catching up. 
import unicodedata2 as unicodedata except ImportError: import unicodedata # type: ignore[no-redef] @@ -9,7 +11,7 @@ from codecs import IncrementalDecoder from encodings.aliases import aliases from functools import lru_cache from re import findall -from typing import List, Optional, Set, Tuple, Union +from typing import Generator, List, Optional, Set, Tuple, Union from _multibytecodec import MultibyteIncrementalDecoder # type: ignore @@ -26,7 +28,7 @@ from .constant import ( @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_accentuated(character: str) -> bool: try: - description = unicodedata.name(character) # type: str + description: str = unicodedata.name(character) except ValueError: return False return ( @@ -41,11 +43,11 @@ def is_accentuated(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def remove_accent(character: str) -> str: - decomposed = unicodedata.decomposition(character) # type: str + decomposed: str = unicodedata.decomposition(character) if not decomposed: return character - codes = decomposed.split(" ") # type: List[str] + codes: List[str] = decomposed.split(" ") return chr(int(codes[0], 16)) @@ -55,7 +57,7 @@ def unicode_range(character: str) -> Optional[str]: """ Retrieve the Unicode range official name from a single character. """ - character_ord = ord(character) # type: int + character_ord: int = ord(character) for range_name, ord_range in UNICODE_RANGES_COMBINED.items(): if character_ord in ord_range: @@ -67,12 +69,13 @@ def unicode_range(character: str) -> Optional[str]: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_latin(character: str) -> bool: try: - description = unicodedata.name(character) # type: str + description: str = unicodedata.name(character) except ValueError: return False return "LATIN" in description +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_ascii(character: str) -> bool: try: character.encode("ascii") @@ -83,12 +86,12 @@ def is_ascii(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_punctuation(character: str) -> bool: - character_category = unicodedata.category(character) # type: str + character_category: str = unicodedata.category(character) if "P" in character_category: return True - character_range = unicode_range(character) # type: Optional[str] + character_range: Optional[str] = unicode_range(character) if character_range is None: return False @@ -98,12 +101,12 @@ def is_punctuation(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_symbol(character: str) -> bool: - character_category = unicodedata.category(character) # type: str + character_category: str = unicodedata.category(character) if "S" in character_category or "N" in character_category: return True - character_range = unicode_range(character) # type: Optional[str] + character_range: Optional[str] = unicode_range(character) if character_range is None: return False @@ -113,7 +116,7 @@ def is_symbol(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_emoticon(character: str) -> bool: - character_range = unicode_range(character) # type: Optional[str] + character_range: Optional[str] = unicode_range(character) if character_range is None: return False @@ -126,7 +129,7 @@ def is_separator(character: str) -> bool: if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}: return True - character_category = unicodedata.category(character) # type: str + character_category: str = unicodedata.category(character) return "Z" in character_category @@ -137,7 +140,7 @@ def 
is_case_variable(character: str) -> bool: def is_private_use_only(character: str) -> bool: - character_category = unicodedata.category(character) # type: str + character_category: str = unicodedata.category(character) return character_category == "Co" @@ -197,6 +200,17 @@ def is_unicode_range_secondary(range_name: str) -> bool: return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD) +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_unprintable(character: str) -> bool: + return ( + character.isspace() is False # includes \n \t \r \v + and character.isprintable() is False + and character != "\x1A" # Why? Its the ASCII substitute character. + and character != b"\xEF\xBB\xBF".decode("utf_8") # bug discovered in Python, + # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space. + ) + + def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]: """ Extract using ASCII-only decoder any specified encoding in the first n-bytes. @@ -204,12 +218,12 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional if not isinstance(sequence, bytes): raise TypeError - seq_len = len(sequence) # type: int + seq_len: int = len(sequence) - results = findall( + results: List[str] = findall( RE_POSSIBLE_ENCODING_INDICATION, sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"), - ) # type: List[str] + ) if len(results) == 0: return None @@ -253,7 +267,7 @@ def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]: """ for iana_encoding in ENCODING_MARKS: - marks = ENCODING_MARKS[iana_encoding] # type: Union[bytes, List[bytes]] + marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding] if isinstance(marks, bytes): marks = [marks] @@ -283,10 +297,10 @@ def iana_name(cp_name: str, strict: bool = True) -> str: def range_scan(decoded_sequence: str) -> List[str]: - ranges = set() # type: Set[str] + ranges: Set[str] = set() for character in decoded_sequence: - character_range = unicode_range(character) # type: Optional[str] + character_range: Optional[str] = unicode_range(character) if character_range is None: continue @@ -304,13 +318,13 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float: decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore - id_a = decoder_a(errors="ignore") # type: IncrementalDecoder - id_b = decoder_b(errors="ignore") # type: IncrementalDecoder + id_a: IncrementalDecoder = decoder_a(errors="ignore") + id_b: IncrementalDecoder = decoder_b(errors="ignore") - character_match_count = 0 # type: int + character_match_count: int = 0 for i in range(255): - to_be_decoded = bytes([i]) # type: bytes + to_be_decoded: bytes = bytes([i]) if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded): character_match_count += 1 @@ -340,3 +354,61 @@ def set_logging_handler( handler = logging.StreamHandler() handler.setFormatter(logging.Formatter(format_string)) logger.addHandler(handler) + + +def cut_sequence_chunks( + sequences: bytes, + encoding_iana: str, + offsets: range, + chunk_size: int, + bom_or_sig_available: bool, + strip_sig_or_bom: bool, + sig_payload: bytes, + is_multi_byte_decoder: bool, + decoded_payload: Optional[str] = None, +) -> Generator[str, None, None]: + + if decoded_payload and is_multi_byte_decoder is False: + for i in offsets: + chunk = decoded_payload[i : i 
+ chunk_size] + if not chunk: + break + yield chunk + else: + for i in offsets: + chunk_end = i + chunk_size + if chunk_end > len(sequences) + 8: + continue + + cut_sequence = sequences[i : i + chunk_size] + + if bom_or_sig_available and strip_sig_or_bom is False: + cut_sequence = sig_payload + cut_sequence + + chunk = cut_sequence.decode( + encoding_iana, + errors="ignore" if is_multi_byte_decoder else "strict", + ) + + # multi-byte bad cutting detector and adjustment + # not the cleanest way to perform that fix but clever enough for now. + if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80: + + chunk_partial_size_chk: int = min(chunk_size, 16) + + if ( + decoded_payload + and chunk[:chunk_partial_size_chk] not in decoded_payload + ): + for j in range(i, i - 4, -1): + cut_sequence = sequences[j:chunk_end] + + if bom_or_sig_available and strip_sig_or_bom is False: + cut_sequence = sig_payload + cut_sequence + + chunk = cut_sequence.decode(encoding_iana, errors="ignore") + + if chunk[:chunk_partial_size_chk] in decoded_payload: + break + + yield chunk diff --git a/contrib/python/charset-normalizer/charset_normalizer/version.py b/contrib/python/charset-normalizer/charset_normalizer/version.py index 77cfff25d6..af7e749e82 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/version.py +++ b/contrib/python/charset-normalizer/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.0.12" +__version__ = "2.1.0" VERSION = __version__.split(".") |
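Most of this patch reworks `from_bytes` in `charset_normalizer/api.py`: the `# type:` comments become inline annotations and the chunk-cutting loop moves into the new `cut_sequence_chunks` helper in `utils.py`. For readers unfamiliar with the library, a minimal usage sketch of that entry point against 2.1.0 follows. The sample bytes are hypothetical, and the `best()` accessor reflects the upstream 2.x convenience API rather than anything shown in this diff.

```python
from charset_normalizer import from_bytes

# Hypothetical payload: a short cp1252-encoded French sentence.
payload = "Bien sûr, ceci n'est qu'un exemple.".encode("cp1252")

results = from_bytes(payload)   # CharsetMatches, ordered by fitness
best = results.best()           # best-ranked CharsetMatch, or None if nothing fit

if best is not None:
    print(best.encoding)        # e.g. "cp1252" (or a similar single-byte code page)
    print(str(best))            # payload decoded with the guessed encoding
```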