author     arcadia-devtools <arcadia-devtools@yandex-team.ru>  2022-06-20 18:39:30 +0300
committer  arcadia-devtools <arcadia-devtools@yandex-team.ru>  2022-06-20 18:39:30 +0300
commit     798d25a291578fceb2223382b508fba1723fef4a (patch)
tree       227b9a24000c40ae3354f4321ff9fe19143423f7
parent     d934aec555f13784eabe2d7682211050918e6cf5 (diff)
download   ydb-798d25a291578fceb2223382b508fba1723fef4a.tar.gz

intermediate changes
ref: ac842eacda5e614f20cf9d3985d932732f92beab
-rw-r--r--  contrib/python/charset-normalizer/.dist-info/METADATA                    14
-rw-r--r--  contrib/python/charset-normalizer/README.md                               8
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/api.py             144
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py 2362
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/cd.py               97
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py   13
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/constant.py         72
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/md.py              106
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/models.py           56
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/utils.py           118
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/version.py           2
11 files changed, 1451 insertions(+), 1541 deletions(-)
diff --git a/contrib/python/charset-normalizer/.dist-info/METADATA b/contrib/python/charset-normalizer/.dist-info/METADATA
index 1b04ed4c4e..0ba0f9d513 100644
--- a/contrib/python/charset-normalizer/.dist-info/METADATA
+++ b/contrib/python/charset-normalizer/.dist-info/METADATA
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: charset-normalizer
-Version: 2.0.12
+Version: 2.1.0
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
Home-page: https://github.com/ousret/charset_normalizer
Author: Ahmed TAHRI @Ousret
@@ -10,13 +10,13 @@ Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
Keywords: encoding,i18n,txt,text,charset,charset-detector,normalization,unicode,chardet
Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
@@ -27,7 +27,7 @@ Classifier: Topic :: Text Processing :: Linguistic
Classifier: Topic :: Utilities
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Typing :: Typed
-Requires-Python: >=3.5.0
+Requires-Python: >=3.6.0
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: unicode_backport
@@ -87,13 +87,13 @@ This package offer better performance than its counterpart Chardet. Here are som
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
| ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 92 % | 220 ms | 5 file/sec |
-| charset-normalizer | **98 %** | **40 ms** | 25 file/sec |
+| [chardet](https://github.com/chardet/chardet) | 92 % | 200 ms | 5 file/sec |
+| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
| ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 1115 ms | 300 ms | 27 ms |
-| charset-normalizer | 460 ms | 240 ms | 18 ms |
+| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
+| charset-normalizer | 400 ms | 200 ms | 15 ms |
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
diff --git a/contrib/python/charset-normalizer/README.md b/contrib/python/charset-normalizer/README.md
index b4c957a63c..904b60ea22 100644
--- a/contrib/python/charset-normalizer/README.md
+++ b/contrib/python/charset-normalizer/README.md
@@ -51,13 +51,13 @@ This package offer better performance than its counterpart Chardet. Here are som
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
| ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 92 % | 220 ms | 5 file/sec |
-| charset-normalizer | **98 %** | **40 ms** | 25 file/sec |
+| [chardet](https://github.com/chardet/chardet) | 92 % | 200 ms | 5 file/sec |
+| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
| ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 1115 ms | 300 ms | 27 ms |
-| charset-normalizer | 460 ms | 240 ms | 18 ms |
+| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
+| charset-normalizer | 400 ms | 200 ms | 15 ms |
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
diff --git a/contrib/python/charset-normalizer/charset_normalizer/api.py b/contrib/python/charset-normalizer/charset_normalizer/api.py
index bdc8ed9893..ae08361bb4 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/api.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/api.py
@@ -1,12 +1,8 @@
import logging
+from os import PathLike
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set
-try:
- from os import PathLike
-except ImportError: # pragma: no cover
- PathLike = str # type: ignore
-
from .cd import (
coherence_ratio,
encoding_languages,
@@ -18,6 +14,7 @@ from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
+ cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
@@ -70,11 +67,11 @@ def from_bytes(
)
if explain:
- previous_logger_level = logger.level # type: int
+ previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)
- length = len(sequences) # type: int
+ length: int = len(sequences)
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
@@ -119,8 +116,8 @@ def from_bytes(
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)
- is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE # type: bool
- is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool
+ is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
+ is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
if is_too_small_sequence:
logger.log(
@@ -137,11 +134,11 @@ def from_bytes(
),
)
- prioritized_encodings = [] # type: List[str]
+ prioritized_encodings: List[str] = []
- specified_encoding = (
+ specified_encoding: Optional[str] = (
any_specified_encoding(sequences) if preemptive_behaviour else None
- ) # type: Optional[str]
+ )
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
@@ -151,15 +148,15 @@ def from_bytes(
specified_encoding,
)
- tested = set() # type: Set[str]
- tested_but_hard_failure = [] # type: List[str]
- tested_but_soft_failure = [] # type: List[str]
+ tested: Set[str] = set()
+ tested_but_hard_failure: List[str] = []
+ tested_but_soft_failure: List[str] = []
- fallback_ascii = None # type: Optional[CharsetMatch]
- fallback_u8 = None # type: Optional[CharsetMatch]
- fallback_specified = None # type: Optional[CharsetMatch]
+ fallback_ascii: Optional[CharsetMatch] = None
+ fallback_u8: Optional[CharsetMatch] = None
+ fallback_specified: Optional[CharsetMatch] = None
- results = CharsetMatches() # type: CharsetMatches
+ results: CharsetMatches = CharsetMatches()
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
@@ -190,11 +187,11 @@ def from_bytes(
tested.add(encoding_iana)
- decoded_payload = None # type: Optional[str]
- bom_or_sig_available = sig_encoding == encoding_iana # type: bool
- strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
+ decoded_payload: Optional[str] = None
+ bom_or_sig_available: bool = sig_encoding == encoding_iana
+ strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
- ) # type: bool
+ )
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
@@ -205,7 +202,7 @@ def from_bytes(
continue
try:
- is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
+ is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
@@ -240,7 +237,7 @@ def from_bytes(
tested_but_hard_failure.append(encoding_iana)
continue
- similar_soft_failure_test = False # type: bool
+ similar_soft_failure_test: bool = False
for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
@@ -262,11 +259,11 @@ def from_bytes(
int(length / steps),
)
- multi_byte_bonus = (
+ multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
- ) # type: bool
+ )
if multi_byte_bonus:
logger.log(
@@ -276,72 +273,47 @@ def from_bytes(
encoding_iana,
)
- max_chunk_gave_up = int(len(r_) / 4) # type: int
+ max_chunk_gave_up: int = int(len(r_) / 4)
max_chunk_gave_up = max(max_chunk_gave_up, 2)
- early_stop_count = 0 # type: int
+ early_stop_count: int = 0
lazy_str_hard_failure = False
- md_chunks = [] # type: List[str]
+ md_chunks: List[str] = []
md_ratios = []
- for i in r_:
- if i + chunk_size > length + 8:
- continue
-
- cut_sequence = sequences[i : i + chunk_size]
-
- if bom_or_sig_available and strip_sig_or_bom is False:
- cut_sequence = sig_payload + cut_sequence
-
- try:
- chunk = cut_sequence.decode(
- encoding_iana,
- errors="ignore" if is_multi_byte_decoder else "strict",
- ) # type: str
- except UnicodeDecodeError as e: # Lazy str loading may have missed something there
- logger.log(
- TRACE,
- "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
- encoding_iana,
- str(e),
- )
- early_stop_count = max_chunk_gave_up
- lazy_str_hard_failure = True
- break
+ try:
+ for chunk in cut_sequence_chunks(
+ sequences,
+ encoding_iana,
+ r_,
+ chunk_size,
+ bom_or_sig_available,
+ strip_sig_or_bom,
+ sig_payload,
+ is_multi_byte_decoder,
+ decoded_payload,
+ ):
+ md_chunks.append(chunk)
- # multi-byte bad cutting detector and adjustment
- # not the cleanest way to perform that fix but clever enough for now.
- if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+ md_ratios.append(mess_ratio(chunk, threshold))
- chunk_partial_size_chk = min(chunk_size, 16) # type: int
+ if md_ratios[-1] >= threshold:
+ early_stop_count += 1
- if (
- decoded_payload
- and chunk[:chunk_partial_size_chk] not in decoded_payload
+ if (early_stop_count >= max_chunk_gave_up) or (
+ bom_or_sig_available and strip_sig_or_bom is False
):
- for j in range(i, i - 4, -1):
- cut_sequence = sequences[j : i + chunk_size]
-
- if bom_or_sig_available and strip_sig_or_bom is False:
- cut_sequence = sig_payload + cut_sequence
-
- chunk = cut_sequence.decode(encoding_iana, errors="ignore")
-
- if chunk[:chunk_partial_size_chk] in decoded_payload:
- break
-
- md_chunks.append(chunk)
-
- md_ratios.append(mess_ratio(chunk, threshold))
-
- if md_ratios[-1] >= threshold:
- early_stop_count += 1
-
- if (early_stop_count >= max_chunk_gave_up) or (
- bom_or_sig_available and strip_sig_or_bom is False
- ):
- break
+ break
+ except UnicodeDecodeError as e: # Lazy str loading may have missed something there
+ logger.log(
+ TRACE,
+ "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+ encoding_iana,
+ str(e),
+ )
+ early_stop_count = max_chunk_gave_up
+ lazy_str_hard_failure = True
# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
@@ -362,9 +334,7 @@ def from_bytes(
tested_but_hard_failure.append(encoding_iana)
continue
- mean_mess_ratio = (
- sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
- ) # type: float
+ mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
@@ -399,7 +369,7 @@ def from_bytes(
)
if not is_multi_byte_decoder:
- target_languages = encoding_languages(encoding_iana) # type: List[str]
+ target_languages: List[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
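
Note: the api.py hunk above replaces the inline chunk-decoding loop with a call to a new `cut_sequence_chunks` generator imported from `charset_normalizer/utils.py`. That helper's body is not shown in this patch view; the sketch below is a hedged reconstruction inferred from the removed loop, for orientation only — the actual implementation may differ in details.

```python
from typing import Generator, Optional


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    """Yield decoded windows of the payload; UnicodeDecodeError propagates
    to the caller, which treats it as a hard failure for this encoding."""
    for i in offsets:
        chunk_end = i + chunk_size
        if chunk_end > len(sequences) + 8:
            continue

        cut_sequence = sequences[i:chunk_end]

        # Re-prepend the BOM/SIG when one was detected but intentionally kept.
        if bom_or_sig_available and strip_sig_or_bom is False:
            cut_sequence = sig_payload + cut_sequence

        chunk = cut_sequence.decode(
            encoding_iana,
            errors="ignore" if is_multi_byte_decoder else "strict",
        )

        # Multi-byte bad-cutting detector and adjustment, as in the removed loop:
        # if the window starts mid-character, slide the start back a few bytes.
        if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
            chunk_partial_size_chk = min(chunk_size, 16)
            if decoded_payload and chunk[:chunk_partial_size_chk] not in decoded_payload:
                for j in range(i, i - 4, -1):
                    cut_sequence = sequences[j:chunk_end]
                    if bom_or_sig_available and strip_sig_or_bom is False:
                        cut_sequence = sig_payload + cut_sequence
                    chunk = cut_sequence.decode(encoding_iana, errors="ignore")
                    if chunk[:chunk_partial_size_chk] in decoded_payload:
                        break

        yield chunk
```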
diff --git a/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py b/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py
index b2e56ff398..b9a3700f79 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py
@@ -1,1244 +1,1122 @@
# -*- coding: utf_8 -*-
-from collections import OrderedDict
+from typing import Dict, List
-FREQUENCIES = OrderedDict(
- [
- (
- "English",
- [
- "e",
- "a",
- "t",
- "i",
- "o",
- "n",
- "s",
- "r",
- "h",
- "l",
- "d",
- "c",
- "u",
- "m",
- "f",
- "p",
- "g",
- "w",
- "y",
- "b",
- "v",
- "k",
- "x",
- "j",
- "z",
- "q",
- ],
- ),
- (
- "German",
- [
- "e",
- "n",
- "i",
- "r",
- "s",
- "t",
- "a",
- "d",
- "h",
- "u",
- "l",
- "g",
- "o",
- "c",
- "m",
- "b",
- "f",
- "k",
- "w",
- "z",
- "p",
- "v",
- "ü",
- "ä",
- "ö",
- "j",
- ],
- ),
- (
- "French",
- [
- "e",
- "a",
- "s",
- "n",
- "i",
- "t",
- "r",
- "l",
- "u",
- "o",
- "d",
- "c",
- "p",
- "m",
- "é",
- "v",
- "g",
- "f",
- "b",
- "h",
- "q",
- "à",
- "x",
- "è",
- "y",
- "j",
- ],
- ),
- (
- "Dutch",
- [
- "e",
- "n",
- "a",
- "i",
- "r",
- "t",
- "o",
- "d",
- "s",
- "l",
- "g",
- "h",
- "v",
- "m",
- "u",
- "k",
- "c",
- "p",
- "b",
- "w",
- "j",
- "z",
- "f",
- "y",
- "x",
- "ë",
- ],
- ),
- (
- "Italian",
- [
- "e",
- "i",
- "a",
- "o",
- "n",
- "l",
- "t",
- "r",
- "s",
- "c",
- "d",
- "u",
- "p",
- "m",
- "g",
- "v",
- "f",
- "b",
- "z",
- "h",
- "q",
- "è",
- "à",
- "k",
- "y",
- "ò",
- ],
- ),
- (
- "Polish",
- [
- "a",
- "i",
- "o",
- "e",
- "n",
- "r",
- "z",
- "w",
- "s",
- "c",
- "t",
- "k",
- "y",
- "d",
- "p",
- "m",
- "u",
- "l",
- "j",
- "ł",
- "g",
- "b",
- "h",
- "ą",
- "ę",
- "ó",
- ],
- ),
- (
- "Spanish",
- [
- "e",
- "a",
- "o",
- "n",
- "s",
- "r",
- "i",
- "l",
- "d",
- "t",
- "c",
- "u",
- "m",
- "p",
- "b",
- "g",
- "v",
- "f",
- "y",
- "ó",
- "h",
- "q",
- "í",
- "j",
- "z",
- "á",
- ],
- ),
- (
- "Russian",
- [
- "о",
- "а",
- "е",
- "и",
- "н",
- "с",
- "т",
- "р",
- "в",
- "л",
- "к",
- "м",
- "д",
- "п",
- "у",
- "г",
- "я",
- "ы",
- "з",
- "б",
- "й",
- "ь",
- "ч",
- "х",
- "ж",
- "ц",
- ],
- ),
- (
- "Japanese",
- [
- "の",
- "に",
- "る",
- "た",
- "は",
- "ー",
- "と",
- "し",
- "を",
- "で",
- "て",
- "が",
- "い",
- "ン",
- "れ",
- "な",
- "年",
- "ス",
- "っ",
- "ル",
- "か",
- "ら",
- "あ",
- "さ",
- "も",
- "り",
- ],
- ),
- (
- "Portuguese",
- [
- "a",
- "e",
- "o",
- "s",
- "i",
- "r",
- "d",
- "n",
- "t",
- "m",
- "u",
- "c",
- "l",
- "p",
- "g",
- "v",
- "b",
- "f",
- "h",
- "ã",
- "q",
- "é",
- "ç",
- "á",
- "z",
- "í",
- ],
- ),
- (
- "Swedish",
- [
- "e",
- "a",
- "n",
- "r",
- "t",
- "s",
- "i",
- "l",
- "d",
- "o",
- "m",
- "k",
- "g",
- "v",
- "h",
- "f",
- "u",
- "p",
- "ä",
- "c",
- "b",
- "ö",
- "å",
- "y",
- "j",
- "x",
- ],
- ),
- (
- "Chinese",
- [
- "的",
- "一",
- "是",
- "不",
- "了",
- "在",
- "人",
- "有",
- "我",
- "他",
- "这",
- "个",
- "们",
- "中",
- "来",
- "上",
- "大",
- "为",
- "和",
- "国",
- "地",
- "到",
- "以",
- "说",
- "时",
- "要",
- "就",
- "出",
- "会",
- ],
- ),
- (
- "Ukrainian",
- [
- "о",
- "а",
- "н",
- "і",
- "и",
- "р",
- "в",
- "т",
- "е",
- "с",
- "к",
- "л",
- "у",
- "д",
- "м",
- "п",
- "з",
- "я",
- "ь",
- "б",
- "г",
- "й",
- "ч",
- "х",
- "ц",
- "ї",
- ],
- ),
- (
- "Norwegian",
- [
- "e",
- "r",
- "n",
- "t",
- "a",
- "s",
- "i",
- "o",
- "l",
- "d",
- "g",
- "k",
- "m",
- "v",
- "f",
- "p",
- "u",
- "b",
- "h",
- "å",
- "y",
- "j",
- "ø",
- "c",
- "æ",
- "w",
- ],
- ),
- (
- "Finnish",
- [
- "a",
- "i",
- "n",
- "t",
- "e",
- "s",
- "l",
- "o",
- "u",
- "k",
- "ä",
- "m",
- "r",
- "v",
- "j",
- "h",
- "p",
- "y",
- "d",
- "ö",
- "g",
- "c",
- "b",
- "f",
- "w",
- "z",
- ],
- ),
- (
- "Vietnamese",
- [
- "n",
- "h",
- "t",
- "i",
- "c",
- "g",
- "a",
- "o",
- "u",
- "m",
- "l",
- "r",
- "à",
- "đ",
- "s",
- "e",
- "v",
- "p",
- "b",
- "y",
- "ư",
- "d",
- "á",
- "k",
- "ộ",
- "ế",
- ],
- ),
- (
- "Czech",
- [
- "o",
- "e",
- "a",
- "n",
- "t",
- "s",
- "i",
- "l",
- "v",
- "r",
- "k",
- "d",
- "u",
- "m",
- "p",
- "í",
- "c",
- "h",
- "z",
- "á",
- "y",
- "j",
- "b",
- "ě",
- "é",
- "ř",
- ],
- ),
- (
- "Hungarian",
- [
- "e",
- "a",
- "t",
- "l",
- "s",
- "n",
- "k",
- "r",
- "i",
- "o",
- "z",
- "á",
- "é",
- "g",
- "m",
- "b",
- "y",
- "v",
- "d",
- "h",
- "u",
- "p",
- "j",
- "ö",
- "f",
- "c",
- ],
- ),
- (
- "Korean",
- [
- "이",
- "다",
- "에",
- "의",
- "는",
- "로",
- "하",
- "을",
- "가",
- "고",
- "지",
- "서",
- "한",
- "은",
- "기",
- "으",
- "년",
- "대",
- "사",
- "시",
- "를",
- "리",
- "도",
- "인",
- "스",
- "일",
- ],
- ),
- (
- "Indonesian",
- [
- "a",
- "n",
- "e",
- "i",
- "r",
- "t",
- "u",
- "s",
- "d",
- "k",
- "m",
- "l",
- "g",
- "p",
- "b",
- "o",
- "h",
- "y",
- "j",
- "c",
- "w",
- "f",
- "v",
- "z",
- "x",
- "q",
- ],
- ),
- (
- "Turkish",
- [
- "a",
- "e",
- "i",
- "n",
- "r",
- "l",
- "ı",
- "k",
- "d",
- "t",
- "s",
- "m",
- "y",
- "u",
- "o",
- "b",
- "ü",
- "ş",
- "v",
- "g",
- "z",
- "h",
- "c",
- "p",
- "ç",
- "ğ",
- ],
- ),
- (
- "Romanian",
- [
- "e",
- "i",
- "a",
- "r",
- "n",
- "t",
- "u",
- "l",
- "o",
- "c",
- "s",
- "d",
- "p",
- "m",
- "ă",
- "f",
- "v",
- "î",
- "g",
- "b",
- "ș",
- "ț",
- "z",
- "h",
- "â",
- "j",
- ],
- ),
- (
- "Farsi",
- [
- "ا",
- "ی",
- "ر",
- "د",
- "ن",
- "ه",
- "و",
- "م",
- "ت",
- "ب",
- "س",
- "ل",
- "ک",
- "ش",
- "ز",
- "ف",
- "گ",
- "ع",
- "خ",
- "ق",
- "ج",
- "آ",
- "پ",
- "ح",
- "ط",
- "ص",
- ],
- ),
- (
- "Arabic",
- [
- "ا",
- "ل",
- "ي",
- "م",
- "و",
- "ن",
- "ر",
- "ت",
- "ب",
- "ة",
- "ع",
- "د",
- "س",
- "ف",
- "ه",
- "ك",
- "ق",
- "أ",
- "ح",
- "ج",
- "ش",
- "ط",
- "ص",
- "ى",
- "خ",
- "إ",
- ],
- ),
- (
- "Danish",
- [
- "e",
- "r",
- "n",
- "t",
- "a",
- "i",
- "s",
- "d",
- "l",
- "o",
- "g",
- "m",
- "k",
- "f",
- "v",
- "u",
- "b",
- "h",
- "p",
- "å",
- "y",
- "ø",
- "æ",
- "c",
- "j",
- "w",
- ],
- ),
- (
- "Serbian",
- [
- "а",
- "и",
- "о",
- "е",
- "н",
- "р",
- "с",
- "у",
- "т",
- "к",
- "ј",
- "в",
- "д",
- "м",
- "п",
- "л",
- "г",
- "з",
- "б",
- "a",
- "i",
- "e",
- "o",
- "n",
- "ц",
- "ш",
- ],
- ),
- (
- "Lithuanian",
- [
- "i",
- "a",
- "s",
- "o",
- "r",
- "e",
- "t",
- "n",
- "u",
- "k",
- "m",
- "l",
- "p",
- "v",
- "d",
- "j",
- "g",
- "ė",
- "b",
- "y",
- "ų",
- "š",
- "ž",
- "c",
- "ą",
- "į",
- ],
- ),
- (
- "Slovene",
- [
- "e",
- "a",
- "i",
- "o",
- "n",
- "r",
- "s",
- "l",
- "t",
- "j",
- "v",
- "k",
- "d",
- "p",
- "m",
- "u",
- "z",
- "b",
- "g",
- "h",
- "č",
- "c",
- "š",
- "ž",
- "f",
- "y",
- ],
- ),
- (
- "Slovak",
- [
- "o",
- "a",
- "e",
- "n",
- "i",
- "r",
- "v",
- "t",
- "s",
- "l",
- "k",
- "d",
- "m",
- "p",
- "u",
- "c",
- "h",
- "j",
- "b",
- "z",
- "á",
- "y",
- "ý",
- "í",
- "č",
- "é",
- ],
- ),
- (
- "Hebrew",
- [
- "י",
- "ו",
- "ה",
- "ל",
- "ר",
- "ב",
- "ת",
- "מ",
- "א",
- "ש",
- "נ",
- "ע",
- "ם",
- "ד",
- "ק",
- "ח",
- "פ",
- "ס",
- "כ",
- "ג",
- "ט",
- "צ",
- "ן",
- "ז",
- "ך",
- ],
- ),
- (
- "Bulgarian",
- [
- "а",
- "и",
- "о",
- "е",
- "н",
- "т",
- "р",
- "с",
- "в",
- "л",
- "к",
- "д",
- "п",
- "м",
- "з",
- "г",
- "я",
- "ъ",
- "у",
- "б",
- "ч",
- "ц",
- "й",
- "ж",
- "щ",
- "х",
- ],
- ),
- (
- "Croatian",
- [
- "a",
- "i",
- "o",
- "e",
- "n",
- "r",
- "j",
- "s",
- "t",
- "u",
- "k",
- "l",
- "v",
- "d",
- "m",
- "p",
- "g",
- "z",
- "b",
- "c",
- "č",
- "h",
- "š",
- "ž",
- "ć",
- "f",
- ],
- ),
- (
- "Hindi",
- [
- "क",
- "र",
- "स",
- "न",
- "त",
- "म",
- "ह",
- "प",
- "य",
- "ल",
- "व",
- "ज",
- "द",
- "ग",
- "ब",
- "श",
- "ट",
- "अ",
- "ए",
- "थ",
- "भ",
- "ड",
- "च",
- "ध",
- "ष",
- "इ",
- ],
- ),
- (
- "Estonian",
- [
- "a",
- "i",
- "e",
- "s",
- "t",
- "l",
- "u",
- "n",
- "o",
- "k",
- "r",
- "d",
- "m",
- "v",
- "g",
- "p",
- "j",
- "h",
- "ä",
- "b",
- "õ",
- "ü",
- "f",
- "c",
- "ö",
- "y",
- ],
- ),
- (
- "Simple English",
- [
- "e",
- "a",
- "t",
- "i",
- "o",
- "n",
- "s",
- "r",
- "h",
- "l",
- "d",
- "c",
- "m",
- "u",
- "f",
- "p",
- "g",
- "w",
- "b",
- "y",
- "v",
- "k",
- "j",
- "x",
- "z",
- "q",
- ],
- ),
- (
- "Thai",
- [
- "า",
- "น",
- "ร",
- "อ",
- "ก",
- "เ",
- "ง",
- "ม",
- "ย",
- "ล",
- "ว",
- "ด",
- "ท",
- "ส",
- "ต",
- "ะ",
- "ป",
- "บ",
- "ค",
- "ห",
- "แ",
- "จ",
- "พ",
- "ช",
- "ข",
- "ใ",
- ],
- ),
- (
- "Greek",
- [
- "α",
- "τ",
- "ο",
- "ι",
- "ε",
- "ν",
- "ρ",
- "σ",
- "κ",
- "η",
- "π",
- "ς",
- "υ",
- "μ",
- "λ",
- "ί",
- "ό",
- "ά",
- "γ",
- "έ",
- "δ",
- "ή",
- "ω",
- "χ",
- "θ",
- "ύ",
- ],
- ),
- (
- "Tamil",
- [
- "க",
- "த",
- "ப",
- "ட",
- "ர",
- "ம",
- "ல",
- "ன",
- "வ",
- "ற",
- "ய",
- "ள",
- "ச",
- "ந",
- "இ",
- "ண",
- "அ",
- "ஆ",
- "ழ",
- "ங",
- "எ",
- "உ",
- "ஒ",
- "ஸ",
- ],
- ),
- (
- "Classical Chinese",
- [
- "之",
- "年",
- "為",
- "也",
- "以",
- "一",
- "人",
- "其",
- "者",
- "國",
- "有",
- "二",
- "十",
- "於",
- "曰",
- "三",
- "不",
- "大",
- "而",
- "子",
- "中",
- "五",
- "四",
- ],
- ),
- (
- "Kazakh",
- [
- "а",
- "ы",
- "е",
- "н",
- "т",
- "р",
- "л",
- "і",
- "д",
- "с",
- "м",
- "қ",
- "к",
- "о",
- "б",
- "и",
- "у",
- "ғ",
- "ж",
- "ң",
- "з",
- "ш",
- "й",
- "п",
- "г",
- "ө",
- ],
- ),
- ]
-)
+FREQUENCIES: Dict[str, List[str]] = {
+ "English": [
+ "e",
+ "a",
+ "t",
+ "i",
+ "o",
+ "n",
+ "s",
+ "r",
+ "h",
+ "l",
+ "d",
+ "c",
+ "u",
+ "m",
+ "f",
+ "p",
+ "g",
+ "w",
+ "y",
+ "b",
+ "v",
+ "k",
+ "x",
+ "j",
+ "z",
+ "q",
+ ],
+ "German": [
+ "e",
+ "n",
+ "i",
+ "r",
+ "s",
+ "t",
+ "a",
+ "d",
+ "h",
+ "u",
+ "l",
+ "g",
+ "o",
+ "c",
+ "m",
+ "b",
+ "f",
+ "k",
+ "w",
+ "z",
+ "p",
+ "v",
+ "ü",
+ "ä",
+ "ö",
+ "j",
+ ],
+ "French": [
+ "e",
+ "a",
+ "s",
+ "n",
+ "i",
+ "t",
+ "r",
+ "l",
+ "u",
+ "o",
+ "d",
+ "c",
+ "p",
+ "m",
+ "é",
+ "v",
+ "g",
+ "f",
+ "b",
+ "h",
+ "q",
+ "à",
+ "x",
+ "è",
+ "y",
+ "j",
+ ],
+ "Dutch": [
+ "e",
+ "n",
+ "a",
+ "i",
+ "r",
+ "t",
+ "o",
+ "d",
+ "s",
+ "l",
+ "g",
+ "h",
+ "v",
+ "m",
+ "u",
+ "k",
+ "c",
+ "p",
+ "b",
+ "w",
+ "j",
+ "z",
+ "f",
+ "y",
+ "x",
+ "ë",
+ ],
+ "Italian": [
+ "e",
+ "i",
+ "a",
+ "o",
+ "n",
+ "l",
+ "t",
+ "r",
+ "s",
+ "c",
+ "d",
+ "u",
+ "p",
+ "m",
+ "g",
+ "v",
+ "f",
+ "b",
+ "z",
+ "h",
+ "q",
+ "è",
+ "à",
+ "k",
+ "y",
+ "ò",
+ ],
+ "Polish": [
+ "a",
+ "i",
+ "o",
+ "e",
+ "n",
+ "r",
+ "z",
+ "w",
+ "s",
+ "c",
+ "t",
+ "k",
+ "y",
+ "d",
+ "p",
+ "m",
+ "u",
+ "l",
+ "j",
+ "ł",
+ "g",
+ "b",
+ "h",
+ "ą",
+ "ę",
+ "ó",
+ ],
+ "Spanish": [
+ "e",
+ "a",
+ "o",
+ "n",
+ "s",
+ "r",
+ "i",
+ "l",
+ "d",
+ "t",
+ "c",
+ "u",
+ "m",
+ "p",
+ "b",
+ "g",
+ "v",
+ "f",
+ "y",
+ "ó",
+ "h",
+ "q",
+ "í",
+ "j",
+ "z",
+ "á",
+ ],
+ "Russian": [
+ "о",
+ "а",
+ "е",
+ "и",
+ "н",
+ "с",
+ "т",
+ "р",
+ "в",
+ "л",
+ "к",
+ "м",
+ "д",
+ "п",
+ "у",
+ "г",
+ "я",
+ "ы",
+ "з",
+ "б",
+ "й",
+ "ь",
+ "ч",
+ "х",
+ "ж",
+ "ц",
+ ],
+ "Japanese": [
+ "の",
+ "に",
+ "る",
+ "た",
+ "は",
+ "ー",
+ "と",
+ "し",
+ "を",
+ "で",
+ "て",
+ "が",
+ "い",
+ "ン",
+ "れ",
+ "な",
+ "年",
+ "ス",
+ "っ",
+ "ル",
+ "か",
+ "ら",
+ "あ",
+ "さ",
+ "も",
+ "り",
+ ],
+ "Portuguese": [
+ "a",
+ "e",
+ "o",
+ "s",
+ "i",
+ "r",
+ "d",
+ "n",
+ "t",
+ "m",
+ "u",
+ "c",
+ "l",
+ "p",
+ "g",
+ "v",
+ "b",
+ "f",
+ "h",
+ "ã",
+ "q",
+ "é",
+ "ç",
+ "á",
+ "z",
+ "í",
+ ],
+ "Swedish": [
+ "e",
+ "a",
+ "n",
+ "r",
+ "t",
+ "s",
+ "i",
+ "l",
+ "d",
+ "o",
+ "m",
+ "k",
+ "g",
+ "v",
+ "h",
+ "f",
+ "u",
+ "p",
+ "ä",
+ "c",
+ "b",
+ "ö",
+ "å",
+ "y",
+ "j",
+ "x",
+ ],
+ "Chinese": [
+ "的",
+ "一",
+ "是",
+ "不",
+ "了",
+ "在",
+ "人",
+ "有",
+ "我",
+ "他",
+ "这",
+ "个",
+ "们",
+ "中",
+ "来",
+ "上",
+ "大",
+ "为",
+ "和",
+ "国",
+ "地",
+ "到",
+ "以",
+ "说",
+ "时",
+ "要",
+ "就",
+ "出",
+ "会",
+ ],
+ "Ukrainian": [
+ "о",
+ "а",
+ "н",
+ "і",
+ "и",
+ "р",
+ "в",
+ "т",
+ "е",
+ "с",
+ "к",
+ "л",
+ "у",
+ "д",
+ "м",
+ "п",
+ "з",
+ "я",
+ "ь",
+ "б",
+ "г",
+ "й",
+ "ч",
+ "х",
+ "ц",
+ "ї",
+ ],
+ "Norwegian": [
+ "e",
+ "r",
+ "n",
+ "t",
+ "a",
+ "s",
+ "i",
+ "o",
+ "l",
+ "d",
+ "g",
+ "k",
+ "m",
+ "v",
+ "f",
+ "p",
+ "u",
+ "b",
+ "h",
+ "å",
+ "y",
+ "j",
+ "ø",
+ "c",
+ "æ",
+ "w",
+ ],
+ "Finnish": [
+ "a",
+ "i",
+ "n",
+ "t",
+ "e",
+ "s",
+ "l",
+ "o",
+ "u",
+ "k",
+ "ä",
+ "m",
+ "r",
+ "v",
+ "j",
+ "h",
+ "p",
+ "y",
+ "d",
+ "ö",
+ "g",
+ "c",
+ "b",
+ "f",
+ "w",
+ "z",
+ ],
+ "Vietnamese": [
+ "n",
+ "h",
+ "t",
+ "i",
+ "c",
+ "g",
+ "a",
+ "o",
+ "u",
+ "m",
+ "l",
+ "r",
+ "à",
+ "đ",
+ "s",
+ "e",
+ "v",
+ "p",
+ "b",
+ "y",
+ "ư",
+ "d",
+ "á",
+ "k",
+ "ộ",
+ "ế",
+ ],
+ "Czech": [
+ "o",
+ "e",
+ "a",
+ "n",
+ "t",
+ "s",
+ "i",
+ "l",
+ "v",
+ "r",
+ "k",
+ "d",
+ "u",
+ "m",
+ "p",
+ "í",
+ "c",
+ "h",
+ "z",
+ "á",
+ "y",
+ "j",
+ "b",
+ "ě",
+ "é",
+ "ř",
+ ],
+ "Hungarian": [
+ "e",
+ "a",
+ "t",
+ "l",
+ "s",
+ "n",
+ "k",
+ "r",
+ "i",
+ "o",
+ "z",
+ "á",
+ "é",
+ "g",
+ "m",
+ "b",
+ "y",
+ "v",
+ "d",
+ "h",
+ "u",
+ "p",
+ "j",
+ "ö",
+ "f",
+ "c",
+ ],
+ "Korean": [
+ "이",
+ "다",
+ "에",
+ "의",
+ "는",
+ "로",
+ "하",
+ "을",
+ "가",
+ "고",
+ "지",
+ "서",
+ "한",
+ "은",
+ "기",
+ "으",
+ "년",
+ "대",
+ "사",
+ "시",
+ "를",
+ "리",
+ "도",
+ "인",
+ "스",
+ "일",
+ ],
+ "Indonesian": [
+ "a",
+ "n",
+ "e",
+ "i",
+ "r",
+ "t",
+ "u",
+ "s",
+ "d",
+ "k",
+ "m",
+ "l",
+ "g",
+ "p",
+ "b",
+ "o",
+ "h",
+ "y",
+ "j",
+ "c",
+ "w",
+ "f",
+ "v",
+ "z",
+ "x",
+ "q",
+ ],
+ "Turkish": [
+ "a",
+ "e",
+ "i",
+ "n",
+ "r",
+ "l",
+ "ı",
+ "k",
+ "d",
+ "t",
+ "s",
+ "m",
+ "y",
+ "u",
+ "o",
+ "b",
+ "ü",
+ "ş",
+ "v",
+ "g",
+ "z",
+ "h",
+ "c",
+ "p",
+ "ç",
+ "ğ",
+ ],
+ "Romanian": [
+ "e",
+ "i",
+ "a",
+ "r",
+ "n",
+ "t",
+ "u",
+ "l",
+ "o",
+ "c",
+ "s",
+ "d",
+ "p",
+ "m",
+ "ă",
+ "f",
+ "v",
+ "î",
+ "g",
+ "b",
+ "ș",
+ "ț",
+ "z",
+ "h",
+ "â",
+ "j",
+ ],
+ "Farsi": [
+ "ا",
+ "ی",
+ "ر",
+ "د",
+ "ن",
+ "ه",
+ "و",
+ "م",
+ "ت",
+ "ب",
+ "س",
+ "ل",
+ "ک",
+ "ش",
+ "ز",
+ "ف",
+ "گ",
+ "ع",
+ "خ",
+ "ق",
+ "ج",
+ "آ",
+ "پ",
+ "ح",
+ "ط",
+ "ص",
+ ],
+ "Arabic": [
+ "ا",
+ "ل",
+ "ي",
+ "م",
+ "و",
+ "ن",
+ "ر",
+ "ت",
+ "ب",
+ "ة",
+ "ع",
+ "د",
+ "س",
+ "ف",
+ "ه",
+ "ك",
+ "ق",
+ "أ",
+ "ح",
+ "ج",
+ "ش",
+ "ط",
+ "ص",
+ "ى",
+ "خ",
+ "إ",
+ ],
+ "Danish": [
+ "e",
+ "r",
+ "n",
+ "t",
+ "a",
+ "i",
+ "s",
+ "d",
+ "l",
+ "o",
+ "g",
+ "m",
+ "k",
+ "f",
+ "v",
+ "u",
+ "b",
+ "h",
+ "p",
+ "å",
+ "y",
+ "ø",
+ "æ",
+ "c",
+ "j",
+ "w",
+ ],
+ "Serbian": [
+ "а",
+ "и",
+ "о",
+ "е",
+ "н",
+ "р",
+ "с",
+ "у",
+ "т",
+ "к",
+ "ј",
+ "в",
+ "д",
+ "м",
+ "п",
+ "л",
+ "г",
+ "з",
+ "б",
+ "a",
+ "i",
+ "e",
+ "o",
+ "n",
+ "ц",
+ "ш",
+ ],
+ "Lithuanian": [
+ "i",
+ "a",
+ "s",
+ "o",
+ "r",
+ "e",
+ "t",
+ "n",
+ "u",
+ "k",
+ "m",
+ "l",
+ "p",
+ "v",
+ "d",
+ "j",
+ "g",
+ "ė",
+ "b",
+ "y",
+ "ų",
+ "š",
+ "ž",
+ "c",
+ "ą",
+ "į",
+ ],
+ "Slovene": [
+ "e",
+ "a",
+ "i",
+ "o",
+ "n",
+ "r",
+ "s",
+ "l",
+ "t",
+ "j",
+ "v",
+ "k",
+ "d",
+ "p",
+ "m",
+ "u",
+ "z",
+ "b",
+ "g",
+ "h",
+ "č",
+ "c",
+ "š",
+ "ž",
+ "f",
+ "y",
+ ],
+ "Slovak": [
+ "o",
+ "a",
+ "e",
+ "n",
+ "i",
+ "r",
+ "v",
+ "t",
+ "s",
+ "l",
+ "k",
+ "d",
+ "m",
+ "p",
+ "u",
+ "c",
+ "h",
+ "j",
+ "b",
+ "z",
+ "á",
+ "y",
+ "ý",
+ "í",
+ "č",
+ "é",
+ ],
+ "Hebrew": [
+ "י",
+ "ו",
+ "ה",
+ "ל",
+ "ר",
+ "ב",
+ "ת",
+ "מ",
+ "א",
+ "ש",
+ "נ",
+ "ע",
+ "ם",
+ "ד",
+ "ק",
+ "ח",
+ "פ",
+ "ס",
+ "כ",
+ "ג",
+ "ט",
+ "צ",
+ "ן",
+ "ז",
+ "ך",
+ ],
+ "Bulgarian": [
+ "а",
+ "и",
+ "о",
+ "е",
+ "н",
+ "т",
+ "р",
+ "с",
+ "в",
+ "л",
+ "к",
+ "д",
+ "п",
+ "м",
+ "з",
+ "г",
+ "я",
+ "ъ",
+ "у",
+ "б",
+ "ч",
+ "ц",
+ "й",
+ "ж",
+ "щ",
+ "х",
+ ],
+ "Croatian": [
+ "a",
+ "i",
+ "o",
+ "e",
+ "n",
+ "r",
+ "j",
+ "s",
+ "t",
+ "u",
+ "k",
+ "l",
+ "v",
+ "d",
+ "m",
+ "p",
+ "g",
+ "z",
+ "b",
+ "c",
+ "č",
+ "h",
+ "š",
+ "ž",
+ "ć",
+ "f",
+ ],
+ "Hindi": [
+ "क",
+ "र",
+ "स",
+ "न",
+ "त",
+ "म",
+ "ह",
+ "प",
+ "य",
+ "ल",
+ "व",
+ "ज",
+ "द",
+ "ग",
+ "ब",
+ "श",
+ "ट",
+ "अ",
+ "ए",
+ "थ",
+ "भ",
+ "ड",
+ "च",
+ "ध",
+ "ष",
+ "इ",
+ ],
+ "Estonian": [
+ "a",
+ "i",
+ "e",
+ "s",
+ "t",
+ "l",
+ "u",
+ "n",
+ "o",
+ "k",
+ "r",
+ "d",
+ "m",
+ "v",
+ "g",
+ "p",
+ "j",
+ "h",
+ "ä",
+ "b",
+ "õ",
+ "ü",
+ "f",
+ "c",
+ "ö",
+ "y",
+ ],
+ "Simple English": [
+ "e",
+ "a",
+ "t",
+ "i",
+ "o",
+ "n",
+ "s",
+ "r",
+ "h",
+ "l",
+ "d",
+ "c",
+ "m",
+ "u",
+ "f",
+ "p",
+ "g",
+ "w",
+ "b",
+ "y",
+ "v",
+ "k",
+ "j",
+ "x",
+ "z",
+ "q",
+ ],
+ "Thai": [
+ "า",
+ "น",
+ "ร",
+ "อ",
+ "ก",
+ "เ",
+ "ง",
+ "ม",
+ "ย",
+ "ล",
+ "ว",
+ "ด",
+ "ท",
+ "ส",
+ "ต",
+ "ะ",
+ "ป",
+ "บ",
+ "ค",
+ "ห",
+ "แ",
+ "จ",
+ "พ",
+ "ช",
+ "ข",
+ "ใ",
+ ],
+ "Greek": [
+ "α",
+ "τ",
+ "ο",
+ "ι",
+ "ε",
+ "ν",
+ "ρ",
+ "σ",
+ "κ",
+ "η",
+ "π",
+ "ς",
+ "υ",
+ "μ",
+ "λ",
+ "ί",
+ "ό",
+ "ά",
+ "γ",
+ "έ",
+ "δ",
+ "ή",
+ "ω",
+ "χ",
+ "θ",
+ "ύ",
+ ],
+ "Tamil": [
+ "க",
+ "த",
+ "ப",
+ "ட",
+ "ர",
+ "ம",
+ "ல",
+ "ன",
+ "வ",
+ "ற",
+ "ய",
+ "ள",
+ "ச",
+ "ந",
+ "இ",
+ "ண",
+ "அ",
+ "ஆ",
+ "ழ",
+ "ங",
+ "எ",
+ "உ",
+ "ஒ",
+ "ஸ",
+ ],
+ "Classical Chinese": [
+ "之",
+ "年",
+ "為",
+ "也",
+ "以",
+ "一",
+ "人",
+ "其",
+ "者",
+ "國",
+ "有",
+ "二",
+ "十",
+ "於",
+ "曰",
+ "三",
+ "不",
+ "大",
+ "而",
+ "子",
+ "中",
+ "五",
+ "四",
+ ],
+ "Kazakh": [
+ "а",
+ "ы",
+ "е",
+ "н",
+ "т",
+ "р",
+ "л",
+ "і",
+ "д",
+ "с",
+ "м",
+ "қ",
+ "к",
+ "о",
+ "б",
+ "и",
+ "у",
+ "ғ",
+ "ж",
+ "ң",
+ "з",
+ "ш",
+ "й",
+ "п",
+ "г",
+ "ө",
+ ],
+}
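
Note: dropping `OrderedDict` for a plain dict literal in `FREQUENCIES` relies on regular dicts preserving insertion order — a language guarantee since Python 3.7 and true in CPython 3.6, the new minimum — so iteration over `FREQUENCIES` elsewhere in the package stays deterministic. A minimal illustration with toy data:

```python
# Plain dicts keep insertion order on the supported Python versions,
# so the OrderedDict wrapper is no longer needed.
freq = {"English": ["e", "a", "t"], "German": ["e", "n", "i"]}
assert list(freq) == ["English", "German"]
```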
diff --git a/contrib/python/charset-normalizer/charset_normalizer/cd.py b/contrib/python/charset-normalizer/charset_normalizer/cd.py
index 8429a0eb20..8998bb545c 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/cd.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/cd.py
@@ -1,6 +1,6 @@
import importlib
from codecs import IncrementalDecoder
-from collections import Counter, OrderedDict
+from collections import Counter
from functools import lru_cache
from typing import Dict, List, Optional, Tuple
@@ -26,15 +26,15 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
- p = decoder(errors="ignore") # type: IncrementalDecoder
- seen_ranges = {} # type: Dict[str, int]
- character_count = 0 # type: int
+ p: IncrementalDecoder = decoder(errors="ignore")
+ seen_ranges: Dict[str, int] = {}
+ character_count: int = 0
for i in range(0x40, 0xFF):
- chunk = p.decode(bytes([i])) # type: str
+ chunk: str = p.decode(bytes([i]))
if chunk:
- character_range = unicode_range(chunk) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(chunk)
if character_range is None:
continue
@@ -58,7 +58,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
"""
Return inferred languages used with a unicode range.
"""
- languages = [] # type: List[str]
+ languages: List[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
@@ -75,8 +75,8 @@ def encoding_languages(iana_name: str) -> List[str]:
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
- unicode_ranges = encoding_unicode_range(iana_name) # type: List[str]
- primary_range = None # type: Optional[str]
+ unicode_ranges: List[str] = encoding_unicode_range(iana_name)
+ primary_range: Optional[str] = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
@@ -115,8 +115,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
- target_have_accents = False # type: bool
- target_pure_latin = True # type: bool
+ target_have_accents: bool = False
+ target_pure_latin: bool = True
for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
@@ -133,7 +133,7 @@ def alphabet_languages(
"""
Return associated languages associated to given characters.
"""
- languages = [] # type: List[Tuple[str, float]]
+ languages: List[Tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
@@ -147,13 +147,13 @@ def alphabet_languages(
if target_have_accents is False and source_have_accents:
continue
- character_count = len(language_characters) # type: int
+ character_count: int = len(language_characters)
- character_match_count = len(
+ character_match_count: int = len(
[c for c in language_characters if c in characters]
- ) # type: int
+ )
- ratio = character_match_count / character_count # type: float
+ ratio: float = character_match_count / character_count
if ratio >= 0.2:
languages.append((language, ratio))
@@ -174,36 +174,33 @@ def characters_popularity_compare(
if language not in FREQUENCIES:
raise ValueError("{} not available".format(language))
- character_approved_count = 0 # type: int
+ character_approved_count: int = 0
+ FREQUENCIES_language_set = set(FREQUENCIES[language])
for character in ordered_characters:
- if character not in FREQUENCIES[language]:
+ if character not in FREQUENCIES_language_set:
continue
- characters_before_source = FREQUENCIES[language][
+ characters_before_source: List[str] = FREQUENCIES[language][
0 : FREQUENCIES[language].index(character)
- ] # type: List[str]
- characters_after_source = FREQUENCIES[language][
+ ]
+ characters_after_source: List[str] = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
- ] # type: List[str]
-
- characters_before = ordered_characters[
+ ]
+ characters_before: List[str] = ordered_characters[
0 : ordered_characters.index(character)
- ] # type: List[str]
- characters_after = ordered_characters[
+ ]
+ characters_after: List[str] = ordered_characters[
ordered_characters.index(character) :
- ] # type: List[str]
-
- before_match_count = [
- e in characters_before for e in characters_before_source
- ].count(
- True
- ) # type: int
- after_match_count = [
- e in characters_after for e in characters_after_source
- ].count(
- True
- ) # type: int
+ ]
+
+ before_match_count: int = len(
+ set(characters_before) & set(characters_before_source)
+ )
+
+ after_match_count: int = len(
+ set(characters_after) & set(characters_after_source)
+ )
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
@@ -229,18 +226,18 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
One containing the latin letters and the other hebrew.
"""
- layers = OrderedDict() # type: Dict[str, str]
+ layers: Dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
- layer_target_range = None # type: Optional[str]
+ layer_target_range: Optional[str] = None
for discovered_range in layers:
if (
@@ -267,7 +264,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
This function merge results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
- per_language_ratios = OrderedDict() # type: Dict[str, List[float]]
+ per_language_ratios: Dict[str, List[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
@@ -299,10 +296,10 @@ def coherence_ratio(
A layer = Character extraction by alphabets/ranges.
"""
- results = [] # type: List[Tuple[str, float]]
- ignore_non_latin = False # type: bool
+ results: List[Tuple[str, float]] = []
+ ignore_non_latin: bool = False
- sufficient_match_count = 0 # type: int
+ sufficient_match_count: int = 0
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
@@ -310,22 +307,22 @@ def coherence_ratio(
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
- sequence_frequencies = Counter(layer) # type: Counter
+ sequence_frequencies: Counter = Counter(layer)
most_common = sequence_frequencies.most_common()
- character_count = sum(o for c, o in most_common) # type: int
+ character_count: int = sum(o for c, o in most_common)
if character_count <= TOO_SMALL_SEQUENCE:
continue
- popular_character_ordered = [c for c, o in most_common] # type: List[str]
+ popular_character_ordered: List[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
- ratio = characters_popularity_compare(
+ ratio: float = characters_popularity_compare(
language, popular_character_ordered
- ) # type: float
+ )
if ratio < threshold:
continue
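
Note: the `characters_popularity_compare` hunk above swaps a membership-scan count for a set intersection. The two agree because each frequency list holds unique characters, and the set form avoids rescanning one list per element of the other. A quick check with made-up lists:

```python
# Count how many characters from the reference slice also occur in the observed slice.
characters_before = ["e", "a", "t", "i"]          # made-up observed slice
characters_before_source = ["a", "o", "e", "u"]   # made-up reference slice

old_count = [c in characters_before for c in characters_before_source].count(True)
new_count = len(set(characters_before) & set(characters_before_source))

assert old_count == new_count == 2  # equivalent while entries are unique
```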
diff --git a/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py b/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py
index 5f912c923b..540e5e2a1a 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py
@@ -5,6 +5,11 @@ from os.path import abspath
from platform import python_version
from typing import List
+try:
+ from unicodedata2 import unidata_version
+except ImportError:
+ from unicodedata import unidata_version
+
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
@@ -111,7 +116,7 @@ def cli_detect(argv: List[str] = None) -> int:
"-t",
"--threshold",
action="store",
- default=0.1,
+ default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
@@ -119,8 +124,8 @@ def cli_detect(argv: List[str] = None) -> int:
parser.add_argument(
"--version",
action="version",
- version="Charset-Normalizer {} - Python {}".format(
- __version__, python_version()
+ version="Charset-Normalizer {} - Python {} - Unicode {}".format(
+ __version__, python_version(), unidata_version
),
help="Show version information and exit.",
)
@@ -229,7 +234,7 @@ def cli_detect(argv: List[str] = None) -> int:
my_file.close()
continue
- o_ = my_file.name.split(".") # type: List[str]
+ o_: List[str] = my_file.name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
diff --git a/contrib/python/charset-normalizer/charset_normalizer/constant.py b/contrib/python/charset-normalizer/charset_normalizer/constant.py
index c32f5cf2d6..ac840c461f 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/constant.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/constant.py
@@ -1,5 +1,4 @@
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
-from collections import OrderedDict
from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Set, Union
@@ -7,31 +6,26 @@ from typing import Dict, List, Set, Union
from .assets import FREQUENCIES
# Contain for each eligible encoding a list of/item bytes SIG/BOM
-ENCODING_MARKS = OrderedDict(
- [
- ("utf_8", BOM_UTF8),
- (
- "utf_7",
- [
- b"\x2b\x2f\x76\x38",
- b"\x2b\x2f\x76\x39",
- b"\x2b\x2f\x76\x2b",
- b"\x2b\x2f\x76\x2f",
- b"\x2b\x2f\x76\x38\x2d",
- ],
- ),
- ("gb18030", b"\x84\x31\x95\x33"),
- ("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]),
- ("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]),
- ]
-) # type: Dict[str, Union[bytes, List[bytes]]]
+ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
+ "utf_8": BOM_UTF8,
+ "utf_7": [
+ b"\x2b\x2f\x76\x38",
+ b"\x2b\x2f\x76\x39",
+ b"\x2b\x2f\x76\x2b",
+ b"\x2b\x2f\x76\x2f",
+ b"\x2b\x2f\x76\x38\x2d",
+ ],
+ "gb18030": b"\x84\x31\x95\x33",
+ "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
+ "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
+}
-TOO_SMALL_SEQUENCE = 32 # type: int
-TOO_BIG_SEQUENCE = int(10e6) # type: int
+TOO_SMALL_SEQUENCE: int = 32
+TOO_BIG_SEQUENCE: int = int(10e6)
-UTF8_MAXIMAL_ALLOCATION = 1112064 # type: int
+UTF8_MAXIMAL_ALLOCATION: int = 1112064
-UNICODE_RANGES_COMBINED = {
+UNICODE_RANGES_COMBINED: Dict[str, range] = {
"Control character": range(31 + 1),
"Basic Latin": range(32, 127 + 1),
"Latin-1 Supplement": range(128, 255 + 1),
@@ -311,10 +305,10 @@ UNICODE_RANGES_COMBINED = {
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
"Tags": range(917504, 917631 + 1),
"Variation Selectors Supplement": range(917760, 917999 + 1),
-} # type: Dict[str, range]
+}
-UNICODE_SECONDARY_RANGE_KEYWORD = [
+UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
"Supplement",
"Extended",
"Extensions",
@@ -330,25 +324,25 @@ UNICODE_SECONDARY_RANGE_KEYWORD = [
"Shapes",
"Supplemental",
"Tags",
-] # type: List[str]
+]
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
IGNORECASE,
)
-IANA_SUPPORTED = sorted(
+IANA_SUPPORTED: List[str] = sorted(
filter(
lambda x: x.endswith("_codec") is False
and x not in {"rot_13", "tactis", "mbcs"},
list(set(aliases.values())),
)
-) # type: List[str]
+)
-IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int
+IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
# pre-computed code page that are similar using the function cp_similarity.
-IANA_SUPPORTED_SIMILAR = {
+IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
"cp1125": ["cp866"],
@@ -434,10 +428,10 @@ IANA_SUPPORTED_SIMILAR = {
"mac_turkish": ["mac_iceland", "mac_roman"],
"ptcp154": ["cp1251", "kz1048"],
"tis_620": ["iso8859_11"],
-} # type: Dict[str, List[str]]
+}
-CHARDET_CORRESPONDENCE = {
+CHARDET_CORRESPONDENCE: Dict[str, str] = {
"iso2022_kr": "ISO-2022-KR",
"iso2022_jp": "ISO-2022-JP",
"euc_kr": "EUC-KR",
@@ -470,10 +464,10 @@ CHARDET_CORRESPONDENCE = {
"cp1256": "windows-1256",
"cp1254": "Windows-1254",
"cp949": "CP949",
-} # type: Dict[str, str]
+}
-COMMON_SAFE_ASCII_CHARACTERS = {
+COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
"<",
">",
"=",
@@ -489,15 +483,15 @@ COMMON_SAFE_ASCII_CHARACTERS = {
"|",
'"',
"-",
-} # type: Set[str]
+}
-KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
-ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str]
+KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
+ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
-LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int
+LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
# Logging LEVEL bellow DEBUG
-TRACE = 5 # type: int
+TRACE: int = 5
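
Note: the bulk of this patch, in constant.py as in the other modules, migrates from `# type:` comments to PEP 526 variable annotations, which the package can use now that Python 3.5 support is dropped. A generic before/after sketch (names are illustrative, not taken from the patch):

```python
from typing import List

# Old style: the type lives in a comment that only static checkers read.
timeout = 32  # type: int
codecs = ["utf_8", "utf_16"]  # type: List[str]

# New style (PEP 526): the annotation is real syntax, checked the same way
# by mypy and also introspectable at runtime via __annotations__.
timeout: int = 32
codecs: List[str] = ["utf_8", "utf_16"]
```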
diff --git a/contrib/python/charset-normalizer/charset_normalizer/md.py b/contrib/python/charset-normalizer/charset_normalizer/md.py
index f3d6505cf0..31808af84c 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/md.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/md.py
@@ -16,6 +16,7 @@ from .utils import (
is_separator,
is_symbol,
is_thai,
+ is_unprintable,
remove_accent,
unicode_range,
)
@@ -57,12 +58,12 @@ class MessDetectorPlugin:
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._punctuation_count = 0 # type: int
- self._symbol_count = 0 # type: int
- self._character_count = 0 # type: int
+ self._punctuation_count: int = 0
+ self._symbol_count: int = 0
+ self._character_count: int = 0
- self._last_printable_char = None # type: Optional[str]
- self._frenzy_symbol_in_word = False # type: bool
+ self._last_printable_char: Optional[str] = None
+ self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
return character.isprintable()
@@ -95,17 +96,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
if self._character_count == 0:
return 0.0
- ratio_of_punctuation = (
+ ratio_of_punctuation: float = (
self._punctuation_count + self._symbol_count
- ) / self._character_count # type: float
+ ) / self._character_count
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._character_count = 0 # type: int
- self._accentuated_count = 0 # type: int
+ self._character_count: int = 0
+ self._accentuated_count: int = 0
def eligible(self, character: str) -> bool:
return character.isalpha()
@@ -124,26 +125,20 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
- ratio_of_accentuation = (
- self._accentuated_count / self._character_count
- ) # type: float
+ ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._unprintable_count = 0 # type: int
- self._character_count = 0 # type: int
+ self._unprintable_count: int = 0
+ self._character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
- if (
- character.isspace() is False # includes \n \t \r \v
- and character.isprintable() is False
- and character != "\x1A" # Why? Its the ASCII substitute character.
- ):
+ if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
@@ -160,10 +155,10 @@ class UnprintablePlugin(MessDetectorPlugin):
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._successive_count = 0 # type: int
- self._character_count = 0 # type: int
+ self._successive_count: int = 0
+ self._character_count: int = 0
- self._last_latin_character = None # type: Optional[str]
+ self._last_latin_character: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
@@ -197,9 +192,9 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
- self._suspicious_successive_range_count = 0 # type: int
- self._character_count = 0 # type: int
- self._last_printable_seen = None # type: Optional[str]
+ self._suspicious_successive_range_count: int = 0
+ self._character_count: int = 0
+ self._last_printable_seen: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isprintable()
@@ -219,10 +214,8 @@ class SuspiciousRange(MessDetectorPlugin):
self._last_printable_seen = character
return
- unicode_range_a = unicode_range(
- self._last_printable_seen
- ) # type: Optional[str]
- unicode_range_b = unicode_range(character) # type: Optional[str]
+ unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
+ unicode_range_b: Optional[str] = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
@@ -239,9 +232,9 @@ class SuspiciousRange(MessDetectorPlugin):
if self._character_count == 0:
return 0.0
- ratio_of_suspicious_range_usage = (
+ ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
- ) / self._character_count # type: float
+ ) / self._character_count
if ratio_of_suspicious_range_usage < 0.1:
return 0.0
@@ -251,25 +244,25 @@ class SuspiciousRange(MessDetectorPlugin):
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._word_count = 0 # type: int
- self._bad_word_count = 0 # type: int
- self._foreign_long_count = 0 # type: int
+ self._word_count: int = 0
+ self._bad_word_count: int = 0
+ self._foreign_long_count: int = 0
- self._is_current_word_bad = False # type: bool
- self._foreign_long_watch = False # type: bool
+ self._is_current_word_bad: bool = False
+ self._foreign_long_watch: bool = False
- self._character_count = 0 # type: int
- self._bad_character_count = 0 # type: int
+ self._character_count: int = 0
+ self._bad_character_count: int = 0
- self._buffer = "" # type: str
- self._buffer_accent_count = 0 # type: int
+ self._buffer: str = ""
+ self._buffer_accent_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character.isalpha():
- self._buffer = "".join([self._buffer, character])
+ self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
@@ -289,7 +282,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
- buffer_length = len(self._buffer) # type: int
+ buffer_length: int = len(self._buffer)
self._character_count += buffer_length
@@ -346,8 +339,8 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
"""
def __init__(self) -> None:
- self._wrong_stop_count = 0 # type: int
- self._cjk_character_count = 0 # type: int
+ self._wrong_stop_count: int = 0
+ self._cjk_character_count: int = 0
def eligible(self, character: str) -> bool:
return True
@@ -372,17 +365,17 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._buf = False # type: bool
+ self._buf: bool = False
- self._character_count_since_last_sep = 0 # type: int
+ self._character_count_since_last_sep: int = 0
- self._successive_upper_lower_count = 0 # type: int
- self._successive_upper_lower_count_final = 0 # type: int
+ self._successive_upper_lower_count: int = 0
+ self._successive_upper_lower_count_final: int = 0
- self._character_count = 0 # type: int
+ self._character_count: int = 0
- self._last_alpha_seen = None # type: Optional[str]
- self._current_ascii_only = True # type: bool
+ self._last_alpha_seen: Optional[str] = None
+ self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
return True
@@ -446,6 +439,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
return self._successive_upper_lower_count_final / self._character_count
+@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
@@ -524,16 +518,16 @@ def mess_ratio(
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
- detectors = [
+ detectors: List[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
- ] # type: List[MessDetectorPlugin]
+ ]
- length = len(decoded_sequence) + 1 # type: int
+ length: int = len(decoded_sequence) + 1
- mean_mess_ratio = 0.0 # type: float
+ mean_mess_ratio: float = 0.0
if length < 512:
- intermediary_mean_mess_ratio_calc = 32 # type: int
+ intermediary_mean_mess_ratio_calc: int = 32
elif length <= 1024:
intermediary_mean_mess_ratio_calc = 64
else:
diff --git a/contrib/python/charset-normalizer/charset_normalizer/models.py b/contrib/python/charset-normalizer/charset_normalizer/models.py
index c38da31fa5..b9d71eb4fd 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/models.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/models.py
@@ -21,21 +21,21 @@ class CharsetMatch:
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
):
- self._payload = payload # type: bytes
+ self._payload: bytes = payload
- self._encoding = guessed_encoding # type: str
- self._mean_mess_ratio = mean_mess_ratio # type: float
- self._languages = languages # type: CoherenceMatches
- self._has_sig_or_bom = has_sig_or_bom # type: bool
- self._unicode_ranges = None # type: Optional[List[str]]
+ self._encoding: str = guessed_encoding
+ self._mean_mess_ratio: float = mean_mess_ratio
+ self._languages: CoherenceMatches = languages
+ self._has_sig_or_bom: bool = has_sig_or_bom
+ self._unicode_ranges: Optional[List[str]] = None
- self._leaves = [] # type: List[CharsetMatch]
- self._mean_coherence_ratio = 0.0 # type: float
+ self._leaves: List[CharsetMatch] = []
+ self._mean_coherence_ratio: float = 0.0
- self._output_payload = None # type: Optional[bytes]
- self._output_encoding = None # type: Optional[str]
+ self._output_payload: Optional[bytes] = None
+ self._output_encoding: Optional[str] = None
- self._string = decoded_payload # type: Optional[str]
+ self._string: Optional[str] = decoded_payload
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
@@ -53,8 +53,8 @@ class CharsetMatch:
if not isinstance(other, CharsetMatch):
raise ValueError
- chaos_difference = abs(self.chaos - other.chaos) # type: float
- coherence_difference = abs(self.coherence - other.coherence) # type: float
+ chaos_difference: float = abs(self.chaos - other.chaos)
+ coherence_difference: float = abs(self.coherence - other.coherence)
# Bellow 1% difference --> Use Coherence
if chaos_difference < 0.01 and coherence_difference > 0.02:
@@ -137,7 +137,7 @@ class CharsetMatch:
"""
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
"""
- also_known_as = [] # type: List[str]
+ also_known_as: List[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
@@ -227,9 +227,9 @@ class CharsetMatch:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
- detected_ranges = [
+ detected_ranges: List[Optional[str]] = [
unicode_range(char) for char in str(self)
- ] # type: List[Optional[str]]
+ ]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@@ -281,7 +281,7 @@ class CharsetMatches:
"""
def __init__(self, results: List[CharsetMatch] = None):
- self._results = sorted(results) if results else [] # type: List[CharsetMatch]
+ self._results: List[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
@@ -360,17 +360,17 @@ class CliDetectionResult:
unicode_path: Optional[str],
is_preferred: bool,
):
- self.path = path # type: str
- self.unicode_path = unicode_path # type: Optional[str]
- self.encoding = encoding # type: Optional[str]
- self.encoding_aliases = encoding_aliases # type: List[str]
- self.alternative_encodings = alternative_encodings # type: List[str]
- self.language = language # type: str
- self.alphabets = alphabets # type: List[str]
- self.has_sig_or_bom = has_sig_or_bom # type: bool
- self.chaos = chaos # type: float
- self.coherence = coherence # type: float
- self.is_preferred = is_preferred # type: bool
+ self.path: str = path
+ self.unicode_path: Optional[str] = unicode_path
+ self.encoding: Optional[str] = encoding
+ self.encoding_aliases: List[str] = encoding_aliases
+ self.alternative_encodings: List[str] = alternative_encodings
+ self.language: str = language
+ self.alphabets: List[str] = alphabets
+ self.has_sig_or_bom: bool = has_sig_or_bom
+ self.chaos: float = chaos
+ self.coherence: float = coherence
+ self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> Dict[str, Any]: # type: ignore
diff --git a/contrib/python/charset-normalizer/charset_normalizer/utils.py b/contrib/python/charset-normalizer/charset_normalizer/utils.py
index dcb14dfee1..17eaee0408 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/utils.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/utils.py
@@ -1,4 +1,6 @@
try:
+ # WARNING: unicodedata2 support is going to be removed in 3.0
+ # Python is quickly catching up.
import unicodedata2 as unicodedata
except ImportError:
import unicodedata # type: ignore[no-redef]
@@ -9,7 +11,7 @@ from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
-from typing import List, Optional, Set, Tuple, Union
+from typing import Generator, List, Optional, Set, Tuple, Union
from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
@@ -26,7 +28,7 @@ from .constant import (
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
try:
- description = unicodedata.name(character) # type: str
+ description: str = unicodedata.name(character)
except ValueError:
return False
return (
@@ -41,11 +43,11 @@ def is_accentuated(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
- decomposed = unicodedata.decomposition(character) # type: str
+ decomposed: str = unicodedata.decomposition(character)
if not decomposed:
return character
- codes = decomposed.split(" ") # type: List[str]
+ codes: List[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@@ -55,7 +57,7 @@ def unicode_range(character: str) -> Optional[str]:
"""
Retrieve the official name of the Unicode range a single character belongs to.
"""
- character_ord = ord(character) # type: int
+ character_ord: int = ord(character)
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
if character_ord in ord_range:
@@ -67,12 +69,13 @@ def unicode_range(character: str) -> Optional[str]:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
try:
- description = unicodedata.name(character) # type: str
+ description: str = unicodedata.name(character)
except ValueError:
return False
return "LATIN" in description
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
try:
character.encode("ascii")
@@ -83,12 +86,12 @@ def is_ascii(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
- character_category = unicodedata.category(character) # type: str
+ character_category: str = unicodedata.category(character)
if "P" in character_category:
return True
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
@@ -98,12 +101,12 @@ def is_punctuation(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
- character_category = unicodedata.category(character) # type: str
+ character_category: str = unicodedata.category(character)
if "S" in character_category or "N" in character_category:
return True
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
@@ -113,7 +116,7 @@ def is_symbol(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
@@ -126,7 +129,7 @@ def is_separator(character: str) -> bool:
if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
return True
- character_category = unicodedata.category(character) # type: str
+ character_category: str = unicodedata.category(character)
return "Z" in character_category
@@ -137,7 +140,7 @@ def is_case_variable(character: str) -> bool:
def is_private_use_only(character: str) -> bool:
- character_category = unicodedata.category(character) # type: str
+ character_category: str = unicodedata.category(character)
return character_category == "Co"
@@ -197,6 +200,17 @@ def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_unprintable(character: str) -> bool:
+ return (
+ character.isspace() is False # includes \n \t \r \v
+ and character.isprintable() is False
+ and character != "\x1A" # Why? Its the ASCII substitute character.
+ and character != b"\xEF\xBB\xBF".decode("utf_8") # bug discovered in Python,
+ # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
+ )
+
+
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
"""
Extract any specified encoding from the first n bytes, using an ASCII-only decoder.
@@ -204,12 +218,12 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
if not isinstance(sequence, bytes):
raise TypeError
- seq_len = len(sequence) # type: int
+ seq_len: int = len(sequence)
- results = findall(
+ results: List[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
- ) # type: List[str]
+ )
if len(results) == 0:
return None
@@ -253,7 +267,7 @@ def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
"""
for iana_encoding in ENCODING_MARKS:
- marks = ENCODING_MARKS[iana_encoding] # type: Union[bytes, List[bytes]]
+ marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
@@ -283,10 +297,10 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
def range_scan(decoded_sequence: str) -> List[str]:
- ranges = set() # type: Set[str]
+ ranges: Set[str] = set()
for character in decoded_sequence:
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
@@ -304,13 +318,13 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore
decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore
- id_a = decoder_a(errors="ignore") # type: IncrementalDecoder
- id_b = decoder_b(errors="ignore") # type: IncrementalDecoder
+ id_a: IncrementalDecoder = decoder_a(errors="ignore")
+ id_b: IncrementalDecoder = decoder_b(errors="ignore")
- character_match_count = 0 # type: int
+ character_match_count: int = 0
for i in range(255):
- to_be_decoded = bytes([i]) # type: bytes
+ to_be_decoded: bytes = bytes([i])
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
character_match_count += 1
@@ -340,3 +354,61 @@ def set_logging_handler(
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)
+
+
+def cut_sequence_chunks(
+ sequences: bytes,
+ encoding_iana: str,
+ offsets: range,
+ chunk_size: int,
+ bom_or_sig_available: bool,
+ strip_sig_or_bom: bool,
+ sig_payload: bytes,
+ is_multi_byte_decoder: bool,
+ decoded_payload: Optional[str] = None,
+) -> Generator[str, None, None]:
+
+ if decoded_payload and is_multi_byte_decoder is False:
+ for i in offsets:
+ chunk = decoded_payload[i : i + chunk_size]
+ if not chunk:
+ break
+ yield chunk
+ else:
+ for i in offsets:
+ chunk_end = i + chunk_size
+ if chunk_end > len(sequences) + 8:
+ continue
+
+ cut_sequence = sequences[i : i + chunk_size]
+
+ if bom_or_sig_available and strip_sig_or_bom is False:
+ cut_sequence = sig_payload + cut_sequence
+
+ chunk = cut_sequence.decode(
+ encoding_iana,
+ errors="ignore" if is_multi_byte_decoder else "strict",
+ )
+
+ # multi-byte bad cutting detector and adjustment
+ # not the cleanest way to perform that fix, but clever enough for now.
+ if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+
+ chunk_partial_size_chk: int = min(chunk_size, 16)
+
+ if (
+ decoded_payload
+ and chunk[:chunk_partial_size_chk] not in decoded_payload
+ ):
+ for j in range(i, i - 4, -1):
+ cut_sequence = sequences[j:chunk_end]
+
+ if bom_or_sig_available and strip_sig_or_bom is False:
+ cut_sequence = sig_payload + cut_sequence
+
+ chunk = cut_sequence.decode(encoding_iana, errors="ignore")
+
+ if chunk[:chunk_partial_size_chk] in decoded_payload:
+ break
+
+ yield chunk
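Two helpers are new in this revision of utils.py: is_unprintable, which flags characters that are neither whitespace nor printable while tolerating the ASCII substitute character and U+FEFF, and cut_sequence_chunks, a generator that yields decoded chunks of a payload, re-attaching the signature when requested and re-aligning cuts that land inside a multi-byte sequence. A hedged sketch of how they behave, assuming charset_normalizer 2.1.0 is installed; the argument values below are illustrative, not the library's own call sites in api.py:

    from charset_normalizer.utils import cut_sequence_chunks, is_unprintable

    # is_unprintable: neither whitespace nor printable, with two explicit exceptions.
    print(is_unprintable("\x00"))    # True  -- NUL
    print(is_unprintable("\n"))      # False -- whitespace is excluded
    print(is_unprintable("\x1a"))    # False -- ASCII substitute character is tolerated
    print(is_unprintable("\ufeff"))  # False -- Zero Width No-Break Space / BOM is tolerated

    # cut_sequence_chunks: iterate decoded chunks of a small UTF-8 payload.
    payload = "naïve café".encode("utf_8")
    chunk_size = 4

    for chunk in cut_sequence_chunks(
        sequences=payload,
        encoding_iana="utf_8",
        offsets=range(0, len(payload), chunk_size),
        chunk_size=chunk_size,
        bom_or_sig_available=False,
        strip_sig_or_bom=False,
        sig_payload=b"",
        is_multi_byte_decoder=True,
        decoded_payload=payload.decode("utf_8"),
    ):
        print(repr(chunk))  # 'naï', 've c', 'afé'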
diff --git a/contrib/python/charset-normalizer/charset_normalizer/version.py b/contrib/python/charset-normalizer/charset_normalizer/version.py
index 77cfff25d6..af7e749e82 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/version.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/version.py
@@ -2,5 +2,5 @@
Expose version
"""
-__version__ = "2.0.12"
+__version__ = "2.1.0"
VERSION = __version__.split(".")
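For reference, VERSION is just the dotted version string split into its components; a quick check of the bumped release, assuming charset_normalizer 2.1.0 is installed:

    from charset_normalizer.version import VERSION, __version__

    print(__version__)  # "2.1.0"
    print(VERSION)      # ['2', '1', '0']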