author     arcadia-devtools <arcadia-devtools@yandex-team.ru>  2022-06-20 18:39:30 +0300
committer  arcadia-devtools <arcadia-devtools@yandex-team.ru>  2022-06-20 18:39:30 +0300
commit     798d25a291578fceb2223382b508fba1723fef4a (patch)
tree       227b9a24000c40ae3354f4321ff9fe19143423f7
parent     d934aec555f13784eabe2d7682211050918e6cf5 (diff)
download   ydb-798d25a291578fceb2223382b508fba1723fef4a.tar.gz

intermediate changes
ref: ac842eacda5e614f20cf9d3985d932732f92beab
-rw-r--r--  contrib/python/charset-normalizer/.dist-info/METADATA                    14
-rw-r--r--  contrib/python/charset-normalizer/README.md                               8
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/api.py             144
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py 2362
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/cd.py               97
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py   13
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/constant.py         72
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/md.py              106
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/models.py           56
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/utils.py           118
-rw-r--r--  contrib/python/charset-normalizer/charset_normalizer/version.py           2
11 files changed, 1451 insertions(+), 1541 deletions(-)
diff --git a/contrib/python/charset-normalizer/.dist-info/METADATA b/contrib/python/charset-normalizer/.dist-info/METADATA
index 1b04ed4c4e..0ba0f9d513 100644
--- a/contrib/python/charset-normalizer/.dist-info/METADATA
+++ b/contrib/python/charset-normalizer/.dist-info/METADATA
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: charset-normalizer
-Version: 2.0.12
+Version: 2.1.0
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
Home-page: https://github.com/ousret/charset_normalizer
Author: Ahmed TAHRI @Ousret
@@ -10,13 +10,13 @@ Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
Keywords: encoding,i18n,txt,text,charset,charset-detector,normalization,unicode,chardet
Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
@@ -27,7 +27,7 @@ Classifier: Topic :: Text Processing :: Linguistic
Classifier: Topic :: Utilities
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Typing :: Typed
-Requires-Python: >=3.5.0
+Requires-Python: >=3.6.0
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: unicode_backport
@@ -87,13 +87,13 @@ This package offer better performance than its counterpart Chardet. Here are som
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
| ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 92 % | 220 ms | 5 file/sec |
-| charset-normalizer | **98 %** | **40 ms** | 25 file/sec |
+| [chardet](https://github.com/chardet/chardet) | 92 % | 200 ms | 5 file/sec |
+| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
| ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 1115 ms | 300 ms | 27 ms |
-| charset-normalizer | 460 ms | 240 ms | 18 ms |
+| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
+| charset-normalizer | 400 ms | 200 ms | 15 ms |
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
diff --git a/contrib/python/charset-normalizer/README.md b/contrib/python/charset-normalizer/README.md
index b4c957a63c..904b60ea22 100644
--- a/contrib/python/charset-normalizer/README.md
+++ b/contrib/python/charset-normalizer/README.md
@@ -51,13 +51,13 @@ This package offer better performance than its counterpart Chardet. Here are som
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
| ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 92 % | 220 ms | 5 file/sec |
-| charset-normalizer | **98 %** | **40 ms** | 25 file/sec |
+| [chardet](https://github.com/chardet/chardet) | 92 % | 200 ms | 5 file/sec |
+| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
| ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 1115 ms | 300 ms | 27 ms |
-| charset-normalizer | 460 ms | 240 ms | 18 ms |
+| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
+| charset-normalizer | 400 ms | 200 ms | 15 ms |
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
diff --git a/contrib/python/charset-normalizer/charset_normalizer/api.py b/contrib/python/charset-normalizer/charset_normalizer/api.py
index bdc8ed9893..ae08361bb4 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/api.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/api.py
@@ -1,12 +1,8 @@
import logging
+from os import PathLike
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set
-try:
- from os import PathLike
-except ImportError: # pragma: no cover
- PathLike = str # type: ignore
-
from .cd import (
coherence_ratio,
encoding_languages,
@@ -18,6 +14,7 @@ from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
+ cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
@@ -70,11 +67,11 @@ def from_bytes(
)
if explain:
- previous_logger_level = logger.level # type: int
+ previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)
- length = len(sequences) # type: int
+ length: int = len(sequences)
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
@@ -119,8 +116,8 @@ def from_bytes(
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)
- is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE # type: bool
- is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool
+ is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
+ is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
if is_too_small_sequence:
logger.log(
@@ -137,11 +134,11 @@ def from_bytes(
),
)
- prioritized_encodings = [] # type: List[str]
+ prioritized_encodings: List[str] = []
- specified_encoding = (
+ specified_encoding: Optional[str] = (
any_specified_encoding(sequences) if preemptive_behaviour else None
- ) # type: Optional[str]
+ )
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
@@ -151,15 +148,15 @@ def from_bytes(
specified_encoding,
)
- tested = set() # type: Set[str]
- tested_but_hard_failure = [] # type: List[str]
- tested_but_soft_failure = [] # type: List[str]
+ tested: Set[str] = set()
+ tested_but_hard_failure: List[str] = []
+ tested_but_soft_failure: List[str] = []
- fallback_ascii = None # type: Optional[CharsetMatch]
- fallback_u8 = None # type: Optional[CharsetMatch]
- fallback_specified = None # type: Optional[CharsetMatch]
+ fallback_ascii: Optional[CharsetMatch] = None
+ fallback_u8: Optional[CharsetMatch] = None
+ fallback_specified: Optional[CharsetMatch] = None
- results = CharsetMatches() # type: CharsetMatches
+ results: CharsetMatches = CharsetMatches()
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
@@ -190,11 +187,11 @@ def from_bytes(
tested.add(encoding_iana)
- decoded_payload = None # type: Optional[str]
- bom_or_sig_available = sig_encoding == encoding_iana # type: bool
- strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
+ decoded_payload: Optional[str] = None
+ bom_or_sig_available: bool = sig_encoding == encoding_iana
+ strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
- ) # type: bool
+ )
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
@@ -205,7 +202,7 @@ def from_bytes(
continue
try:
- is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
+ is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
@@ -240,7 +237,7 @@ def from_bytes(
tested_but_hard_failure.append(encoding_iana)
continue
- similar_soft_failure_test = False # type: bool
+ similar_soft_failure_test: bool = False
for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
@@ -262,11 +259,11 @@ def from_bytes(
int(length / steps),
)
- multi_byte_bonus = (
+ multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
- ) # type: bool
+ )
if multi_byte_bonus:
logger.log(
@@ -276,72 +273,47 @@ def from_bytes(
encoding_iana,
)
- max_chunk_gave_up = int(len(r_) / 4) # type: int
+ max_chunk_gave_up: int = int(len(r_) / 4)
max_chunk_gave_up = max(max_chunk_gave_up, 2)
- early_stop_count = 0 # type: int
+ early_stop_count: int = 0
lazy_str_hard_failure = False
- md_chunks = [] # type: List[str]
+ md_chunks: List[str] = []
md_ratios = []
- for i in r_:
- if i + chunk_size > length + 8:
- continue
-
- cut_sequence = sequences[i : i + chunk_size]
-
- if bom_or_sig_available and strip_sig_or_bom is False:
- cut_sequence = sig_payload + cut_sequence
-
- try:
- chunk = cut_sequence.decode(
- encoding_iana,
- errors="ignore" if is_multi_byte_decoder else "strict",
- ) # type: str
- except UnicodeDecodeError as e: # Lazy str loading may have missed something there
- logger.log(
- TRACE,
- "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
- encoding_iana,
- str(e),
- )
- early_stop_count = max_chunk_gave_up
- lazy_str_hard_failure = True
- break
+ try:
+ for chunk in cut_sequence_chunks(
+ sequences,
+ encoding_iana,
+ r_,
+ chunk_size,
+ bom_or_sig_available,
+ strip_sig_or_bom,
+ sig_payload,
+ is_multi_byte_decoder,
+ decoded_payload,
+ ):
+ md_chunks.append(chunk)
- # multi-byte bad cutting detector and adjustment
- # not the cleanest way to perform that fix but clever enough for now.
- if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+ md_ratios.append(mess_ratio(chunk, threshold))
- chunk_partial_size_chk = min(chunk_size, 16) # type: int
+ if md_ratios[-1] >= threshold:
+ early_stop_count += 1
- if (
- decoded_payload
- and chunk[:chunk_partial_size_chk] not in decoded_payload
+ if (early_stop_count >= max_chunk_gave_up) or (
+ bom_or_sig_available and strip_sig_or_bom is False
):
- for j in range(i, i - 4, -1):
- cut_sequence = sequences[j : i + chunk_size]
-
- if bom_or_sig_available and strip_sig_or_bom is False:
- cut_sequence = sig_payload + cut_sequence
-
- chunk = cut_sequence.decode(encoding_iana, errors="ignore")
-
- if chunk[:chunk_partial_size_chk] in decoded_payload:
- break
-
- md_chunks.append(chunk)
-
- md_ratios.append(mess_ratio(chunk, threshold))
-
- if md_ratios[-1] >= threshold:
- early_stop_count += 1
-
- if (early_stop_count >= max_chunk_gave_up) or (
- bom_or_sig_available and strip_sig_or_bom is False
- ):
- break
+ break
+ except UnicodeDecodeError as e: # Lazy str loading may have missed something there
+ logger.log(
+ TRACE,
+ "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+ encoding_iana,
+ str(e),
+ )
+ early_stop_count = max_chunk_gave_up
+ lazy_str_hard_failure = True
# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
@@ -362,9 +334,7 @@ def from_bytes(
tested_but_hard_failure.append(encoding_iana)
continue
- mean_mess_ratio = (
- sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
- ) # type: float
+ mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
@@ -399,7 +369,7 @@ def from_bytes(
)
if not is_multi_byte_decoder:
- target_languages = encoding_languages(encoding_iana) # type: List[str]
+ target_languages: List[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
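
Note: the api.py hunk above replaces the inline chunk-decoding loop with a call to a new `cut_sequence_chunks` generator imported from `charset_normalizer/utils.py`. That helper's body is not shown in this patch view; the sketch below is a hedged reconstruction inferred from the removed loop, for orientation only — the actual implementation may differ in details.

```python
from typing import Generator, Optional


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    """Yield decoded windows of the payload; UnicodeDecodeError propagates
    to the caller, which treats it as a hard failure for this encoding."""
    for i in offsets:
        chunk_end = i + chunk_size
        if chunk_end > len(sequences) + 8:
            continue

        cut_sequence = sequences[i:chunk_end]

        # Re-prepend the BOM/SIG when one was detected but intentionally kept.
        if bom_or_sig_available and strip_sig_or_bom is False:
            cut_sequence = sig_payload + cut_sequence

        chunk = cut_sequence.decode(
            encoding_iana,
            errors="ignore" if is_multi_byte_decoder else "strict",
        )

        # Multi-byte bad-cutting detector and adjustment, as in the removed loop:
        # if the window starts mid-character, slide the start back a few bytes.
        if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
            chunk_partial_size_chk = min(chunk_size, 16)
            if decoded_payload and chunk[:chunk_partial_size_chk] not in decoded_payload:
                for j in range(i, i - 4, -1):
                    cut_sequence = sequences[j:chunk_end]
                    if bom_or_sig_available and strip_sig_or_bom is False:
                        cut_sequence = sig_payload + cut_sequence
                    chunk = cut_sequence.decode(encoding_iana, errors="ignore")
                    if chunk[:chunk_partial_size_chk] in decoded_payload:
                        break

        yield chunk
```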
diff --git a/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py b/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py
index b2e56ff398..b9a3700f79 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py
@@ -1,1244 +1,1122 @@
# -*- coding: utf_8 -*-
-from collections import OrderedDict
+from typing import Dict, List
-FREQUENCIES = OrderedDict(
- [
- (
- "English",
- [
- "e",
- "a",
- "t",
- "i",
- "o",
- "n",
- "s",
- "r",
- "h",
- "l",
- "d",
- "c",
- "u",
- "m",
- "f",
- "p",
- "g",
- "w",
- "y",
- "b",
- "v",
- "k",
- "x",
- "j",
- "z",
- "q",
- ],
- ),
- (
- "German",
- [
- "e",
- "n",
- "i",
- "r",
- "s",
- "t",
- "a",
- "d",
- "h",
- "u",
- "l",
- "g",
- "o",
- "c",
- "m",
- "b",
- "f",
- "k",
- "w",
- "z",
- "p",
- "v",
- "ü",
- "ä",
- "ö",
- "j",
- ],
- ),
- (
- "French",
- [
- "e",
- "a",
- "s",
- "n",
- "i",
- "t",
- "r",
- "l",
- "u",
- "o",
- "d",
- "c",
- "p",
- "m",
- "é",
- "v",
- "g",
- "f",
- "b",
- "h",
- "q",
- "à",
- "x",
- "è",
- "y",
- "j",
- ],
- ),
- (
- "Dutch",
- [
- "e",
- "n",
- "a",
- "i",
- "r",
- "t",
- "o",
- "d",
- "s",
- "l",
- "g",
- "h",
- "v",
- "m",
- "u",
- "k",
- "c",
- "p",
- "b",
- "w",
- "j",
- "z",
- "f",
- "y",
- "x",
- "ë",
- ],
- ),
- (
- "Italian",
- [
- "e",
- "i",
- "a",
- "o",
- "n",
- "l",
- "t",
- "r",
- "s",
- "c",
- "d",
- "u",
- "p",
- "m",
- "g",
- "v",
- "f",
- "b",
- "z",
- "h",
- "q",
- "è",
- "à",
- "k",
- "y",
- "ò",
- ],
- ),
- (
- "Polish",
- [
- "a",
- "i",
- "o",
- "e",
- "n",
- "r",
- "z",
- "w",
- "s",
- "c",
- "t",
- "k",
- "y",
- "d",
- "p",
- "m",
- "u",
- "l",
- "j",
- "ł",
- "g",
- "b",
- "h",
- "ą",
- "ę",
- "ó",
- ],
- ),
- (
- "Spanish",
- [
- "e",
- "a",
- "o",
- "n",
- "s",
- "r",
- "i",
- "l",
- "d",
- "t",
- "c",
- "u",
- "m",
- "p",
- "b",
- "g",
- "v",
- "f",
- "y",
- "ó",
- "h",
- "q",
- "í",
- "j",
- "z",
- "á",
- ],
- ),
- (
- "Russian",
- [
- "о",
- "а",
- "е",
- "и",
- "н",
- "с",
- "т",
- "р",
- "в",
- "л",
- "к",
- "м",
- "д",
- "п",
- "у",
- "г",
- "я",
- "ы",
- "з",
- "б",
- "й",
- "ь",
- "ч",
- "х",
- "ж",
- "ц",
- ],
- ),
- (
- "Japanese",
- [
- "の",
- "に",
- "る",
- "た",
- "は",
- "ー",
- "と",
- "し",
- "を",
- "で",
- "て",
- "が",
- "い",
- "ン",
- "れ",
- "な",
- "年",
- "ス",
- "っ",
- "ル",
- "か",
- "ら",
- "あ",
- "さ",
- "も",
- "り",
- ],
- ),
- (
- "Portuguese",
- [
- "a",
- "e",
- "o",
- "s",
- "i",
- "r",
- "d",
- "n",
- "t",
- "m",
- "u",
- "c",
- "l",
- "p",
- "g",
- "v",
- "b",
- "f",
- "h",
- "ã",
- "q",
- "é",
- "ç",
- "á",
- "z",
- "í",
- ],
- ),
- (
- "Swedish",
- [
- "e",
- "a",
- "n",
- "r",
- "t",
- "s",
- "i",
- "l",
- "d",
- "o",
- "m",
- "k",
- "g",
- "v",
- "h",
- "f",
- "u",
- "p",
- "ä",
- "c",
- "b",
- "ö",
- "å",
- "y",
- "j",
- "x",
- ],
- ),
- (
- "Chinese",
- [
- "的",
- "一",
- "是",
- "不",
- "了",
- "在",
- "人",
- "有",
- "我",
- "他",
- "这",
- "个",
- "们",
- "中",
- "来",
- "上",
- "大",
- "为",
- "和",
- "国",
- "地",
- "到",
- "以",
- "说",
- "时",
- "要",
- "就",
- "出",
- "会",
- ],
- ),
- (
- "Ukrainian",
- [
- "о",
- "а",
- "н",
- "і",
- "и",
- "р",
- "в",
- "т",
- "е",
- "с",
- "к",
- "л",
- "у",
- "д",
- "м",
- "п",
- "з",
- "я",
- "ь",
- "б",
- "г",
- "й",
- "ч",
- "х",
- "ц",
- "ї",
- ],
- ),
- (
- "Norwegian",
- [
- "e",
- "r",
- "n",
- "t",
- "a",
- "s",
- "i",
- "o",
- "l",
- "d",
- "g",
- "k",
- "m",
- "v",
- "f",
- "p",
- "u",
- "b",
- "h",
- "å",
- "y",
- "j",
- "ø",
- "c",
- "æ",
- "w",
- ],
- ),
- (
- "Finnish",
- [
- "a",
- "i",
- "n",
- "t",
- "e",
- "s",
- "l",
- "o",
- "u",
- "k",
- "ä",
- "m",
- "r",
- "v",
- "j",
- "h",
- "p",
- "y",
- "d",
- "ö",
- "g",
- "c",
- "b",
- "f",
- "w",
- "z",
- ],
- ),
- (
- "Vietnamese",
- [
- "n",
- "h",
- "t",
- "i",
- "c",
- "g",
- "a",
- "o",
- "u",
- "m",
- "l",
- "r",
- "à",
- "đ",
- "s",
- "e",
- "v",
- "p",
- "b",
- "y",
- "ư",
- "d",
- "á",
- "k",
- "ộ",
- "ế",
- ],
- ),
- (
- "Czech",
- [
- "o",
- "e",
- "a",
- "n",
- "t",
- "s",
- "i",
- "l",
- "v",
- "r",
- "k",
- "d",
- "u",
- "m",
- "p",
- "í",
- "c",
- "h",
- "z",
- "á",
- "y",
- "j",
- "b",
- "ě",
- "é",
- "ř",
- ],
- ),
- (
- "Hungarian",
- [
- "e",
- "a",
- "t",
- "l",
- "s",
- "n",
- "k",
- "r",
- "i",
- "o",
- "z",
- "á",
- "é",
- "g",
- "m",
- "b",
- "y",
- "v",
- "d",
- "h",
- "u",
- "p",
- "j",
- "ö",
- "f",
- "c",
- ],
- ),
- (
- "Korean",
- [
- "이",
- "다",
- "에",
- "의",
- "는",
- "로",
- "하",
- "을",
- "가",
- "고",
- "지",
- "서",
- "한",
- "은",
- "기",
- "으",
- "년",
- "대",
- "사",
- "시",
- "를",
- "리",
- "도",
- "인",
- "스",
- "일",
- ],
- ),
- (
- "Indonesian",
- [
- "a",
- "n",
- "e",
- "i",
- "r",
- "t",
- "u",
- "s",
- "d",
- "k",
- "m",
- "l",
- "g",
- "p",
- "b",
- "o",
- "h",
- "y",
- "j",
- "c",
- "w",
- "f",
- "v",
- "z",
- "x",
- "q",
- ],
- ),
- (
- "Turkish",
- [
- "a",
- "e",
- "i",
- "n",
- "r",
- "l",
- "ı",
- "k",
- "d",
- "t",
- "s",
- "m",
- "y",
- "u",
- "o",
- "b",
- "ü",
- "ş",
- "v",
- "g",
- "z",
- "h",
- "c",
- "p",
- "ç",
- "ğ",
- ],
- ),
- (
- "Romanian",
- [
- "e",
- "i",
- "a",
- "r",
- "n",
- "t",
- "u",
- "l",
- "o",
- "c",
- "s",
- "d",
- "p",
- "m",
- "ă",
- "f",
- "v",
- "î",
- "g",
- "b",
- "ș",
- "ț",
- "z",
- "h",
- "â",
- "j",
- ],
- ),
- (
- "Farsi",
- [
- "ا",
- "ی",
- "ر",
- "د",
- "ن",
- "ه",
- "و",
- "م",
- "ت",
- "ب",
- "س",
- "ل",
- "ک",
- "ش",
- "ز",
- "ف",
- "گ",
- "ع",
- "خ",
- "ق",
- "ج",
- "آ",
- "پ",
- "ح",
- "ط",
- "ص",
- ],
- ),
- (
- "Arabic",
- [
- "ا",
- "ل",
- "ي",
- "م",
- "و",
- "ن",
- "ر",
- "ت",
- "ب",
- "ة",
- "ع",
- "د",
- "س",
- "ف",
- "ه",
- "ك",
- "ق",
- "أ",
- "ح",
- "ج",
- "ش",
- "ط",
- "ص",
- "ى",
- "خ",
- "إ",
- ],
- ),
- (
- "Danish",
- [
- "e",
- "r",
- "n",
- "t",
- "a",
- "i",
- "s",
- "d",
- "l",
- "o",
- "g",
- "m",
- "k",
- "f",
- "v",
- "u",
- "b",
- "h",
- "p",
- "å",
- "y",
- "ø",
- "æ",
- "c",
- "j",
- "w",
- ],
- ),
- (
- "Serbian",
- [
- "а",
- "и",
- "о",
- "е",
- "н",
- "р",
- "с",
- "у",
- "т",
- "к",
- "ј",
- "в",
- "д",
- "м",
- "п",
- "л",
- "г",
- "з",
- "б",
- "a",
- "i",
- "e",
- "o",
- "n",
- "ц",
- "ш",
- ],
- ),
- (
- "Lithuanian",
- [
- "i",
- "a",
- "s",
- "o",
- "r",
- "e",
- "t",
- "n",
- "u",
- "k",
- "m",
- "l",
- "p",
- "v",
- "d",
- "j",
- "g",
- "ė",
- "b",
- "y",
- "ų",
- "š",
- "ž",
- "c",
- "ą",
- "į",
- ],
- ),
- (
- "Slovene",
- [
- "e",
- "a",
- "i",
- "o",
- "n",
- "r",
- "s",
- "l",
- "t",
- "j",
- "v",
- "k",
- "d",
- "p",
- "m",
- "u",
- "z",
- "b",
- "g",
- "h",
- "č",
- "c",
- "š",
- "ž",
- "f",
- "y",
- ],
- ),
- (
- "Slovak",
- [
- "o",
- "a",
- "e",
- "n",
- "i",
- "r",
- "v",
- "t",
- "s",
- "l",
- "k",
- "d",
- "m",
- "p",
- "u",
- "c",
- "h",
- "j",
- "b",
- "z",
- "á",
- "y",
- "ý",
- "í",
- "č",
- "é",
- ],
- ),
- (
- "Hebrew",
- [
- "י",
- "ו",
- "ה",
- "ל",
- "ר",
- "ב",
- "ת",
- "מ",
- "א",
- "ש",
- "נ",
- "ע",
- "ם",
- "ד",
- "ק",
- "ח",
- "פ",
- "ס",
- "כ",
- "ג",
- "ט",
- "צ",
- "ן",
- "ז",
- "ך",
- ],
- ),
- (
- "Bulgarian",
- [
- "а",
- "и",
- "о",
- "е",
- "н",
- "т",
- "р",
- "с",
- "в",
- "л",
- "к",
- "д",
- "п",
- "м",
- "з",
- "г",
- "я",
- "ъ",
- "у",
- "б",
- "ч",
- "ц",
- "й",
- "ж",
- "щ",
- "х",
- ],
- ),
- (
- "Croatian",
- [
- "a",
- "i",
- "o",
- "e",
- "n",
- "r",
- "j",
- "s",
- "t",
- "u",
- "k",
- "l",
- "v",
- "d",
- "m",
- "p",
- "g",
- "z",
- "b",
- "c",
- "č",
- "h",
- "š",
- "ž",
- "ć",
- "f",
- ],
- ),
- (
- "Hindi",
- [
- "क",
- "र",
- "स",
- "न",
- "त",
- "म",
- "ह",
- "प",
- "य",
- "ल",
- "व",
- "ज",
- "द",
- "ग",
- "ब",
- "श",
- "ट",
- "अ",
- "ए",
- "थ",
- "भ",
- "ड",
- "च",
- "ध",
- "ष",
- "इ",
- ],
- ),
- (
- "Estonian",
- [
- "a",
- "i",
- "e",
- "s",
- "t",
- "l",
- "u",
- "n",
- "o",
- "k",
- "r",
- "d",
- "m",
- "v",
- "g",
- "p",
- "j",
- "h",
- "ä",
- "b",
- "õ",
- "ü",
- "f",
- "c",
- "ö",
- "y",
- ],
- ),
- (
- "Simple English",
- [
- "e",
- "a",
- "t",
- "i",
- "o",
- "n",
- "s",
- "r",
- "h",
- "l",
- "d",
- "c",
- "m",
- "u",
- "f",
- "p",
- "g",
- "w",
- "b",
- "y",
- "v",
- "k",
- "j",
- "x",
- "z",
- "q",
- ],
- ),
- (
- "Thai",
- [
- "า",
- "น",
- "ร",
- "อ",
- "ก",
- "เ",
- "ง",
- "ม",
- "ย",
- "ล",
- "ว",
- "ด",
- "ท",
- "ส",
- "ต",
- "ะ",
- "ป",
- "บ",
- "ค",
- "ห",
- "แ",
- "จ",
- "พ",
- "ช",
- "ข",
- "ใ",
- ],
- ),
- (
- "Greek",
- [
- "α",
- "τ",
- "ο",
- "ι",
- "ε",
- "ν",
- "ρ",
- "σ",
- "κ",
- "η",
- "π",
- "ς",
- "υ",
- "μ",
- "λ",
- "ί",
- "ό",
- "ά",
- "γ",
- "έ",
- "δ",
- "ή",
- "ω",
- "χ",
- "θ",
- "ύ",
- ],
- ),
- (
- "Tamil",
- [
- "க",
- "த",
- "ப",
- "ட",
- "ர",
- "ம",
- "ல",
- "ன",
- "வ",
- "ற",
- "ய",
- "ள",
- "ச",
- "ந",
- "இ",
- "ண",
- "அ",
- "ஆ",
- "ழ",
- "ங",
- "எ",
- "உ",
- "ஒ",
- "ஸ",
- ],
- ),
- (
- "Classical Chinese",
- [
- "之",
- "年",
- "為",
- "也",
- "以",
- "一",
- "人",
- "其",
- "者",
- "國",
- "有",
- "二",
- "十",
- "於",
- "曰",
- "三",
- "不",
- "大",
- "而",
- "子",
- "中",
- "五",
- "四",
- ],
- ),
- (
- "Kazakh",
- [
- "а",
- "ы",
- "е",
- "н",
- "т",
- "р",
- "л",
- "і",
- "д",
- "с",
- "м",
- "қ",
- "к",
- "о",
- "б",
- "и",
- "у",
- "ғ",
- "ж",
- "ң",
- "з",
- "ш",
- "й",
- "п",
- "г",
- "ө",
- ],
- ),
- ]
-)
+FREQUENCIES: Dict[str, List[str]] = {
+ "English": [
+ "e",
+ "a",
+ "t",
+ "i",
+ "o",
+ "n",
+ "s",
+ "r",
+ "h",
+ "l",
+ "d",
+ "c",
+ "u",
+ "m",
+ "f",
+ "p",
+ "g",
+ "w",
+ "y",
+ "b",
+ "v",
+ "k",
+ "x",
+ "j",
+ "z",
+ "q",
+ ],
+ "German": [
+ "e",
+ "n",
+ "i",
+ "r",
+ "s",
+ "t",
+ "a",
+ "d",
+ "h",
+ "u",
+ "l",
+ "g",
+ "o",
+ "c",
+ "m",
+ "b",
+ "f",
+ "k",
+ "w",
+ "z",
+ "p",
+ "v",
+ "ü",
+ "ä",
+ "ö",
+ "j",
+ ],
+ "French": [
+ "e",
+ "a",
+ "s",
+ "n",
+ "i",
+ "t",
+ "r",
+ "l",
+ "u",
+ "o",
+ "d",
+ "c",
+ "p",
+ "m",
+ "é",
+ "v",
+ "g",
+ "f",
+ "b",
+ "h",
+ "q",
+ "à",
+ "x",
+ "è",
+ "y",
+ "j",
+ ],
+ "Dutch": [
+ "e",
+ "n",
+ "a",
+ "i",
+ "r",
+ "t",
+ "o",
+ "d",
+ "s",
+ "l",
+ "g",
+ "h",
+ "v",
+ "m",
+ "u",
+ "k",
+ "c",
+ "p",
+ "b",
+ "w",
+ "j",
+ "z",
+ "f",
+ "y",
+ "x",
+ "ë",
+ ],
+ "Italian": [
+ "e",
+ "i",
+ "a",
+ "o",
+ "n",
+ "l",
+ "t",
+ "r",
+ "s",
+ "c",
+ "d",
+ "u",
+ "p",
+ "m",
+ "g",
+ "v",
+ "f",
+ "b",
+ "z",
+ "h",
+ "q",
+ "è",
+ "à",
+ "k",
+ "y",
+ "ò",
+ ],
+ "Polish": [
+ "a",
+ "i",
+ "o",
+ "e",
+ "n",
+ "r",
+ "z",
+ "w",
+ "s",
+ "c",
+ "t",
+ "k",
+ "y",
+ "d",
+ "p",
+ "m",
+ "u",
+ "l",
+ "j",
+ "ł",
+ "g",
+ "b",
+ "h",
+ "ą",
+ "ę",
+ "ó",
+ ],
+ "Spanish": [
+ "e",
+ "a",
+ "o",
+ "n",
+ "s",
+ "r",
+ "i",
+ "l",
+ "d",
+ "t",
+ "c",
+ "u",
+ "m",
+ "p",
+ "b",
+ "g",
+ "v",
+ "f",
+ "y",
+ "ó",
+ "h",
+ "q",
+ "í",
+ "j",
+ "z",
+ "á",
+ ],
+ "Russian": [
+ "о",
+ "а",
+ "е",
+ "и",
+ "н",
+ "с",
+ "т",
+ "р",
+ "в",
+ "л",
+ "к",
+ "м",
+ "д",
+ "п",
+ "у",
+ "г",
+ "я",
+ "ы",
+ "з",
+ "б",
+ "й",
+ "ь",
+ "ч",
+ "х",
+ "ж",
+ "ц",
+ ],
+ "Japanese": [
+ "の",
+ "に",
+ "る",
+ "た",
+ "は",
+ "ー",
+ "と",
+ "し",
+ "を",
+ "で",
+ "て",
+ "が",
+ "い",
+ "ン",
+ "れ",
+ "な",
+ "年",
+ "ス",
+ "っ",
+ "ル",
+ "か",
+ "ら",
+ "あ",
+ "さ",
+ "も",
+ "り",
+ ],
+ "Portuguese": [
+ "a",
+ "e",
+ "o",
+ "s",
+ "i",
+ "r",
+ "d",
+ "n",
+ "t",
+ "m",
+ "u",
+ "c",
+ "l",
+ "p",
+ "g",
+ "v",
+ "b",
+ "f",
+ "h",
+ "ã",
+ "q",
+ "é",
+ "ç",
+ "á",
+ "z",
+ "í",
+ ],
+ "Swedish": [
+ "e",
+ "a",
+ "n",
+ "r",
+ "t",
+ "s",
+ "i",
+ "l",
+ "d",
+ "o",
+ "m",
+ "k",
+ "g",
+ "v",
+ "h",
+ "f",
+ "u",
+ "p",
+ "ä",
+ "c",
+ "b",
+ "ö",
+ "å",
+ "y",
+ "j",
+ "x",
+ ],
+ "Chinese": [
+ "的",
+ "一",
+ "是",
+ "不",
+ "了",
+ "在",
+ "人",
+ "有",
+ "我",
+ "他",
+ "这",
+ "个",
+ "们",
+ "中",
+ "来",
+ "上",
+ "大",
+ "为",
+ "和",
+ "国",
+ "地",
+ "到",
+ "以",
+ "说",
+ "时",
+ "要",
+ "就",
+ "出",
+ "会",
+ ],
+ "Ukrainian": [
+ "о",
+ "а",
+ "н",
+ "і",
+ "и",
+ "р",
+ "в",
+ "т",
+ "е",
+ "с",
+ "к",
+ "л",
+ "у",
+ "д",
+ "м",
+ "п",
+ "з",
+ "я",
+ "ь",
+ "б",
+ "г",
+ "й",
+ "ч",
+ "х",
+ "ц",
+ "ї",
+ ],
+ "Norwegian": [
+ "e",
+ "r",
+ "n",
+ "t",
+ "a",
+ "s",
+ "i",
+ "o",
+ "l",
+ "d",
+ "g",
+ "k",
+ "m",
+ "v",
+ "f",
+ "p",
+ "u",
+ "b",
+ "h",
+ "å",
+ "y",
+ "j",
+ "ø",
+ "c",
+ "æ",
+ "w",
+ ],
+ "Finnish": [
+ "a",
+ "i",
+ "n",
+ "t",
+ "e",
+ "s",
+ "l",
+ "o",
+ "u",
+ "k",
+ "ä",
+ "m",
+ "r",
+ "v",
+ "j",
+ "h",
+ "p",
+ "y",
+ "d",
+ "ö",
+ "g",
+ "c",
+ "b",
+ "f",
+ "w",
+ "z",
+ ],
+ "Vietnamese": [
+ "n",
+ "h",
+ "t",
+ "i",
+ "c",
+ "g",
+ "a",
+ "o",
+ "u",
+ "m",
+ "l",
+ "r",
+ "à",
+ "đ",
+ "s",
+ "e",
+ "v",
+ "p",
+ "b",
+ "y",
+ "ư",
+ "d",
+ "á",
+ "k",
+ "ộ",
+ "ế",
+ ],
+ "Czech": [
+ "o",
+ "e",
+ "a",
+ "n",
+ "t",
+ "s",
+ "i",
+ "l",
+ "v",
+ "r",
+ "k",
+ "d",
+ "u",
+ "m",
+ "p",
+ "í",
+ "c",
+ "h",
+ "z",
+ "á",
+ "y",
+ "j",
+ "b",
+ "ě",
+ "é",
+ "ř",
+ ],
+ "Hungarian": [
+ "e",
+ "a",
+ "t",
+ "l",
+ "s",
+ "n",
+ "k",
+ "r",
+ "i",
+ "o",
+ "z",
+ "á",
+ "é",
+ "g",
+ "m",
+ "b",
+ "y",
+ "v",
+ "d",
+ "h",
+ "u",
+ "p",
+ "j",
+ "ö",
+ "f",
+ "c",
+ ],
+ "Korean": [
+ "이",
+ "다",
+ "에",
+ "의",
+ "는",
+ "로",
+ "하",
+ "을",
+ "가",
+ "고",
+ "지",
+ "서",
+ "한",
+ "은",
+ "기",
+ "으",
+ "년",
+ "대",
+ "사",
+ "시",
+ "를",
+ "리",
+ "도",
+ "인",
+ "스",
+ "일",
+ ],
+ "Indonesian": [
+ "a",
+ "n",
+ "e",
+ "i",
+ "r",
+ "t",
+ "u",
+ "s",
+ "d",
+ "k",
+ "m",
+ "l",
+ "g",
+ "p",
+ "b",
+ "o",
+ "h",
+ "y",
+ "j",
+ "c",
+ "w",
+ "f",
+ "v",
+ "z",
+ "x",
+ "q",
+ ],
+ "Turkish": [
+ "a",
+ "e",
+ "i",
+ "n",
+ "r",
+ "l",
+ "ı",
+ "k",
+ "d",
+ "t",
+ "s",
+ "m",
+ "y",
+ "u",
+ "o",
+ "b",
+ "ü",
+ "ş",
+ "v",
+ "g",
+ "z",
+ "h",
+ "c",
+ "p",
+ "ç",
+ "ğ",
+ ],
+ "Romanian": [
+ "e",
+ "i",
+ "a",
+ "r",
+ "n",
+ "t",
+ "u",
+ "l",
+ "o",
+ "c",
+ "s",
+ "d",
+ "p",
+ "m",
+ "ă",
+ "f",
+ "v",
+ "î",
+ "g",
+ "b",
+ "ș",
+ "ț",
+ "z",
+ "h",
+ "â",
+ "j",
+ ],
+ "Farsi": [
+ "ا",
+ "ی",
+ "ر",
+ "د",
+ "ن",
+ "ه",
+ "و",
+ "م",
+ "ت",
+ "ب",
+ "س",
+ "ل",
+ "ک",
+ "ش",
+ "ز",
+ "ف",
+ "گ",
+ "ع",
+ "خ",
+ "ق",
+ "ج",
+ "آ",
+ "پ",
+ "ح",
+ "ط",
+ "ص",
+ ],
+ "Arabic": [
+ "ا",
+ "ل",
+ "ي",
+ "م",
+ "و",
+ "ن",
+ "ر",
+ "ت",
+ "ب",
+ "ة",
+ "ع",
+ "د",
+ "س",
+ "ف",
+ "ه",
+ "ك",
+ "ق",
+ "أ",
+ "ح",
+ "ج",
+ "ش",
+ "ط",
+ "ص",
+ "ى",
+ "خ",
+ "إ",
+ ],
+ "Danish": [
+ "e",
+ "r",
+ "n",
+ "t",
+ "a",
+ "i",
+ "s",
+ "d",
+ "l",
+ "o",
+ "g",
+ "m",
+ "k",
+ "f",
+ "v",
+ "u",
+ "b",
+ "h",
+ "p",
+ "å",
+ "y",
+ "ø",
+ "æ",
+ "c",
+ "j",
+ "w",
+ ],
+ "Serbian": [
+ "а",
+ "и",
+ "о",
+ "е",
+ "н",
+ "р",
+ "с",
+ "у",
+ "т",
+ "к",
+ "ј",
+ "в",
+ "д",
+ "м",
+ "п",
+ "л",
+ "г",
+ "з",
+ "б",
+ "a",
+ "i",
+ "e",
+ "o",
+ "n",
+ "ц",
+ "ш",
+ ],
+ "Lithuanian": [
+ "i",
+ "a",
+ "s",
+ "o",
+ "r",
+ "e",
+ "t",
+ "n",
+ "u",
+ "k",
+ "m",
+ "l",
+ "p",
+ "v",
+ "d",
+ "j",
+ "g",
+ "ė",
+ "b",
+ "y",
+ "ų",
+ "š",
+ "ž",
+ "c",
+ "ą",
+ "į",
+ ],
+ "Slovene": [
+ "e",
+ "a",
+ "i",
+ "o",
+ "n",
+ "r",
+ "s",
+ "l",
+ "t",
+ "j",
+ "v",
+ "k",
+ "d",
+ "p",
+ "m",
+ "u",
+ "z",
+ "b",
+ "g",
+ "h",
+ "č",
+ "c",
+ "š",
+ "ž",
+ "f",
+ "y",
+ ],
+ "Slovak": [
+ "o",
+ "a",
+ "e",
+ "n",
+ "i",
+ "r",
+ "v",
+ "t",
+ "s",
+ "l",
+ "k",
+ "d",
+ "m",
+ "p",
+ "u",
+ "c",
+ "h",
+ "j",
+ "b",
+ "z",
+ "á",
+ "y",
+ "ý",
+ "í",
+ "č",
+ "é",
+ ],
+ "Hebrew": [
+ "י",
+ "ו",
+ "ה",
+ "ל",
+ "ר",
+ "ב",
+ "ת",
+ "מ",
+ "א",
+ "ש",
+ "נ",
+ "ע",
+ "ם",
+ "ד",
+ "ק",
+ "ח",
+ "פ",
+ "ס",
+ "כ",
+ "ג",
+ "ט",
+ "צ",
+ "ן",
+ "ז",
+ "ך",
+ ],
+ "Bulgarian": [
+ "а",
+ "и",
+ "о",
+ "е",
+ "н",
+ "т",
+ "р",
+ "с",
+ "в",
+ "л",
+ "к",
+ "д",
+ "п",
+ "м",
+ "з",
+ "г",
+ "я",
+ "ъ",
+ "у",
+ "б",
+ "ч",
+ "ц",
+ "й",
+ "ж",
+ "щ",
+ "х",
+ ],
+ "Croatian": [
+ "a",
+ "i",
+ "o",
+ "e",
+ "n",
+ "r",
+ "j",
+ "s",
+ "t",
+ "u",
+ "k",
+ "l",
+ "v",
+ "d",
+ "m",
+ "p",
+ "g",
+ "z",
+ "b",
+ "c",
+ "č",
+ "h",
+ "š",
+ "ž",
+ "ć",
+ "f",
+ ],
+ "Hindi": [
+ "क",
+ "र",
+ "स",
+ "न",
+ "त",
+ "म",
+ "ह",
+ "प",
+ "य",
+ "ल",
+ "व",
+ "ज",
+ "द",
+ "ग",
+ "ब",
+ "श",
+ "ट",
+ "अ",
+ "ए",
+ "थ",
+ "भ",
+ "ड",
+ "च",
+ "ध",
+ "ष",
+ "इ",
+ ],
+ "Estonian": [
+ "a",
+ "i",
+ "e",
+ "s",
+ "t",
+ "l",
+ "u",
+ "n",
+ "o",
+ "k",
+ "r",
+ "d",
+ "m",
+ "v",
+ "g",
+ "p",
+ "j",
+ "h",
+ "ä",
+ "b",
+ "õ",
+ "ü",
+ "f",
+ "c",
+ "ö",
+ "y",
+ ],
+ "Simple English": [
+ "e",
+ "a",
+ "t",
+ "i",
+ "o",
+ "n",
+ "s",
+ "r",
+ "h",
+ "l",
+ "d",
+ "c",
+ "m",
+ "u",
+ "f",
+ "p",
+ "g",
+ "w",
+ "b",
+ "y",
+ "v",
+ "k",
+ "j",
+ "x",
+ "z",
+ "q",
+ ],
+ "Thai": [
+ "า",
+ "น",
+ "ร",
+ "อ",
+ "ก",
+ "เ",
+ "ง",
+ "ม",
+ "ย",
+ "ล",
+ "ว",
+ "ด",
+ "ท",
+ "ส",
+ "ต",
+ "ะ",
+ "ป",
+ "บ",
+ "ค",
+ "ห",
+ "แ",
+ "จ",
+ "พ",
+ "ช",
+ "ข",
+ "ใ",
+ ],
+ "Greek": [
+ "α",
+ "τ",
+ "ο",
+ "ι",
+ "ε",
+ "ν",
+ "ρ",
+ "σ",
+ "κ",
+ "η",
+ "π",
+ "ς",
+ "υ",
+ "μ",
+ "λ",
+ "ί",
+ "ό",
+ "ά",
+ "γ",
+ "έ",
+ "δ",
+ "ή",
+ "ω",
+ "χ",
+ "θ",
+ "ύ",
+ ],
+ "Tamil": [
+ "க",
+ "த",
+ "ப",
+ "ட",
+ "ர",
+ "ம",
+ "ல",
+ "ன",
+ "வ",
+ "ற",
+ "ய",
+ "ள",
+ "ச",
+ "ந",
+ "இ",
+ "ண",
+ "அ",
+ "ஆ",
+ "ழ",
+ "ங",
+ "எ",
+ "உ",
+ "ஒ",
+ "ஸ",
+ ],
+ "Classical Chinese": [
+ "之",
+ "年",
+ "為",
+ "也",
+ "以",
+ "一",
+ "人",
+ "其",
+ "者",
+ "國",
+ "有",
+ "二",
+ "十",
+ "於",
+ "曰",
+ "三",
+ "不",
+ "大",
+ "而",
+ "子",
+ "中",
+ "五",
+ "四",
+ ],
+ "Kazakh": [
+ "а",
+ "ы",
+ "е",
+ "н",
+ "т",
+ "р",
+ "л",
+ "і",
+ "д",
+ "с",
+ "м",
+ "қ",
+ "к",
+ "о",
+ "б",
+ "и",
+ "у",
+ "ғ",
+ "ж",
+ "ң",
+ "з",
+ "ш",
+ "й",
+ "п",
+ "г",
+ "ө",
+ ],
+}
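
Note: dropping `OrderedDict` for a plain dict literal in `FREQUENCIES` relies on regular dicts preserving insertion order — a language guarantee since Python 3.7 and true in CPython 3.6, the new minimum — so iteration over `FREQUENCIES` elsewhere in the package stays deterministic. A minimal illustration with toy data:

```python
# Plain dicts keep insertion order on the supported Python versions,
# so the OrderedDict wrapper is no longer needed.
freq = {"English": ["e", "a", "t"], "German": ["e", "n", "i"]}
assert list(freq) == ["English", "German"]
```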
diff --git a/contrib/python/charset-normalizer/charset_normalizer/cd.py b/contrib/python/charset-normalizer/charset_normalizer/cd.py
index 8429a0eb20..8998bb545c 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/cd.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/cd.py
@@ -1,6 +1,6 @@
import importlib
from codecs import IncrementalDecoder
-from collections import Counter, OrderedDict
+from collections import Counter
from functools import lru_cache
from typing import Dict, List, Optional, Tuple
@@ -26,15 +26,15 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
- p = decoder(errors="ignore") # type: IncrementalDecoder
- seen_ranges = {} # type: Dict[str, int]
- character_count = 0 # type: int
+ p: IncrementalDecoder = decoder(errors="ignore")
+ seen_ranges: Dict[str, int] = {}
+ character_count: int = 0
for i in range(0x40, 0xFF):
- chunk = p.decode(bytes([i])) # type: str
+ chunk: str = p.decode(bytes([i]))
if chunk:
- character_range = unicode_range(chunk) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(chunk)
if character_range is None:
continue
@@ -58,7 +58,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
"""
Return inferred languages used with a unicode range.
"""
- languages = [] # type: List[str]
+ languages: List[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
@@ -75,8 +75,8 @@ def encoding_languages(iana_name: str) -> List[str]:
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
- unicode_ranges = encoding_unicode_range(iana_name) # type: List[str]
- primary_range = None # type: Optional[str]
+ unicode_ranges: List[str] = encoding_unicode_range(iana_name)
+ primary_range: Optional[str] = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
@@ -115,8 +115,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
- target_have_accents = False # type: bool
- target_pure_latin = True # type: bool
+ target_have_accents: bool = False
+ target_pure_latin: bool = True
for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
@@ -133,7 +133,7 @@ def alphabet_languages(
"""
Return associated languages associated to given characters.
"""
- languages = [] # type: List[Tuple[str, float]]
+ languages: List[Tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
@@ -147,13 +147,13 @@ def alphabet_languages(
if target_have_accents is False and source_have_accents:
continue
- character_count = len(language_characters) # type: int
+ character_count: int = len(language_characters)
- character_match_count = len(
+ character_match_count: int = len(
[c for c in language_characters if c in characters]
- ) # type: int
+ )
- ratio = character_match_count / character_count # type: float
+ ratio: float = character_match_count / character_count
if ratio >= 0.2:
languages.append((language, ratio))
@@ -174,36 +174,33 @@ def characters_popularity_compare(
if language not in FREQUENCIES:
raise ValueError("{} not available".format(language))
- character_approved_count = 0 # type: int
+ character_approved_count: int = 0
+ FREQUENCIES_language_set = set(FREQUENCIES[language])
for character in ordered_characters:
- if character not in FREQUENCIES[language]:
+ if character not in FREQUENCIES_language_set:
continue
- characters_before_source = FREQUENCIES[language][
+ characters_before_source: List[str] = FREQUENCIES[language][
0 : FREQUENCIES[language].index(character)
- ] # type: List[str]
- characters_after_source = FREQUENCIES[language][
+ ]
+ characters_after_source: List[str] = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
- ] # type: List[str]
-
- characters_before = ordered_characters[
+ ]
+ characters_before: List[str] = ordered_characters[
0 : ordered_characters.index(character)
- ] # type: List[str]
- characters_after = ordered_characters[
+ ]
+ characters_after: List[str] = ordered_characters[
ordered_characters.index(character) :
- ] # type: List[str]
-
- before_match_count = [
- e in characters_before for e in characters_before_source
- ].count(
- True
- ) # type: int
- after_match_count = [
- e in characters_after for e in characters_after_source
- ].count(
- True
- ) # type: int
+ ]
+
+ before_match_count: int = len(
+ set(characters_before) & set(characters_before_source)
+ )
+
+ after_match_count: int = len(
+ set(characters_after) & set(characters_after_source)
+ )
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
@@ -229,18 +226,18 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
One containing the latin letters and the other hebrew.
"""
- layers = OrderedDict() # type: Dict[str, str]
+ layers: Dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
- layer_target_range = None # type: Optional[str]
+ layer_target_range: Optional[str] = None
for discovered_range in layers:
if (
@@ -267,7 +264,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
This function merge results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
- per_language_ratios = OrderedDict() # type: Dict[str, List[float]]
+ per_language_ratios: Dict[str, List[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
@@ -299,10 +296,10 @@ def coherence_ratio(
A layer = Character extraction by alphabets/ranges.
"""
- results = [] # type: List[Tuple[str, float]]
- ignore_non_latin = False # type: bool
+ results: List[Tuple[str, float]] = []
+ ignore_non_latin: bool = False
- sufficient_match_count = 0 # type: int
+ sufficient_match_count: int = 0
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
@@ -310,22 +307,22 @@ def coherence_ratio(
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
- sequence_frequencies = Counter(layer) # type: Counter
+ sequence_frequencies: Counter = Counter(layer)
most_common = sequence_frequencies.most_common()
- character_count = sum(o for c, o in most_common) # type: int
+ character_count: int = sum(o for c, o in most_common)
if character_count <= TOO_SMALL_SEQUENCE:
continue
- popular_character_ordered = [c for c, o in most_common] # type: List[str]
+ popular_character_ordered: List[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
- ratio = characters_popularity_compare(
+ ratio: float = characters_popularity_compare(
language, popular_character_ordered
- ) # type: float
+ )
if ratio < threshold:
continue
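
Note: the `characters_popularity_compare` hunk above swaps a membership-scan count for a set intersection. The two agree because each frequency list holds unique characters, and the set form avoids rescanning one list per element of the other. A quick check with made-up lists:

```python
# Count how many characters from the reference slice also occur in the observed slice.
characters_before = ["e", "a", "t", "i"]          # made-up observed slice
characters_before_source = ["a", "o", "e", "u"]   # made-up reference slice

old_count = [c in characters_before for c in characters_before_source].count(True)
new_count = len(set(characters_before) & set(characters_before_source))

assert old_count == new_count == 2  # equivalent while entries are unique
```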
diff --git a/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py b/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py
index 5f912c923b..540e5e2a1a 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py
@@ -5,6 +5,11 @@ from os.path import abspath
from platform import python_version
from typing import List
+try:
+ from unicodedata2 import unidata_version
+except ImportError:
+ from unicodedata import unidata_version
+
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
@@ -111,7 +116,7 @@ def cli_detect(argv: List[str] = None) -> int:
"-t",
"--threshold",
action="store",
- default=0.1,
+ default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
@@ -119,8 +124,8 @@ def cli_detect(argv: List[str] = None) -> int:
parser.add_argument(
"--version",
action="version",
- version="Charset-Normalizer {} - Python {}".format(
- __version__, python_version()
+ version="Charset-Normalizer {} - Python {} - Unicode {}".format(
+ __version__, python_version(), unidata_version
),
help="Show version information and exit.",
)
@@ -229,7 +234,7 @@ def cli_detect(argv: List[str] = None) -> int:
my_file.close()
continue
- o_ = my_file.name.split(".") # type: List[str]
+ o_: List[str] = my_file.name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
diff --git a/contrib/python/charset-normalizer/charset_normalizer/constant.py b/contrib/python/charset-normalizer/charset_normalizer/constant.py
index c32f5cf2d6..ac840c461f 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/constant.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/constant.py
@@ -1,5 +1,4 @@
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
-from collections import OrderedDict
from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Set, Union
@@ -7,31 +6,26 @@ from typing import Dict, List, Set, Union
from .assets import FREQUENCIES
# Contain for each eligible encoding a list of/item bytes SIG/BOM
-ENCODING_MARKS = OrderedDict(
- [
- ("utf_8", BOM_UTF8),
- (
- "utf_7",
- [
- b"\x2b\x2f\x76\x38",
- b"\x2b\x2f\x76\x39",
- b"\x2b\x2f\x76\x2b",
- b"\x2b\x2f\x76\x2f",
- b"\x2b\x2f\x76\x38\x2d",
- ],
- ),
- ("gb18030", b"\x84\x31\x95\x33"),
- ("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]),
- ("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]),
- ]
-) # type: Dict[str, Union[bytes, List[bytes]]]
+ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
+ "utf_8": BOM_UTF8,
+ "utf_7": [
+ b"\x2b\x2f\x76\x38",
+ b"\x2b\x2f\x76\x39",
+ b"\x2b\x2f\x76\x2b",
+ b"\x2b\x2f\x76\x2f",
+ b"\x2b\x2f\x76\x38\x2d",
+ ],
+ "gb18030": b"\x84\x31\x95\x33",
+ "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
+ "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
+}
-TOO_SMALL_SEQUENCE = 32 # type: int
-TOO_BIG_SEQUENCE = int(10e6) # type: int
+TOO_SMALL_SEQUENCE: int = 32
+TOO_BIG_SEQUENCE: int = int(10e6)
-UTF8_MAXIMAL_ALLOCATION = 1112064 # type: int
+UTF8_MAXIMAL_ALLOCATION: int = 1112064
-UNICODE_RANGES_COMBINED = {
+UNICODE_RANGES_COMBINED: Dict[str, range] = {
"Control character": range(31 + 1),
"Basic Latin": range(32, 127 + 1),
"Latin-1 Supplement": range(128, 255 + 1),
@@ -311,10 +305,10 @@ UNICODE_RANGES_COMBINED = {
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
"Tags": range(917504, 917631 + 1),
"Variation Selectors Supplement": range(917760, 917999 + 1),
-} # type: Dict[str, range]
+}
-UNICODE_SECONDARY_RANGE_KEYWORD = [
+UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
"Supplement",
"Extended",
"Extensions",
@@ -330,25 +324,25 @@ UNICODE_SECONDARY_RANGE_KEYWORD = [
"Shapes",
"Supplemental",
"Tags",
-] # type: List[str]
+]
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
IGNORECASE,
)
-IANA_SUPPORTED = sorted(
+IANA_SUPPORTED: List[str] = sorted(
filter(
lambda x: x.endswith("_codec") is False
and x not in {"rot_13", "tactis", "mbcs"},
list(set(aliases.values())),
)
-) # type: List[str]
+)
-IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int
+IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
# pre-computed code page that are similar using the function cp_similarity.
-IANA_SUPPORTED_SIMILAR = {
+IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
"cp1125": ["cp866"],
@@ -434,10 +428,10 @@ IANA_SUPPORTED_SIMILAR = {
"mac_turkish": ["mac_iceland", "mac_roman"],
"ptcp154": ["cp1251", "kz1048"],
"tis_620": ["iso8859_11"],
-} # type: Dict[str, List[str]]
+}
-CHARDET_CORRESPONDENCE = {
+CHARDET_CORRESPONDENCE: Dict[str, str] = {
"iso2022_kr": "ISO-2022-KR",
"iso2022_jp": "ISO-2022-JP",
"euc_kr": "EUC-KR",
@@ -470,10 +464,10 @@ CHARDET_CORRESPONDENCE = {
"cp1256": "windows-1256",
"cp1254": "Windows-1254",
"cp949": "CP949",
-} # type: Dict[str, str]
+}
-COMMON_SAFE_ASCII_CHARACTERS = {
+COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
"<",
">",
"=",
@@ -489,15 +483,15 @@ COMMON_SAFE_ASCII_CHARACTERS = {
"|",
'"',
"-",
-} # type: Set[str]
+}
-KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
-ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str]
+KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
+ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
-LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int
+LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
# Logging LEVEL bellow DEBUG
-TRACE = 5 # type: int
+TRACE: int = 5
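
Note: the bulk of this patch, in constant.py as in the other modules, migrates from `# type:` comments to PEP 526 variable annotations, which the package can use now that Python 3.5 support is dropped. A generic before/after sketch (names are illustrative, not taken from the patch):

```python
from typing import List

# Old style: the type lives in a comment that only static checkers read.
timeout = 32  # type: int
codecs = ["utf_8", "utf_16"]  # type: List[str]

# New style (PEP 526): the annotation is real syntax, checked the same way
# by mypy and also introspectable at runtime via __annotations__.
timeout: int = 32
codecs: List[str] = ["utf_8", "utf_16"]
```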
diff --git a/contrib/python/charset-normalizer/charset_normalizer/md.py b/contrib/python/charset-normalizer/charset_normalizer/md.py
index f3d6505cf0..31808af84c 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/md.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/md.py
@@ -16,6 +16,7 @@ from .utils import (
is_separator,
is_symbol,
is_thai,
+ is_unprintable,
remove_accent,
unicode_range,
)
@@ -57,12 +58,12 @@ class MessDetectorPlugin:
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._punctuation_count = 0 # type: int
- self._symbol_count = 0 # type: int
- self._character_count = 0 # type: int
+ self._punctuation_count: int = 0
+ self._symbol_count: int = 0
+ self._character_count: int = 0
- self._last_printable_char = None # type: Optional[str]
- self._frenzy_symbol_in_word = False # type: bool
+ self._last_printable_char: Optional[str] = None
+ self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
return character.isprintable()
@@ -95,17 +96,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
if self._character_count == 0:
return 0.0
- ratio_of_punctuation = (
+ ratio_of_punctuation: float = (
self._punctuation_count + self._symbol_count
- ) / self._character_count # type: float
+ ) / self._character_count
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._character_count = 0 # type: int
- self._accentuated_count = 0 # type: int
+ self._character_count: int = 0
+ self._accentuated_count: int = 0
def eligible(self, character: str) -> bool:
return character.isalpha()
@@ -124,26 +125,20 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
- ratio_of_accentuation = (
- self._accentuated_count / self._character_count
- ) # type: float
+ ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._unprintable_count = 0 # type: int
- self._character_count = 0 # type: int
+ self._unprintable_count: int = 0
+ self._character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
- if (
- character.isspace() is False # includes \n \t \r \v
- and character.isprintable() is False
- and character != "\x1A" # Why? Its the ASCII substitute character.
- ):
+ if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
@@ -160,10 +155,10 @@ class UnprintablePlugin(MessDetectorPlugin):
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._successive_count = 0 # type: int
- self._character_count = 0 # type: int
+ self._successive_count: int = 0
+ self._character_count: int = 0
- self._last_latin_character = None # type: Optional[str]
+ self._last_latin_character: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
@@ -197,9 +192,9 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
- self._suspicious_successive_range_count = 0 # type: int
- self._character_count = 0 # type: int
- self._last_printable_seen = None # type: Optional[str]
+ self._suspicious_successive_range_count: int = 0
+ self._character_count: int = 0
+ self._last_printable_seen: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isprintable()
@@ -219,10 +214,8 @@ class SuspiciousRange(MessDetectorPlugin):
self._last_printable_seen = character
return
- unicode_range_a = unicode_range(
- self._last_printable_seen
- ) # type: Optional[str]
- unicode_range_b = unicode_range(character) # type: Optional[str]
+ unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
+ unicode_range_b: Optional[str] = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
@@ -239,9 +232,9 @@ class SuspiciousRange(MessDetectorPlugin):
if self._character_count == 0:
return 0.0
- ratio_of_suspicious_range_usage = (
+ ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
- ) / self._character_count # type: float
+ ) / self._character_count
if ratio_of_suspicious_range_usage < 0.1:
return 0.0
@@ -251,25 +244,25 @@ class SuspiciousRange(MessDetectorPlugin):
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._word_count = 0 # type: int
- self._bad_word_count = 0 # type: int
- self._foreign_long_count = 0 # type: int
+ self._word_count: int = 0
+ self._bad_word_count: int = 0
+ self._foreign_long_count: int = 0
- self._is_current_word_bad = False # type: bool
- self._foreign_long_watch = False # type: bool
+ self._is_current_word_bad: bool = False
+ self._foreign_long_watch: bool = False
- self._character_count = 0 # type: int
- self._bad_character_count = 0 # type: int
+ self._character_count: int = 0
+ self._bad_character_count: int = 0
- self._buffer = "" # type: str
- self._buffer_accent_count = 0 # type: int
+ self._buffer: str = ""
+ self._buffer_accent_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character.isalpha():
- self._buffer = "".join([self._buffer, character])
+ self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
@@ -289,7 +282,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
- buffer_length = len(self._buffer) # type: int
+ buffer_length: int = len(self._buffer)
self._character_count += buffer_length
@@ -346,8 +339,8 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
"""
def __init__(self) -> None:
- self._wrong_stop_count = 0 # type: int
- self._cjk_character_count = 0 # type: int
+ self._wrong_stop_count: int = 0
+ self._cjk_character_count: int = 0
def eligible(self, character: str) -> bool:
return True
@@ -372,17 +365,17 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self) -> None:
- self._buf = False # type: bool
+ self._buf: bool = False
- self._character_count_since_last_sep = 0 # type: int
+ self._character_count_since_last_sep: int = 0
- self._successive_upper_lower_count = 0 # type: int
- self._successive_upper_lower_count_final = 0 # type: int
+ self._successive_upper_lower_count: int = 0
+ self._successive_upper_lower_count_final: int = 0
- self._character_count = 0 # type: int
+ self._character_count: int = 0
- self._last_alpha_seen = None # type: Optional[str]
- self._current_ascii_only = True # type: bool
+ self._last_alpha_seen: Optional[str] = None
+ self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
return True
@@ -446,6 +439,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
return self._successive_upper_lower_count_final / self._character_count
+@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
@@ -524,16 +518,16 @@ def mess_ratio(
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
- detectors = [
+ detectors: List[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
- ] # type: List[MessDetectorPlugin]
+ ]
- length = len(decoded_sequence) + 1 # type: int
+ length: int = len(decoded_sequence) + 1
- mean_mess_ratio = 0.0 # type: float
+ mean_mess_ratio: float = 0.0
if length < 512:
- intermediary_mean_mess_ratio_calc = 32 # type: int
+ intermediary_mean_mess_ratio_calc: int = 32
elif length <= 1024:
intermediary_mean_mess_ratio_calc = 64
else:
diff --git a/contrib/python/charset-normalizer/charset_normalizer/models.py b/contrib/python/charset-normalizer/charset_normalizer/models.py
index c38da31fa5..b9d71eb4fd 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/models.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/models.py
@@ -21,21 +21,21 @@ class CharsetMatch:
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
):
- self._payload = payload # type: bytes
+ self._payload: bytes = payload
- self._encoding = guessed_encoding # type: str
- self._mean_mess_ratio = mean_mess_ratio # type: float
- self._languages = languages # type: CoherenceMatches
- self._has_sig_or_bom = has_sig_or_bom # type: bool
- self._unicode_ranges = None # type: Optional[List[str]]
+ self._encoding: str = guessed_encoding
+ self._mean_mess_ratio: float = mean_mess_ratio
+ self._languages: CoherenceMatches = languages
+ self._has_sig_or_bom: bool = has_sig_or_bom
+ self._unicode_ranges: Optional[List[str]] = None
- self._leaves = [] # type: List[CharsetMatch]
- self._mean_coherence_ratio = 0.0 # type: float
+ self._leaves: List[CharsetMatch] = []
+ self._mean_coherence_ratio: float = 0.0
- self._output_payload = None # type: Optional[bytes]
- self._output_encoding = None # type: Optional[str]
+ self._output_payload: Optional[bytes] = None
+ self._output_encoding: Optional[str] = None
- self._string = decoded_payload # type: Optional[str]
+ self._string: Optional[str] = decoded_payload
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
@@ -53,8 +53,8 @@ class CharsetMatch:
if not isinstance(other, CharsetMatch):
raise ValueError
- chaos_difference = abs(self.chaos - other.chaos) # type: float
- coherence_difference = abs(self.coherence - other.coherence) # type: float
+ chaos_difference: float = abs(self.chaos - other.chaos)
+ coherence_difference: float = abs(self.coherence - other.coherence)
# Bellow 1% difference --> Use Coherence
if chaos_difference < 0.01 and coherence_difference > 0.02:
@@ -137,7 +137,7 @@ class CharsetMatch:
"""
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
"""
- also_known_as = [] # type: List[str]
+ also_known_as: List[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
@@ -227,9 +227,9 @@ class CharsetMatch:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
- detected_ranges = [
+ detected_ranges: List[Optional[str]] = [
unicode_range(char) for char in str(self)
- ] # type: List[Optional[str]]
+ ]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@@ -281,7 +281,7 @@ class CharsetMatches:
"""
def __init__(self, results: List[CharsetMatch] = None):
- self._results = sorted(results) if results else [] # type: List[CharsetMatch]
+ self._results: List[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
@@ -360,17 +360,17 @@ class CliDetectionResult:
unicode_path: Optional[str],
is_preferred: bool,
):
- self.path = path # type: str
- self.unicode_path = unicode_path # type: Optional[str]
- self.encoding = encoding # type: Optional[str]
- self.encoding_aliases = encoding_aliases # type: List[str]
- self.alternative_encodings = alternative_encodings # type: List[str]
- self.language = language # type: str
- self.alphabets = alphabets # type: List[str]
- self.has_sig_or_bom = has_sig_or_bom # type: bool
- self.chaos = chaos # type: float
- self.coherence = coherence # type: float
- self.is_preferred = is_preferred # type: bool
+ self.path: str = path
+ self.unicode_path: Optional[str] = unicode_path
+ self.encoding: Optional[str] = encoding
+ self.encoding_aliases: List[str] = encoding_aliases
+ self.alternative_encodings: List[str] = alternative_encodings
+ self.language: str = language
+ self.alphabets: List[str] = alphabets
+ self.has_sig_or_bom: bool = has_sig_or_bom
+ self.chaos: float = chaos
+ self.coherence: float = coherence
+ self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> Dict[str, Any]: # type: ignore
diff --git a/contrib/python/charset-normalizer/charset_normalizer/utils.py b/contrib/python/charset-normalizer/charset_normalizer/utils.py
index dcb14dfee1..17eaee0408 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/utils.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/utils.py
@@ -1,4 +1,6 @@
try:
+ # WARNING: unicodedata2 support is going to be removed in 3.0
+ # Python is quickly catching up.
import unicodedata2 as unicodedata
except ImportError:
import unicodedata # type: ignore[no-redef]
@@ -9,7 +11,7 @@ from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
-from typing import List, Optional, Set, Tuple, Union
+from typing import Generator, List, Optional, Set, Tuple, Union
from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
@@ -26,7 +28,7 @@ from .constant import (
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
try:
- description = unicodedata.name(character) # type: str
+ description: str = unicodedata.name(character)
except ValueError:
return False
return (
@@ -41,11 +43,11 @@ def is_accentuated(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
- decomposed = unicodedata.decomposition(character) # type: str
+ decomposed: str = unicodedata.decomposition(character)
if not decomposed:
return character
- codes = decomposed.split(" ") # type: List[str]
+ codes: List[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@@ -55,7 +57,7 @@ def unicode_range(character: str) -> Optional[str]:
"""
Retrieve the official name of the Unicode range a single character belongs to.
"""
- character_ord = ord(character) # type: int
+ character_ord: int = ord(character)
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
if character_ord in ord_range:
@@ -67,12 +69,13 @@ def unicode_range(character: str) -> Optional[str]:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
try:
- description = unicodedata.name(character) # type: str
+ description: str = unicodedata.name(character)
except ValueError:
return False
return "LATIN" in description
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
try:
character.encode("ascii")
@@ -83,12 +86,12 @@ def is_ascii(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
- character_category = unicodedata.category(character) # type: str
+ character_category: str = unicodedata.category(character)
if "P" in character_category:
return True
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
@@ -98,12 +101,12 @@ def is_punctuation(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
- character_category = unicodedata.category(character) # type: str
+ character_category: str = unicodedata.category(character)
if "S" in character_category or "N" in character_category:
return True
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
@@ -113,7 +116,7 @@ def is_symbol(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
@@ -126,7 +129,7 @@ def is_separator(character: str) -> bool:
if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
return True
- character_category = unicodedata.category(character) # type: str
+ character_category: str = unicodedata.category(character)
return "Z" in character_category
@@ -137,7 +140,7 @@ def is_case_variable(character: str) -> bool:
def is_private_use_only(character: str) -> bool:
- character_category = unicodedata.category(character) # type: str
+ character_category: str = unicodedata.category(character)
return character_category == "Co"
@@ -197,6 +200,17 @@ def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_unprintable(character: str) -> bool:
+ return (
+ character.isspace() is False # includes \n \t \r \v
+ and character.isprintable() is False
+ and character != "\x1A" # Why? Its the ASCII substitute character.
+ and character != b"\xEF\xBB\xBF".decode("utf_8") # bug discovered in Python,
+ # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
+ )
+
+
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
"""
Extract any specified encoding from the first n bytes, using an ASCII-only decoder.
@@ -204,12 +218,12 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
if not isinstance(sequence, bytes):
raise TypeError
- seq_len = len(sequence) # type: int
+ seq_len: int = len(sequence)
- results = findall(
+ results: List[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
- ) # type: List[str]
+ )
if len(results) == 0:
return None
@@ -253,7 +267,7 @@ def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
"""
for iana_encoding in ENCODING_MARKS:
- marks = ENCODING_MARKS[iana_encoding] # type: Union[bytes, List[bytes]]
+ marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
@@ -283,10 +297,10 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
def range_scan(decoded_sequence: str) -> List[str]:
- ranges = set() # type: Set[str]
+ ranges: Set[str] = set()
for character in decoded_sequence:
- character_range = unicode_range(character) # type: Optional[str]
+ character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
@@ -304,13 +318,13 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore
decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore
- id_a = decoder_a(errors="ignore") # type: IncrementalDecoder
- id_b = decoder_b(errors="ignore") # type: IncrementalDecoder
+ id_a: IncrementalDecoder = decoder_a(errors="ignore")
+ id_b: IncrementalDecoder = decoder_b(errors="ignore")
- character_match_count = 0 # type: int
+ character_match_count: int = 0
for i in range(255):
- to_be_decoded = bytes([i]) # type: bytes
+ to_be_decoded: bytes = bytes([i])
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
character_match_count += 1
@@ -340,3 +354,61 @@ def set_logging_handler(
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)
+
+
+def cut_sequence_chunks(
+ sequences: bytes,
+ encoding_iana: str,
+ offsets: range,
+ chunk_size: int,
+ bom_or_sig_available: bool,
+ strip_sig_or_bom: bool,
+ sig_payload: bytes,
+ is_multi_byte_decoder: bool,
+ decoded_payload: Optional[str] = None,
+) -> Generator[str, None, None]:
+
+ if decoded_payload and is_multi_byte_decoder is False:
+ for i in offsets:
+ chunk = decoded_payload[i : i + chunk_size]
+ if not chunk:
+ break
+ yield chunk
+ else:
+ for i in offsets:
+ chunk_end = i + chunk_size
+ if chunk_end > len(sequences) + 8:
+ continue
+
+ cut_sequence = sequences[i : i + chunk_size]
+
+ if bom_or_sig_available and strip_sig_or_bom is False:
+ cut_sequence = sig_payload + cut_sequence
+
+ chunk = cut_sequence.decode(
+ encoding_iana,
+ errors="ignore" if is_multi_byte_decoder else "strict",
+ )
+
+ # multi-byte bad cutting detector and adjustment
+ # not the cleanest way to perform that fix, but clever enough for now.
+ if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+
+ chunk_partial_size_chk: int = min(chunk_size, 16)
+
+ if (
+ decoded_payload
+ and chunk[:chunk_partial_size_chk] not in decoded_payload
+ ):
+ for j in range(i, i - 4, -1):
+ cut_sequence = sequences[j:chunk_end]
+
+ if bom_or_sig_available and strip_sig_or_bom is False:
+ cut_sequence = sig_payload + cut_sequence
+
+ chunk = cut_sequence.decode(encoding_iana, errors="ignore")
+
+ if chunk[:chunk_partial_size_chk] in decoded_payload:
+ break
+
+ yield chunk
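Two helpers are new in this revision of utils.py: is_unprintable, which flags characters that are neither whitespace nor printable while tolerating the ASCII substitute character and U+FEFF, and cut_sequence_chunks, a generator that yields decoded chunks of a payload, re-attaching the signature when requested and re-aligning cuts that land inside a multi-byte sequence. A hedged sketch of how they behave, assuming charset_normalizer 2.1.0 is installed; the argument values below are illustrative, not the library's own call sites in api.py:

    from charset_normalizer.utils import cut_sequence_chunks, is_unprintable

    # is_unprintable: neither whitespace nor printable, with two explicit exceptions.
    print(is_unprintable("\x00"))    # True  -- NUL
    print(is_unprintable("\n"))      # False -- whitespace is excluded
    print(is_unprintable("\x1a"))    # False -- ASCII substitute character is tolerated
    print(is_unprintable("\ufeff"))  # False -- Zero Width No-Break Space / BOM is tolerated

    # cut_sequence_chunks: iterate decoded chunks of a small UTF-8 payload.
    payload = "naïve café".encode("utf_8")
    chunk_size = 4

    for chunk in cut_sequence_chunks(
        sequences=payload,
        encoding_iana="utf_8",
        offsets=range(0, len(payload), chunk_size),
        chunk_size=chunk_size,
        bom_or_sig_available=False,
        strip_sig_or_bom=False,
        sig_payload=b"",
        is_multi_byte_decoder=True,
        decoded_payload=payload.decode("utf_8"),
    ):
        print(repr(chunk))  # 'naï', 've c', 'afé'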
diff --git a/contrib/python/charset-normalizer/charset_normalizer/version.py b/contrib/python/charset-normalizer/charset_normalizer/version.py
index 77cfff25d6..af7e749e82 100644
--- a/contrib/python/charset-normalizer/charset_normalizer/version.py
+++ b/contrib/python/charset-normalizer/charset_normalizer/version.py
@@ -2,5 +2,5 @@
Expose version
"""
-__version__ = "2.0.12"
+__version__ = "2.1.0"
VERSION = __version__.split(".")
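For reference, VERSION is just the dotted version string split into its components; a quick check of the bumped release, assuming charset_normalizer 2.1.0 is installed:

    from charset_normalizer.version import VERSION, __version__

    print(__version__)  # "2.1.0"
    print(VERSION)      # ['2', '1', '0']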