diff options
author | robot-contrib <robot-contrib@yandex-team.com> | 2023-10-17 11:25:58 +0300 |
---|---|---|
committer | robot-contrib <robot-contrib@yandex-team.com> | 2023-10-17 12:25:27 +0300 |
commit | 6880ae99d2f55b0fd0d3820154920af3755ae9da (patch) | |
tree | c017a33d0302224a39d9ea006dccc0be50de25d2 | |
parent | a74d8d38bfdad3260f263cbccf768589e16acfbb (diff) | |
download | ydb-6880ae99d2f55b0fd0d3820154920af3755ae9da.tar.gz |
Update contrib/python/charset-normalizer to 3.3.0
14 files changed, 1903 insertions, 1781 deletions
diff --git a/contrib/python/charset-normalizer/.dist-info/METADATA b/contrib/python/charset-normalizer/.dist-info/METADATA index ca190e183c6..ad5158c0dee 100644 --- a/contrib/python/charset-normalizer/.dist-info/METADATA +++ b/contrib/python/charset-normalizer/.dist-info/METADATA @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: charset-normalizer -Version: 3.2.0 +Version: 3.3.0 Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet. Home-page: https://github.com/Ousret/charset_normalizer Author: Ahmed TAHRI @@ -39,12 +39,27 @@ Provides-Extra: unicode_backport <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" /> </a> <a href="https://pepy.tech/project/charset-normalizer/"> - <img alt="Download Count Total" src="https://pepy.tech/badge/charset-normalizer/month" /> + <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" /> </a> <a href="https://bestpractices.coreinfrastructure.org/projects/7297"> <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge"> </a> </p> +<p align="center"> + <sup><i>Featured Packages</i></sup><br> + <a href="https://github.com/jawah/niquests"> + <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-HTTP_1.1%2C%202%2C_and_3_Client-cyan"> + </a> + <a href="https://github.com/jawah/wassima"> + <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan"> + </a> +</p> +<p align="center"> + <sup><i>In other language (unofficial port - by the community)</i></sup><br> + <a href="https://github.com/nickspring/charset-normalizer-rs"> + <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red"> + </a> +</p> > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`, > I'm trying to resolve the issue by taking a new approach. @@ -56,21 +71,22 @@ Provides-Extra: unicode_backport This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**. -| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) | -|--------------------------------------------------|:---------------------------------------------:|:------------------------------------------------------------------------------------------------------:|:-----------------------------------------------:| -| `Fast` | ❌<br> | ✅<br> | ✅ <br> | -| `Universal**` | ❌ | ✅ | ❌ | -| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ | -| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ | -| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ | -| `Native Python` | ✅ | ✅ | ❌ | -| `Detect spoken language` | ❌ | ✅ | N/A | -| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ | -| `Whl Size` | 193.6 kB | 40 kB | ~200 kB | -| `Supported Encoding` | 33 | 🎉 [90](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 | +| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) | +|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:| +| `Fast` | ❌ | ✅ | ✅ | +| `Universal**` | ❌ | ✅ | ❌ | +| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ | +| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ | +| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ | +| `Native Python` | ✅ | ✅ | ❌ | +| `Detect spoken language` | ❌ | ✅ | N/A | +| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ | +| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB | +| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 | <p align="center"> <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/> +</p> *\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br> Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html) @@ -145,6 +161,12 @@ optional arguments: normalizer ./data/sample.1.fr.srt ``` +or + +```bash +python -m charset_normalizer ./data/sample.1.fr.srt +``` + 🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format. ```json @@ -241,6 +263,7 @@ that intel is worth something here. So I use those records against decoded text - Python >=2.7,<3.5: Unsupported - Python 3.5: charset-normalizer < 2.1 - Python 3.6: charset-normalizer < 3.1 +- Python 3.7: charset-normalizer < 4.0 Upgrade your Python interpreter as soon as possible. @@ -270,6 +293,23 @@ tools. All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30) + +### Added +- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer` +- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323) + +### Removed +- (internal) Redundant utils.is_ascii function and unused function is_private_use_only +- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant + +### Changed +- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection +- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.7 + +### Fixed +- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350) + ## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07) ### Changed diff --git a/contrib/python/charset-normalizer/.dist-info/entry_points.txt b/contrib/python/charset-normalizer/.dist-info/entry_points.txt index a06d360058d..65619e73ec0 100644 --- a/contrib/python/charset-normalizer/.dist-info/entry_points.txt +++ b/contrib/python/charset-normalizer/.dist-info/entry_points.txt @@ -1,2 +1,2 @@ [console_scripts] -normalizer = charset_normalizer.cli.normalizer:cli_detect +normalizer = charset_normalizer.cli:cli_detect diff --git a/contrib/python/charset-normalizer/README.md b/contrib/python/charset-normalizer/README.md index 022726102d0..13e6e14ff65 100644 --- a/contrib/python/charset-normalizer/README.md +++ b/contrib/python/charset-normalizer/README.md @@ -6,12 +6,27 @@ <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" /> </a> <a href="https://pepy.tech/project/charset-normalizer/"> - <img alt="Download Count Total" src="https://pepy.tech/badge/charset-normalizer/month" /> + <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" /> </a> <a href="https://bestpractices.coreinfrastructure.org/projects/7297"> <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge"> </a> </p> +<p align="center"> + <sup><i>Featured Packages</i></sup><br> + <a href="https://github.com/jawah/niquests"> + <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-HTTP_1.1%2C%202%2C_and_3_Client-cyan"> + </a> + <a href="https://github.com/jawah/wassima"> + <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan"> + </a> +</p> +<p align="center"> + <sup><i>In other language (unofficial port - by the community)</i></sup><br> + <a href="https://github.com/nickspring/charset-normalizer-rs"> + <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red"> + </a> +</p> > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`, > I'm trying to resolve the issue by taking a new approach. @@ -23,21 +38,22 @@ This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**. -| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) | -|--------------------------------------------------|:---------------------------------------------:|:------------------------------------------------------------------------------------------------------:|:-----------------------------------------------:| -| `Fast` | ❌<br> | ✅<br> | ✅ <br> | -| `Universal**` | ❌ | ✅ | ❌ | -| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ | -| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ | -| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ | -| `Native Python` | ✅ | ✅ | ❌ | -| `Detect spoken language` | ❌ | ✅ | N/A | -| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ | -| `Whl Size` | 193.6 kB | 40 kB | ~200 kB | -| `Supported Encoding` | 33 | 🎉 [90](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 | +| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) | +|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:| +| `Fast` | ❌ | ✅ | ✅ | +| `Universal**` | ❌ | ✅ | ❌ | +| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ | +| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ | +| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ | +| `Native Python` | ✅ | ✅ | ❌ | +| `Detect spoken language` | ❌ | ✅ | N/A | +| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ | +| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB | +| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 | <p align="center"> <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/> +</p> *\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br> Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html) @@ -112,6 +128,12 @@ optional arguments: normalizer ./data/sample.1.fr.srt ``` +or + +```bash +python -m charset_normalizer ./data/sample.1.fr.srt +``` + 🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format. ```json @@ -208,6 +230,7 @@ that intel is worth something here. So I use those records against decoded text - Python >=2.7,<3.5: Unsupported - Python 3.5: charset-normalizer < 2.1 - Python 3.6: charset-normalizer < 3.1 +- Python 3.7: charset-normalizer < 4.0 Upgrade your Python interpreter as soon as possible. diff --git a/contrib/python/charset-normalizer/charset_normalizer/__main__.py b/contrib/python/charset-normalizer/charset_normalizer/__main__.py new file mode 100644 index 00000000000..beae2ef7749 --- /dev/null +++ b/contrib/python/charset-normalizer/charset_normalizer/__main__.py @@ -0,0 +1,4 @@ +from .cli import cli_detect + +if __name__ == "__main__": + cli_detect() diff --git a/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py b/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py deleted file mode 100644 index 9075930dc8f..00000000000 --- a/contrib/python/charset-normalizer/charset_normalizer/assets/__init__.py +++ /dev/null @@ -1,1440 +0,0 @@ -# -*- coding: utf-8 -*- -from typing import Dict, List - -# Language label that contain the em dash "—" -# character are to be considered alternative seq to origin -FREQUENCIES: Dict[str, List[str]] = { - "English": [ - "e", - "a", - "t", - "i", - "o", - "n", - "s", - "r", - "h", - "l", - "d", - "c", - "u", - "m", - "f", - "p", - "g", - "w", - "y", - "b", - "v", - "k", - "x", - "j", - "z", - "q", - ], - "English—": [ - "e", - "a", - "t", - "i", - "o", - "n", - "s", - "r", - "h", - "l", - "d", - "c", - "m", - "u", - "f", - "p", - "g", - "w", - "b", - "y", - "v", - "k", - "j", - "x", - "z", - "q", - ], - "German": [ - "e", - "n", - "i", - "r", - "s", - "t", - "a", - "d", - "h", - "u", - "l", - "g", - "o", - "c", - "m", - "b", - "f", - "k", - "w", - "z", - "p", - "v", - "ü", - "ä", - "ö", - "j", - ], - "French": [ - "e", - "a", - "s", - "n", - "i", - "t", - "r", - "l", - "u", - "o", - "d", - "c", - "p", - "m", - "é", - "v", - "g", - "f", - "b", - "h", - "q", - "à", - "x", - "è", - "y", - "j", - ], - "Dutch": [ - "e", - "n", - "a", - "i", - "r", - "t", - "o", - "d", - "s", - "l", - "g", - "h", - "v", - "m", - "u", - "k", - "c", - "p", - "b", - "w", - "j", - "z", - "f", - "y", - "x", - "ë", - ], - "Italian": [ - "e", - "i", - "a", - "o", - "n", - "l", - "t", - "r", - "s", - "c", - "d", - "u", - "p", - "m", - "g", - "v", - "f", - "b", - "z", - "h", - "q", - "è", - "à", - "k", - "y", - "ò", - ], - "Polish": [ - "a", - "i", - "o", - "e", - "n", - "r", - "z", - "w", - "s", - "c", - "t", - "k", - "y", - "d", - "p", - "m", - "u", - "l", - "j", - "ł", - "g", - "b", - "h", - "ą", - "ę", - "ó", - ], - "Spanish": [ - "e", - "a", - "o", - "n", - "s", - "r", - "i", - "l", - "d", - "t", - "c", - "u", - "m", - "p", - "b", - "g", - "v", - "f", - "y", - "ó", - "h", - "q", - "í", - "j", - "z", - "á", - ], - "Russian": [ - "о", - "а", - "е", - "и", - "н", - "с", - "т", - "р", - "в", - "л", - "к", - "м", - "д", - "п", - "у", - "г", - "я", - "ы", - "з", - "б", - "й", - "ь", - "ч", - "х", - "ж", - "ц", - ], - # Jap-Kanji - "Japanese": [ - "人", - "一", - "大", - "亅", - "丁", - "丨", - "竹", - "笑", - "口", - "日", - "今", - "二", - "彳", - "行", - "十", - "土", - "丶", - "寸", - "寺", - "時", - "乙", - "丿", - "乂", - "气", - "気", - "冂", - "巾", - "亠", - "市", - "目", - "儿", - "見", - "八", - "小", - "凵", - "県", - "月", - "彐", - "門", - "間", - "木", - "東", - "山", - "出", - "本", - "中", - "刀", - "分", - "耳", - "又", - "取", - "最", - "言", - "田", - "心", - "思", - "刂", - "前", - "京", - "尹", - "事", - "生", - "厶", - "云", - "会", - "未", - "来", - "白", - "冫", - "楽", - "灬", - "馬", - "尸", - "尺", - "駅", - "明", - "耂", - "者", - "了", - "阝", - "都", - "高", - "卜", - "占", - "厂", - "广", - "店", - "子", - "申", - "奄", - "亻", - "俺", - "上", - "方", - "冖", - "学", - "衣", - "艮", - "食", - "自", - ], - # Jap-Katakana - "Japanese—": [ - "ー", - "ン", - "ス", - "・", - "ル", - "ト", - "リ", - "イ", - "ア", - "ラ", - "ッ", - "ク", - "ド", - "シ", - "レ", - "ジ", - "タ", - "フ", - "ロ", - "カ", - "テ", - "マ", - "ィ", - "グ", - "バ", - "ム", - "プ", - "オ", - "コ", - "デ", - "ニ", - "ウ", - "メ", - "サ", - "ビ", - "ナ", - "ブ", - "ャ", - "エ", - "ュ", - "チ", - "キ", - "ズ", - "ダ", - "パ", - "ミ", - "ェ", - "ョ", - "ハ", - "セ", - "ベ", - "ガ", - "モ", - "ツ", - "ネ", - "ボ", - "ソ", - "ノ", - "ァ", - "ヴ", - "ワ", - "ポ", - "ペ", - "ピ", - "ケ", - "ゴ", - "ギ", - "ザ", - "ホ", - "ゲ", - "ォ", - "ヤ", - "ヒ", - "ユ", - "ヨ", - "ヘ", - "ゼ", - "ヌ", - "ゥ", - "ゾ", - "ヶ", - "ヂ", - "ヲ", - "ヅ", - "ヵ", - "ヱ", - "ヰ", - "ヮ", - "ヽ", - "゠", - "ヾ", - "ヷ", - "ヿ", - "ヸ", - "ヹ", - "ヺ", - ], - # Jap-Hiragana - "Japanese——": [ - "の", - "に", - "る", - "た", - "と", - "は", - "し", - "い", - "を", - "で", - "て", - "が", - "な", - "れ", - "か", - "ら", - "さ", - "っ", - "り", - "す", - "あ", - "も", - "こ", - "ま", - "う", - "く", - "よ", - "き", - "ん", - "め", - "お", - "け", - "そ", - "つ", - "だ", - "や", - "え", - "ど", - "わ", - "ち", - "み", - "せ", - "じ", - "ば", - "へ", - "び", - "ず", - "ろ", - "ほ", - "げ", - "む", - "べ", - "ひ", - "ょ", - "ゆ", - "ぶ", - "ご", - "ゃ", - "ね", - "ふ", - "ぐ", - "ぎ", - "ぼ", - "ゅ", - "づ", - "ざ", - "ぞ", - "ぬ", - "ぜ", - "ぱ", - "ぽ", - "ぷ", - "ぴ", - "ぃ", - "ぁ", - "ぇ", - "ぺ", - "ゞ", - "ぢ", - "ぉ", - "ぅ", - "ゐ", - "ゝ", - "ゑ", - "゛", - "゜", - "ゎ", - "ゔ", - "゚", - "ゟ", - "゙", - "ゕ", - "ゖ", - ], - "Portuguese": [ - "a", - "e", - "o", - "s", - "i", - "r", - "d", - "n", - "t", - "m", - "u", - "c", - "l", - "p", - "g", - "v", - "b", - "f", - "h", - "ã", - "q", - "é", - "ç", - "á", - "z", - "í", - ], - "Swedish": [ - "e", - "a", - "n", - "r", - "t", - "s", - "i", - "l", - "d", - "o", - "m", - "k", - "g", - "v", - "h", - "f", - "u", - "p", - "ä", - "c", - "b", - "ö", - "å", - "y", - "j", - "x", - ], - "Chinese": [ - "的", - "一", - "是", - "不", - "了", - "在", - "人", - "有", - "我", - "他", - "这", - "个", - "们", - "中", - "来", - "上", - "大", - "为", - "和", - "国", - "地", - "到", - "以", - "说", - "时", - "要", - "就", - "出", - "会", - "可", - "也", - "你", - "对", - "生", - "能", - "而", - "子", - "那", - "得", - "于", - "着", - "下", - "自", - "之", - "年", - "过", - "发", - "后", - "作", - "里", - "用", - "道", - "行", - "所", - "然", - "家", - "种", - "事", - "成", - "方", - "多", - "经", - "么", - "去", - "法", - "学", - "如", - "都", - "同", - "现", - "当", - "没", - "动", - "面", - "起", - "看", - "定", - "天", - "分", - "还", - "进", - "好", - "小", - "部", - "其", - "些", - "主", - "样", - "理", - "心", - "她", - "本", - "前", - "开", - "但", - "因", - "只", - "从", - "想", - "实", - ], - "Ukrainian": [ - "о", - "а", - "н", - "і", - "и", - "р", - "в", - "т", - "е", - "с", - "к", - "л", - "у", - "д", - "м", - "п", - "з", - "я", - "ь", - "б", - "г", - "й", - "ч", - "х", - "ц", - "ї", - ], - "Norwegian": [ - "e", - "r", - "n", - "t", - "a", - "s", - "i", - "o", - "l", - "d", - "g", - "k", - "m", - "v", - "f", - "p", - "u", - "b", - "h", - "å", - "y", - "j", - "ø", - "c", - "æ", - "w", - ], - "Finnish": [ - "a", - "i", - "n", - "t", - "e", - "s", - "l", - "o", - "u", - "k", - "ä", - "m", - "r", - "v", - "j", - "h", - "p", - "y", - "d", - "ö", - "g", - "c", - "b", - "f", - "w", - "z", - ], - "Vietnamese": [ - "n", - "h", - "t", - "i", - "c", - "g", - "a", - "o", - "u", - "m", - "l", - "r", - "à", - "đ", - "s", - "e", - "v", - "p", - "b", - "y", - "ư", - "d", - "á", - "k", - "ộ", - "ế", - ], - "Czech": [ - "o", - "e", - "a", - "n", - "t", - "s", - "i", - "l", - "v", - "r", - "k", - "d", - "u", - "m", - "p", - "í", - "c", - "h", - "z", - "á", - "y", - "j", - "b", - "ě", - "é", - "ř", - ], - "Hungarian": [ - "e", - "a", - "t", - "l", - "s", - "n", - "k", - "r", - "i", - "o", - "z", - "á", - "é", - "g", - "m", - "b", - "y", - "v", - "d", - "h", - "u", - "p", - "j", - "ö", - "f", - "c", - ], - "Korean": [ - "이", - "다", - "에", - "의", - "는", - "로", - "하", - "을", - "가", - "고", - "지", - "서", - "한", - "은", - "기", - "으", - "년", - "대", - "사", - "시", - "를", - "리", - "도", - "인", - "스", - "일", - ], - "Indonesian": [ - "a", - "n", - "e", - "i", - "r", - "t", - "u", - "s", - "d", - "k", - "m", - "l", - "g", - "p", - "b", - "o", - "h", - "y", - "j", - "c", - "w", - "f", - "v", - "z", - "x", - "q", - ], - "Turkish": [ - "a", - "e", - "i", - "n", - "r", - "l", - "ı", - "k", - "d", - "t", - "s", - "m", - "y", - "u", - "o", - "b", - "ü", - "ş", - "v", - "g", - "z", - "h", - "c", - "p", - "ç", - "ğ", - ], - "Romanian": [ - "e", - "i", - "a", - "r", - "n", - "t", - "u", - "l", - "o", - "c", - "s", - "d", - "p", - "m", - "ă", - "f", - "v", - "î", - "g", - "b", - "ș", - "ț", - "z", - "h", - "â", - "j", - ], - "Farsi": [ - "ا", - "ی", - "ر", - "د", - "ن", - "ه", - "و", - "م", - "ت", - "ب", - "س", - "ل", - "ک", - "ش", - "ز", - "ف", - "گ", - "ع", - "خ", - "ق", - "ج", - "آ", - "پ", - "ح", - "ط", - "ص", - ], - "Arabic": [ - "ا", - "ل", - "ي", - "م", - "و", - "ن", - "ر", - "ت", - "ب", - "ة", - "ع", - "د", - "س", - "ف", - "ه", - "ك", - "ق", - "أ", - "ح", - "ج", - "ش", - "ط", - "ص", - "ى", - "خ", - "إ", - ], - "Danish": [ - "e", - "r", - "n", - "t", - "a", - "i", - "s", - "d", - "l", - "o", - "g", - "m", - "k", - "f", - "v", - "u", - "b", - "h", - "p", - "å", - "y", - "ø", - "æ", - "c", - "j", - "w", - ], - "Serbian": [ - "а", - "и", - "о", - "е", - "н", - "р", - "с", - "у", - "т", - "к", - "ј", - "в", - "д", - "м", - "п", - "л", - "г", - "з", - "б", - "a", - "i", - "e", - "o", - "n", - "ц", - "ш", - ], - "Lithuanian": [ - "i", - "a", - "s", - "o", - "r", - "e", - "t", - "n", - "u", - "k", - "m", - "l", - "p", - "v", - "d", - "j", - "g", - "ė", - "b", - "y", - "ų", - "š", - "ž", - "c", - "ą", - "į", - ], - "Slovene": [ - "e", - "a", - "i", - "o", - "n", - "r", - "s", - "l", - "t", - "j", - "v", - "k", - "d", - "p", - "m", - "u", - "z", - "b", - "g", - "h", - "č", - "c", - "š", - "ž", - "f", - "y", - ], - "Slovak": [ - "o", - "a", - "e", - "n", - "i", - "r", - "v", - "t", - "s", - "l", - "k", - "d", - "m", - "p", - "u", - "c", - "h", - "j", - "b", - "z", - "á", - "y", - "ý", - "í", - "č", - "é", - ], - "Hebrew": [ - "י", - "ו", - "ה", - "ל", - "ר", - "ב", - "ת", - "מ", - "א", - "ש", - "נ", - "ע", - "ם", - "ד", - "ק", - "ח", - "פ", - "ס", - "כ", - "ג", - "ט", - "צ", - "ן", - "ז", - "ך", - ], - "Bulgarian": [ - "а", - "и", - "о", - "е", - "н", - "т", - "р", - "с", - "в", - "л", - "к", - "д", - "п", - "м", - "з", - "г", - "я", - "ъ", - "у", - "б", - "ч", - "ц", - "й", - "ж", - "щ", - "х", - ], - "Croatian": [ - "a", - "i", - "o", - "e", - "n", - "r", - "j", - "s", - "t", - "u", - "k", - "l", - "v", - "d", - "m", - "p", - "g", - "z", - "b", - "c", - "č", - "h", - "š", - "ž", - "ć", - "f", - ], - "Hindi": [ - "क", - "र", - "स", - "न", - "त", - "म", - "ह", - "प", - "य", - "ल", - "व", - "ज", - "द", - "ग", - "ब", - "श", - "ट", - "अ", - "ए", - "थ", - "भ", - "ड", - "च", - "ध", - "ष", - "इ", - ], - "Estonian": [ - "a", - "i", - "e", - "s", - "t", - "l", - "u", - "n", - "o", - "k", - "r", - "d", - "m", - "v", - "g", - "p", - "j", - "h", - "ä", - "b", - "õ", - "ü", - "f", - "c", - "ö", - "y", - ], - "Thai": [ - "า", - "น", - "ร", - "อ", - "ก", - "เ", - "ง", - "ม", - "ย", - "ล", - "ว", - "ด", - "ท", - "ส", - "ต", - "ะ", - "ป", - "บ", - "ค", - "ห", - "แ", - "จ", - "พ", - "ช", - "ข", - "ใ", - ], - "Greek": [ - "α", - "τ", - "ο", - "ι", - "ε", - "ν", - "ρ", - "σ", - "κ", - "η", - "π", - "ς", - "υ", - "μ", - "λ", - "ί", - "ό", - "ά", - "γ", - "έ", - "δ", - "ή", - "ω", - "χ", - "θ", - "ύ", - ], - "Tamil": [ - "க", - "த", - "ப", - "ட", - "ர", - "ம", - "ல", - "ன", - "வ", - "ற", - "ய", - "ள", - "ச", - "ந", - "இ", - "ண", - "அ", - "ஆ", - "ழ", - "ங", - "எ", - "உ", - "ஒ", - "ஸ", - ], - "Kazakh": [ - "а", - "ы", - "е", - "н", - "т", - "р", - "л", - "і", - "д", - "с", - "м", - "қ", - "к", - "о", - "б", - "и", - "у", - "ғ", - "ж", - "ң", - "з", - "ш", - "й", - "п", - "г", - "ө", - ], -} diff --git a/contrib/python/charset-normalizer/charset_normalizer/cd.py b/contrib/python/charset-normalizer/charset_normalizer/cd.py index 6e56fe84a9e..4ea6760c45b 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/cd.py +++ b/contrib/python/charset-normalizer/charset_normalizer/cd.py @@ -4,8 +4,13 @@ from collections import Counter from functools import lru_cache from typing import Counter as TypeCounter, Dict, List, Optional, Tuple -from .assets import FREQUENCIES -from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES +from .constant import ( + FREQUENCIES, + KO_NAMES, + LANGUAGE_SUPPORTED_COUNT, + TOO_SMALL_SEQUENCE, + ZH_NAMES, +) from .md import is_suspiciously_successive_range from .models import CoherenceMatches from .utils import ( diff --git a/contrib/python/charset-normalizer/charset_normalizer/cli/__init__.py b/contrib/python/charset-normalizer/charset_normalizer/cli/__init__.py index e69de29bb2d..d95fedfe572 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/cli/__init__.py +++ b/contrib/python/charset-normalizer/charset_normalizer/cli/__init__.py @@ -0,0 +1,6 @@ +from .__main__ import cli_detect, query_yes_no + +__all__ = ( + "cli_detect", + "query_yes_no", +) diff --git a/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py b/contrib/python/charset-normalizer/charset_normalizer/cli/__main__.py index f4bcbaac049..f4bcbaac049 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/cli/normalizer.py +++ b/contrib/python/charset-normalizer/charset_normalizer/cli/__main__.py diff --git a/contrib/python/charset-normalizer/charset_normalizer/constant.py b/contrib/python/charset-normalizer/charset_normalizer/constant.py index 3188108d6ba..863490461ea 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/constant.py +++ b/contrib/python/charset-normalizer/charset_normalizer/constant.py @@ -1,10 +1,9 @@ +# -*- coding: utf-8 -*- from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE from encodings.aliases import aliases from re import IGNORECASE, compile as re_compile from typing import Dict, List, Set, Union -from .assets import FREQUENCIES - # Contain for each eligible encoding a list of/item bytes SIG/BOM ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = { "utf_8": BOM_UTF8, @@ -23,288 +22,338 @@ ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = { TOO_SMALL_SEQUENCE: int = 32 TOO_BIG_SEQUENCE: int = int(10e6) -UTF8_MAXIMAL_ALLOCATION: int = 1112064 +UTF8_MAXIMAL_ALLOCATION: int = 1_112_064 +# Up-to-date Unicode ucd/15.0.0 UNICODE_RANGES_COMBINED: Dict[str, range] = { - "Control character": range(31 + 1), - "Basic Latin": range(32, 127 + 1), - "Latin-1 Supplement": range(128, 255 + 1), - "Latin Extended-A": range(256, 383 + 1), - "Latin Extended-B": range(384, 591 + 1), - "IPA Extensions": range(592, 687 + 1), - "Spacing Modifier Letters": range(688, 767 + 1), - "Combining Diacritical Marks": range(768, 879 + 1), - "Greek and Coptic": range(880, 1023 + 1), - "Cyrillic": range(1024, 1279 + 1), - "Cyrillic Supplement": range(1280, 1327 + 1), - "Armenian": range(1328, 1423 + 1), - "Hebrew": range(1424, 1535 + 1), - "Arabic": range(1536, 1791 + 1), - "Syriac": range(1792, 1871 + 1), - "Arabic Supplement": range(1872, 1919 + 1), - "Thaana": range(1920, 1983 + 1), - "NKo": range(1984, 2047 + 1), - "Samaritan": range(2048, 2111 + 1), - "Mandaic": range(2112, 2143 + 1), - "Syriac Supplement": range(2144, 2159 + 1), - "Arabic Extended-A": range(2208, 2303 + 1), - "Devanagari": range(2304, 2431 + 1), - "Bengali": range(2432, 2559 + 1), - "Gurmukhi": range(2560, 2687 + 1), - "Gujarati": range(2688, 2815 + 1), - "Oriya": range(2816, 2943 + 1), - "Tamil": range(2944, 3071 + 1), - "Telugu": range(3072, 3199 + 1), - "Kannada": range(3200, 3327 + 1), - "Malayalam": range(3328, 3455 + 1), - "Sinhala": range(3456, 3583 + 1), - "Thai": range(3584, 3711 + 1), - "Lao": range(3712, 3839 + 1), - "Tibetan": range(3840, 4095 + 1), - "Myanmar": range(4096, 4255 + 1), - "Georgian": range(4256, 4351 + 1), - "Hangul Jamo": range(4352, 4607 + 1), - "Ethiopic": range(4608, 4991 + 1), - "Ethiopic Supplement": range(4992, 5023 + 1), - "Cherokee": range(5024, 5119 + 1), - "Unified Canadian Aboriginal Syllabics": range(5120, 5759 + 1), - "Ogham": range(5760, 5791 + 1), - "Runic": range(5792, 5887 + 1), - "Tagalog": range(5888, 5919 + 1), - "Hanunoo": range(5920, 5951 + 1), - "Buhid": range(5952, 5983 + 1), - "Tagbanwa": range(5984, 6015 + 1), - "Khmer": range(6016, 6143 + 1), - "Mongolian": range(6144, 6319 + 1), - "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6399 + 1), - "Limbu": range(6400, 6479 + 1), - "Tai Le": range(6480, 6527 + 1), - "New Tai Lue": range(6528, 6623 + 1), - "Khmer Symbols": range(6624, 6655 + 1), - "Buginese": range(6656, 6687 + 1), - "Tai Tham": range(6688, 6831 + 1), - "Combining Diacritical Marks Extended": range(6832, 6911 + 1), - "Balinese": range(6912, 7039 + 1), - "Sundanese": range(7040, 7103 + 1), - "Batak": range(7104, 7167 + 1), - "Lepcha": range(7168, 7247 + 1), - "Ol Chiki": range(7248, 7295 + 1), - "Cyrillic Extended C": range(7296, 7311 + 1), - "Sundanese Supplement": range(7360, 7375 + 1), - "Vedic Extensions": range(7376, 7423 + 1), - "Phonetic Extensions": range(7424, 7551 + 1), - "Phonetic Extensions Supplement": range(7552, 7615 + 1), - "Combining Diacritical Marks Supplement": range(7616, 7679 + 1), - "Latin Extended Additional": range(7680, 7935 + 1), - "Greek Extended": range(7936, 8191 + 1), - "General Punctuation": range(8192, 8303 + 1), - "Superscripts and Subscripts": range(8304, 8351 + 1), - "Currency Symbols": range(8352, 8399 + 1), - "Combining Diacritical Marks for Symbols": range(8400, 8447 + 1), - "Letterlike Symbols": range(8448, 8527 + 1), - "Number Forms": range(8528, 8591 + 1), - "Arrows": range(8592, 8703 + 1), - "Mathematical Operators": range(8704, 8959 + 1), - "Miscellaneous Technical": range(8960, 9215 + 1), - "Control Pictures": range(9216, 9279 + 1), - "Optical Character Recognition": range(9280, 9311 + 1), - "Enclosed Alphanumerics": range(9312, 9471 + 1), - "Box Drawing": range(9472, 9599 + 1), - "Block Elements": range(9600, 9631 + 1), - "Geometric Shapes": range(9632, 9727 + 1), - "Miscellaneous Symbols": range(9728, 9983 + 1), - "Dingbats": range(9984, 10175 + 1), - "Miscellaneous Mathematical Symbols-A": range(10176, 10223 + 1), - "Supplemental Arrows-A": range(10224, 10239 + 1), - "Braille Patterns": range(10240, 10495 + 1), - "Supplemental Arrows-B": range(10496, 10623 + 1), - "Miscellaneous Mathematical Symbols-B": range(10624, 10751 + 1), - "Supplemental Mathematical Operators": range(10752, 11007 + 1), - "Miscellaneous Symbols and Arrows": range(11008, 11263 + 1), - "Glagolitic": range(11264, 11359 + 1), - "Latin Extended-C": range(11360, 11391 + 1), - "Coptic": range(11392, 11519 + 1), - "Georgian Supplement": range(11520, 11567 + 1), - "Tifinagh": range(11568, 11647 + 1), - "Ethiopic Extended": range(11648, 11743 + 1), - "Cyrillic Extended-A": range(11744, 11775 + 1), - "Supplemental Punctuation": range(11776, 11903 + 1), - "CJK Radicals Supplement": range(11904, 12031 + 1), - "Kangxi Radicals": range(12032, 12255 + 1), - "Ideographic Description Characters": range(12272, 12287 + 1), - "CJK Symbols and Punctuation": range(12288, 12351 + 1), - "Hiragana": range(12352, 12447 + 1), - "Katakana": range(12448, 12543 + 1), - "Bopomofo": range(12544, 12591 + 1), - "Hangul Compatibility Jamo": range(12592, 12687 + 1), - "Kanbun": range(12688, 12703 + 1), - "Bopomofo Extended": range(12704, 12735 + 1), - "CJK Strokes": range(12736, 12783 + 1), - "Katakana Phonetic Extensions": range(12784, 12799 + 1), - "Enclosed CJK Letters and Months": range(12800, 13055 + 1), - "CJK Compatibility": range(13056, 13311 + 1), - "CJK Unified Ideographs Extension A": range(13312, 19903 + 1), - "Yijing Hexagram Symbols": range(19904, 19967 + 1), - "CJK Unified Ideographs": range(19968, 40959 + 1), - "Yi Syllables": range(40960, 42127 + 1), - "Yi Radicals": range(42128, 42191 + 1), - "Lisu": range(42192, 42239 + 1), - "Vai": range(42240, 42559 + 1), - "Cyrillic Extended-B": range(42560, 42655 + 1), - "Bamum": range(42656, 42751 + 1), - "Modifier Tone Letters": range(42752, 42783 + 1), - "Latin Extended-D": range(42784, 43007 + 1), - "Syloti Nagri": range(43008, 43055 + 1), - "Common Indic Number Forms": range(43056, 43071 + 1), - "Phags-pa": range(43072, 43135 + 1), - "Saurashtra": range(43136, 43231 + 1), - "Devanagari Extended": range(43232, 43263 + 1), - "Kayah Li": range(43264, 43311 + 1), - "Rejang": range(43312, 43359 + 1), - "Hangul Jamo Extended-A": range(43360, 43391 + 1), - "Javanese": range(43392, 43487 + 1), - "Myanmar Extended-B": range(43488, 43519 + 1), - "Cham": range(43520, 43615 + 1), - "Myanmar Extended-A": range(43616, 43647 + 1), - "Tai Viet": range(43648, 43743 + 1), - "Meetei Mayek Extensions": range(43744, 43775 + 1), - "Ethiopic Extended-A": range(43776, 43823 + 1), - "Latin Extended-E": range(43824, 43887 + 1), - "Cherokee Supplement": range(43888, 43967 + 1), - "Meetei Mayek": range(43968, 44031 + 1), - "Hangul Syllables": range(44032, 55215 + 1), - "Hangul Jamo Extended-B": range(55216, 55295 + 1), - "High Surrogates": range(55296, 56191 + 1), - "High Private Use Surrogates": range(56192, 56319 + 1), - "Low Surrogates": range(56320, 57343 + 1), - "Private Use Area": range(57344, 63743 + 1), - "CJK Compatibility Ideographs": range(63744, 64255 + 1), - "Alphabetic Presentation Forms": range(64256, 64335 + 1), - "Arabic Presentation Forms-A": range(64336, 65023 + 1), - "Variation Selectors": range(65024, 65039 + 1), - "Vertical Forms": range(65040, 65055 + 1), - "Combining Half Marks": range(65056, 65071 + 1), - "CJK Compatibility Forms": range(65072, 65103 + 1), - "Small Form Variants": range(65104, 65135 + 1), - "Arabic Presentation Forms-B": range(65136, 65279 + 1), - "Halfwidth and Fullwidth Forms": range(65280, 65519 + 1), - "Specials": range(65520, 65535 + 1), - "Linear B Syllabary": range(65536, 65663 + 1), - "Linear B Ideograms": range(65664, 65791 + 1), - "Aegean Numbers": range(65792, 65855 + 1), - "Ancient Greek Numbers": range(65856, 65935 + 1), - "Ancient Symbols": range(65936, 65999 + 1), - "Phaistos Disc": range(66000, 66047 + 1), - "Lycian": range(66176, 66207 + 1), - "Carian": range(66208, 66271 + 1), - "Coptic Epact Numbers": range(66272, 66303 + 1), - "Old Italic": range(66304, 66351 + 1), - "Gothic": range(66352, 66383 + 1), - "Old Permic": range(66384, 66431 + 1), - "Ugaritic": range(66432, 66463 + 1), - "Old Persian": range(66464, 66527 + 1), - "Deseret": range(66560, 66639 + 1), - "Shavian": range(66640, 66687 + 1), - "Osmanya": range(66688, 66735 + 1), - "Osage": range(66736, 66815 + 1), - "Elbasan": range(66816, 66863 + 1), - "Caucasian Albanian": range(66864, 66927 + 1), - "Linear A": range(67072, 67455 + 1), - "Cypriot Syllabary": range(67584, 67647 + 1), - "Imperial Aramaic": range(67648, 67679 + 1), - "Palmyrene": range(67680, 67711 + 1), - "Nabataean": range(67712, 67759 + 1), - "Hatran": range(67808, 67839 + 1), - "Phoenician": range(67840, 67871 + 1), - "Lydian": range(67872, 67903 + 1), - "Meroitic Hieroglyphs": range(67968, 67999 + 1), - "Meroitic Cursive": range(68000, 68095 + 1), - "Kharoshthi": range(68096, 68191 + 1), - "Old South Arabian": range(68192, 68223 + 1), - "Old North Arabian": range(68224, 68255 + 1), - "Manichaean": range(68288, 68351 + 1), - "Avestan": range(68352, 68415 + 1), - "Inscriptional Parthian": range(68416, 68447 + 1), - "Inscriptional Pahlavi": range(68448, 68479 + 1), - "Psalter Pahlavi": range(68480, 68527 + 1), - "Old Turkic": range(68608, 68687 + 1), - "Old Hungarian": range(68736, 68863 + 1), - "Rumi Numeral Symbols": range(69216, 69247 + 1), - "Brahmi": range(69632, 69759 + 1), - "Kaithi": range(69760, 69839 + 1), - "Sora Sompeng": range(69840, 69887 + 1), - "Chakma": range(69888, 69967 + 1), - "Mahajani": range(69968, 70015 + 1), - "Sharada": range(70016, 70111 + 1), - "Sinhala Archaic Numbers": range(70112, 70143 + 1), - "Khojki": range(70144, 70223 + 1), - "Multani": range(70272, 70319 + 1), - "Khudawadi": range(70320, 70399 + 1), - "Grantha": range(70400, 70527 + 1), - "Newa": range(70656, 70783 + 1), - "Tirhuta": range(70784, 70879 + 1), - "Siddham": range(71040, 71167 + 1), - "Modi": range(71168, 71263 + 1), - "Mongolian Supplement": range(71264, 71295 + 1), - "Takri": range(71296, 71375 + 1), - "Ahom": range(71424, 71487 + 1), - "Warang Citi": range(71840, 71935 + 1), - "Zanabazar Square": range(72192, 72271 + 1), - "Soyombo": range(72272, 72367 + 1), - "Pau Cin Hau": range(72384, 72447 + 1), - "Bhaiksuki": range(72704, 72815 + 1), - "Marchen": range(72816, 72895 + 1), - "Masaram Gondi": range(72960, 73055 + 1), - "Cuneiform": range(73728, 74751 + 1), - "Cuneiform Numbers and Punctuation": range(74752, 74879 + 1), - "Early Dynastic Cuneiform": range(74880, 75087 + 1), - "Egyptian Hieroglyphs": range(77824, 78895 + 1), - "Anatolian Hieroglyphs": range(82944, 83583 + 1), - "Bamum Supplement": range(92160, 92735 + 1), - "Mro": range(92736, 92783 + 1), - "Bassa Vah": range(92880, 92927 + 1), - "Pahawh Hmong": range(92928, 93071 + 1), - "Miao": range(93952, 94111 + 1), - "Ideographic Symbols and Punctuation": range(94176, 94207 + 1), - "Tangut": range(94208, 100351 + 1), - "Tangut Components": range(100352, 101119 + 1), - "Kana Supplement": range(110592, 110847 + 1), - "Kana Extended-A": range(110848, 110895 + 1), - "Nushu": range(110960, 111359 + 1), - "Duployan": range(113664, 113823 + 1), - "Shorthand Format Controls": range(113824, 113839 + 1), - "Byzantine Musical Symbols": range(118784, 119039 + 1), - "Musical Symbols": range(119040, 119295 + 1), - "Ancient Greek Musical Notation": range(119296, 119375 + 1), - "Tai Xuan Jing Symbols": range(119552, 119647 + 1), - "Counting Rod Numerals": range(119648, 119679 + 1), - "Mathematical Alphanumeric Symbols": range(119808, 120831 + 1), - "Sutton SignWriting": range(120832, 121519 + 1), - "Glagolitic Supplement": range(122880, 122927 + 1), - "Mende Kikakui": range(124928, 125151 + 1), - "Adlam": range(125184, 125279 + 1), - "Arabic Mathematical Alphabetic Symbols": range(126464, 126719 + 1), - "Mahjong Tiles": range(126976, 127023 + 1), - "Domino Tiles": range(127024, 127135 + 1), - "Playing Cards": range(127136, 127231 + 1), - "Enclosed Alphanumeric Supplement": range(127232, 127487 + 1), - "Enclosed Ideographic Supplement": range(127488, 127743 + 1), - "Miscellaneous Symbols and Pictographs": range(127744, 128511 + 1), - "Emoticons range(Emoji)": range(128512, 128591 + 1), - "Ornamental Dingbats": range(128592, 128639 + 1), - "Transport and Map Symbols": range(128640, 128767 + 1), - "Alchemical Symbols": range(128768, 128895 + 1), - "Geometric Shapes Extended": range(128896, 129023 + 1), - "Supplemental Arrows-C": range(129024, 129279 + 1), - "Supplemental Symbols and Pictographs": range(129280, 129535 + 1), - "CJK Unified Ideographs Extension B": range(131072, 173791 + 1), - "CJK Unified Ideographs Extension C": range(173824, 177983 + 1), - "CJK Unified Ideographs Extension D": range(177984, 178207 + 1), - "CJK Unified Ideographs Extension E": range(178208, 183983 + 1), - "CJK Unified Ideographs Extension F": range(183984, 191471 + 1), - "CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1), - "Tags": range(917504, 917631 + 1), - "Variation Selectors Supplement": range(917760, 917999 + 1), + "Control character": range(32), + "Basic Latin": range(32, 128), + "Latin-1 Supplement": range(128, 256), + "Latin Extended-A": range(256, 384), + "Latin Extended-B": range(384, 592), + "IPA Extensions": range(592, 688), + "Spacing Modifier Letters": range(688, 768), + "Combining Diacritical Marks": range(768, 880), + "Greek and Coptic": range(880, 1024), + "Cyrillic": range(1024, 1280), + "Cyrillic Supplement": range(1280, 1328), + "Armenian": range(1328, 1424), + "Hebrew": range(1424, 1536), + "Arabic": range(1536, 1792), + "Syriac": range(1792, 1872), + "Arabic Supplement": range(1872, 1920), + "Thaana": range(1920, 1984), + "NKo": range(1984, 2048), + "Samaritan": range(2048, 2112), + "Mandaic": range(2112, 2144), + "Syriac Supplement": range(2144, 2160), + "Arabic Extended-B": range(2160, 2208), + "Arabic Extended-A": range(2208, 2304), + "Devanagari": range(2304, 2432), + "Bengali": range(2432, 2560), + "Gurmukhi": range(2560, 2688), + "Gujarati": range(2688, 2816), + "Oriya": range(2816, 2944), + "Tamil": range(2944, 3072), + "Telugu": range(3072, 3200), + "Kannada": range(3200, 3328), + "Malayalam": range(3328, 3456), + "Sinhala": range(3456, 3584), + "Thai": range(3584, 3712), + "Lao": range(3712, 3840), + "Tibetan": range(3840, 4096), + "Myanmar": range(4096, 4256), + "Georgian": range(4256, 4352), + "Hangul Jamo": range(4352, 4608), + "Ethiopic": range(4608, 4992), + "Ethiopic Supplement": range(4992, 5024), + "Cherokee": range(5024, 5120), + "Unified Canadian Aboriginal Syllabics": range(5120, 5760), + "Ogham": range(5760, 5792), + "Runic": range(5792, 5888), + "Tagalog": range(5888, 5920), + "Hanunoo": range(5920, 5952), + "Buhid": range(5952, 5984), + "Tagbanwa": range(5984, 6016), + "Khmer": range(6016, 6144), + "Mongolian": range(6144, 6320), + "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400), + "Limbu": range(6400, 6480), + "Tai Le": range(6480, 6528), + "New Tai Lue": range(6528, 6624), + "Khmer Symbols": range(6624, 6656), + "Buginese": range(6656, 6688), + "Tai Tham": range(6688, 6832), + "Combining Diacritical Marks Extended": range(6832, 6912), + "Balinese": range(6912, 7040), + "Sundanese": range(7040, 7104), + "Batak": range(7104, 7168), + "Lepcha": range(7168, 7248), + "Ol Chiki": range(7248, 7296), + "Cyrillic Extended-C": range(7296, 7312), + "Georgian Extended": range(7312, 7360), + "Sundanese Supplement": range(7360, 7376), + "Vedic Extensions": range(7376, 7424), + "Phonetic Extensions": range(7424, 7552), + "Phonetic Extensions Supplement": range(7552, 7616), + "Combining Diacritical Marks Supplement": range(7616, 7680), + "Latin Extended Additional": range(7680, 7936), + "Greek Extended": range(7936, 8192), + "General Punctuation": range(8192, 8304), + "Superscripts and Subscripts": range(8304, 8352), + "Currency Symbols": range(8352, 8400), + "Combining Diacritical Marks for Symbols": range(8400, 8448), + "Letterlike Symbols": range(8448, 8528), + "Number Forms": range(8528, 8592), + "Arrows": range(8592, 8704), + "Mathematical Operators": range(8704, 8960), + "Miscellaneous Technical": range(8960, 9216), + "Control Pictures": range(9216, 9280), + "Optical Character Recognition": range(9280, 9312), + "Enclosed Alphanumerics": range(9312, 9472), + "Box Drawing": range(9472, 9600), + "Block Elements": range(9600, 9632), + "Geometric Shapes": range(9632, 9728), + "Miscellaneous Symbols": range(9728, 9984), + "Dingbats": range(9984, 10176), + "Miscellaneous Mathematical Symbols-A": range(10176, 10224), + "Supplemental Arrows-A": range(10224, 10240), + "Braille Patterns": range(10240, 10496), + "Supplemental Arrows-B": range(10496, 10624), + "Miscellaneous Mathematical Symbols-B": range(10624, 10752), + "Supplemental Mathematical Operators": range(10752, 11008), + "Miscellaneous Symbols and Arrows": range(11008, 11264), + "Glagolitic": range(11264, 11360), + "Latin Extended-C": range(11360, 11392), + "Coptic": range(11392, 11520), + "Georgian Supplement": range(11520, 11568), + "Tifinagh": range(11568, 11648), + "Ethiopic Extended": range(11648, 11744), + "Cyrillic Extended-A": range(11744, 11776), + "Supplemental Punctuation": range(11776, 11904), + "CJK Radicals Supplement": range(11904, 12032), + "Kangxi Radicals": range(12032, 12256), + "Ideographic Description Characters": range(12272, 12288), + "CJK Symbols and Punctuation": range(12288, 12352), + "Hiragana": range(12352, 12448), + "Katakana": range(12448, 12544), + "Bopomofo": range(12544, 12592), + "Hangul Compatibility Jamo": range(12592, 12688), + "Kanbun": range(12688, 12704), + "Bopomofo Extended": range(12704, 12736), + "CJK Strokes": range(12736, 12784), + "Katakana Phonetic Extensions": range(12784, 12800), + "Enclosed CJK Letters and Months": range(12800, 13056), + "CJK Compatibility": range(13056, 13312), + "CJK Unified Ideographs Extension A": range(13312, 19904), + "Yijing Hexagram Symbols": range(19904, 19968), + "CJK Unified Ideographs": range(19968, 40960), + "Yi Syllables": range(40960, 42128), + "Yi Radicals": range(42128, 42192), + "Lisu": range(42192, 42240), + "Vai": range(42240, 42560), + "Cyrillic Extended-B": range(42560, 42656), + "Bamum": range(42656, 42752), + "Modifier Tone Letters": range(42752, 42784), + "Latin Extended-D": range(42784, 43008), + "Syloti Nagri": range(43008, 43056), + "Common Indic Number Forms": range(43056, 43072), + "Phags-pa": range(43072, 43136), + "Saurashtra": range(43136, 43232), + "Devanagari Extended": range(43232, 43264), + "Kayah Li": range(43264, 43312), + "Rejang": range(43312, 43360), + "Hangul Jamo Extended-A": range(43360, 43392), + "Javanese": range(43392, 43488), + "Myanmar Extended-B": range(43488, 43520), + "Cham": range(43520, 43616), + "Myanmar Extended-A": range(43616, 43648), + "Tai Viet": range(43648, 43744), + "Meetei Mayek Extensions": range(43744, 43776), + "Ethiopic Extended-A": range(43776, 43824), + "Latin Extended-E": range(43824, 43888), + "Cherokee Supplement": range(43888, 43968), + "Meetei Mayek": range(43968, 44032), + "Hangul Syllables": range(44032, 55216), + "Hangul Jamo Extended-B": range(55216, 55296), + "High Surrogates": range(55296, 56192), + "High Private Use Surrogates": range(56192, 56320), + "Low Surrogates": range(56320, 57344), + "Private Use Area": range(57344, 63744), + "CJK Compatibility Ideographs": range(63744, 64256), + "Alphabetic Presentation Forms": range(64256, 64336), + "Arabic Presentation Forms-A": range(64336, 65024), + "Variation Selectors": range(65024, 65040), + "Vertical Forms": range(65040, 65056), + "Combining Half Marks": range(65056, 65072), + "CJK Compatibility Forms": range(65072, 65104), + "Small Form Variants": range(65104, 65136), + "Arabic Presentation Forms-B": range(65136, 65280), + "Halfwidth and Fullwidth Forms": range(65280, 65520), + "Specials": range(65520, 65536), + "Linear B Syllabary": range(65536, 65664), + "Linear B Ideograms": range(65664, 65792), + "Aegean Numbers": range(65792, 65856), + "Ancient Greek Numbers": range(65856, 65936), + "Ancient Symbols": range(65936, 66000), + "Phaistos Disc": range(66000, 66048), + "Lycian": range(66176, 66208), + "Carian": range(66208, 66272), + "Coptic Epact Numbers": range(66272, 66304), + "Old Italic": range(66304, 66352), + "Gothic": range(66352, 66384), + "Old Permic": range(66384, 66432), + "Ugaritic": range(66432, 66464), + "Old Persian": range(66464, 66528), + "Deseret": range(66560, 66640), + "Shavian": range(66640, 66688), + "Osmanya": range(66688, 66736), + "Osage": range(66736, 66816), + "Elbasan": range(66816, 66864), + "Caucasian Albanian": range(66864, 66928), + "Vithkuqi": range(66928, 67008), + "Linear A": range(67072, 67456), + "Latin Extended-F": range(67456, 67520), + "Cypriot Syllabary": range(67584, 67648), + "Imperial Aramaic": range(67648, 67680), + "Palmyrene": range(67680, 67712), + "Nabataean": range(67712, 67760), + "Hatran": range(67808, 67840), + "Phoenician": range(67840, 67872), + "Lydian": range(67872, 67904), + "Meroitic Hieroglyphs": range(67968, 68000), + "Meroitic Cursive": range(68000, 68096), + "Kharoshthi": range(68096, 68192), + "Old South Arabian": range(68192, 68224), + "Old North Arabian": range(68224, 68256), + "Manichaean": range(68288, 68352), + "Avestan": range(68352, 68416), + "Inscriptional Parthian": range(68416, 68448), + "Inscriptional Pahlavi": range(68448, 68480), + "Psalter Pahlavi": range(68480, 68528), + "Old Turkic": range(68608, 68688), + "Old Hungarian": range(68736, 68864), + "Hanifi Rohingya": range(68864, 68928), + "Rumi Numeral Symbols": range(69216, 69248), + "Yezidi": range(69248, 69312), + "Arabic Extended-C": range(69312, 69376), + "Old Sogdian": range(69376, 69424), + "Sogdian": range(69424, 69488), + "Old Uyghur": range(69488, 69552), + "Chorasmian": range(69552, 69600), + "Elymaic": range(69600, 69632), + "Brahmi": range(69632, 69760), + "Kaithi": range(69760, 69840), + "Sora Sompeng": range(69840, 69888), + "Chakma": range(69888, 69968), + "Mahajani": range(69968, 70016), + "Sharada": range(70016, 70112), + "Sinhala Archaic Numbers": range(70112, 70144), + "Khojki": range(70144, 70224), + "Multani": range(70272, 70320), + "Khudawadi": range(70320, 70400), + "Grantha": range(70400, 70528), + "Newa": range(70656, 70784), + "Tirhuta": range(70784, 70880), + "Siddham": range(71040, 71168), + "Modi": range(71168, 71264), + "Mongolian Supplement": range(71264, 71296), + "Takri": range(71296, 71376), + "Ahom": range(71424, 71504), + "Dogra": range(71680, 71760), + "Warang Citi": range(71840, 71936), + "Dives Akuru": range(71936, 72032), + "Nandinagari": range(72096, 72192), + "Zanabazar Square": range(72192, 72272), + "Soyombo": range(72272, 72368), + "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384), + "Pau Cin Hau": range(72384, 72448), + "Devanagari Extended-A": range(72448, 72544), + "Bhaiksuki": range(72704, 72816), + "Marchen": range(72816, 72896), + "Masaram Gondi": range(72960, 73056), + "Gunjala Gondi": range(73056, 73136), + "Makasar": range(73440, 73472), + "Kawi": range(73472, 73568), + "Lisu Supplement": range(73648, 73664), + "Tamil Supplement": range(73664, 73728), + "Cuneiform": range(73728, 74752), + "Cuneiform Numbers and Punctuation": range(74752, 74880), + "Early Dynastic Cuneiform": range(74880, 75088), + "Cypro-Minoan": range(77712, 77824), + "Egyptian Hieroglyphs": range(77824, 78896), + "Egyptian Hieroglyph Format Controls": range(78896, 78944), + "Anatolian Hieroglyphs": range(82944, 83584), + "Bamum Supplement": range(92160, 92736), + "Mro": range(92736, 92784), + "Tangsa": range(92784, 92880), + "Bassa Vah": range(92880, 92928), + "Pahawh Hmong": range(92928, 93072), + "Medefaidrin": range(93760, 93856), + "Miao": range(93952, 94112), + "Ideographic Symbols and Punctuation": range(94176, 94208), + "Tangut": range(94208, 100352), + "Tangut Components": range(100352, 101120), + "Khitan Small Script": range(101120, 101632), + "Tangut Supplement": range(101632, 101760), + "Kana Extended-B": range(110576, 110592), + "Kana Supplement": range(110592, 110848), + "Kana Extended-A": range(110848, 110896), + "Small Kana Extension": range(110896, 110960), + "Nushu": range(110960, 111360), + "Duployan": range(113664, 113824), + "Shorthand Format Controls": range(113824, 113840), + "Znamenny Musical Notation": range(118528, 118736), + "Byzantine Musical Symbols": range(118784, 119040), + "Musical Symbols": range(119040, 119296), + "Ancient Greek Musical Notation": range(119296, 119376), + "Kaktovik Numerals": range(119488, 119520), + "Mayan Numerals": range(119520, 119552), + "Tai Xuan Jing Symbols": range(119552, 119648), + "Counting Rod Numerals": range(119648, 119680), + "Mathematical Alphanumeric Symbols": range(119808, 120832), + "Sutton SignWriting": range(120832, 121520), + "Latin Extended-G": range(122624, 122880), + "Glagolitic Supplement": range(122880, 122928), + "Cyrillic Extended-D": range(122928, 123024), + "Nyiakeng Puachue Hmong": range(123136, 123216), + "Toto": range(123536, 123584), + "Wancho": range(123584, 123648), + "Nag Mundari": range(124112, 124160), + "Ethiopic Extended-B": range(124896, 124928), + "Mende Kikakui": range(124928, 125152), + "Adlam": range(125184, 125280), + "Indic Siyaq Numbers": range(126064, 126144), + "Ottoman Siyaq Numbers": range(126208, 126288), + "Arabic Mathematical Alphabetic Symbols": range(126464, 126720), + "Mahjong Tiles": range(126976, 127024), + "Domino Tiles": range(127024, 127136), + "Playing Cards": range(127136, 127232), + "Enclosed Alphanumeric Supplement": range(127232, 127488), + "Enclosed Ideographic Supplement": range(127488, 127744), + "Miscellaneous Symbols and Pictographs": range(127744, 128512), + "Emoticons range(Emoji)": range(128512, 128592), + "Ornamental Dingbats": range(128592, 128640), + "Transport and Map Symbols": range(128640, 128768), + "Alchemical Symbols": range(128768, 128896), + "Geometric Shapes Extended": range(128896, 129024), + "Supplemental Arrows-C": range(129024, 129280), + "Supplemental Symbols and Pictographs": range(129280, 129536), + "Chess Symbols": range(129536, 129648), + "Symbols and Pictographs Extended-A": range(129648, 129792), + "Symbols for Legacy Computing": range(129792, 130048), + "CJK Unified Ideographs Extension B": range(131072, 173792), + "CJK Unified Ideographs Extension C": range(173824, 177984), + "CJK Unified Ideographs Extension D": range(177984, 178208), + "CJK Unified Ideographs Extension E": range(178208, 183984), + "CJK Unified Ideographs Extension F": range(183984, 191472), + "CJK Compatibility Ideographs Supplement": range(194560, 195104), + "CJK Unified Ideographs Extension G": range(196608, 201552), + "CJK Unified Ideographs Extension H": range(201552, 205744), + "Tags": range(917504, 917632), + "Variation Selectors Supplement": range(917760, 918000), + "Supplementary Private Use Area-A": range(983040, 1048576), + "Supplementary Private Use Area-B": range(1048576, 1114112), } @@ -331,11 +380,23 @@ RE_POSSIBLE_ENCODING_INDICATION = re_compile( IGNORECASE, ) +IANA_NO_ALIASES = [ + "cp720", + "cp737", + "cp856", + "cp874", + "cp875", + "cp1006", + "koi8_r", + "koi8_t", + "koi8_u", +] + IANA_SUPPORTED: List[str] = sorted( filter( lambda x: x.endswith("_codec") is False and x not in {"rot_13", "tactis", "mbcs"}, - list(set(aliases.values())), + list(set(aliases.values())) + IANA_NO_ALIASES, ) ) @@ -489,7 +550,1446 @@ COMMON_SAFE_ASCII_CHARACTERS: Set[str] = { KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"} ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"} -LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES) - # Logging LEVEL below DEBUG TRACE: int = 5 + + +# Language label that contain the em dash "—" +# character are to be considered alternative seq to origin +FREQUENCIES: Dict[str, List[str]] = { + "English": [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "u", + "m", + "f", + "p", + "g", + "w", + "y", + "b", + "v", + "k", + "x", + "j", + "z", + "q", + ], + "English—": [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "m", + "u", + "f", + "p", + "g", + "w", + "b", + "y", + "v", + "k", + "j", + "x", + "z", + "q", + ], + "German": [ + "e", + "n", + "i", + "r", + "s", + "t", + "a", + "d", + "h", + "u", + "l", + "g", + "o", + "c", + "m", + "b", + "f", + "k", + "w", + "z", + "p", + "v", + "ü", + "ä", + "ö", + "j", + ], + "French": [ + "e", + "a", + "s", + "n", + "i", + "t", + "r", + "l", + "u", + "o", + "d", + "c", + "p", + "m", + "é", + "v", + "g", + "f", + "b", + "h", + "q", + "à", + "x", + "è", + "y", + "j", + ], + "Dutch": [ + "e", + "n", + "a", + "i", + "r", + "t", + "o", + "d", + "s", + "l", + "g", + "h", + "v", + "m", + "u", + "k", + "c", + "p", + "b", + "w", + "j", + "z", + "f", + "y", + "x", + "ë", + ], + "Italian": [ + "e", + "i", + "a", + "o", + "n", + "l", + "t", + "r", + "s", + "c", + "d", + "u", + "p", + "m", + "g", + "v", + "f", + "b", + "z", + "h", + "q", + "è", + "à", + "k", + "y", + "ò", + ], + "Polish": [ + "a", + "i", + "o", + "e", + "n", + "r", + "z", + "w", + "s", + "c", + "t", + "k", + "y", + "d", + "p", + "m", + "u", + "l", + "j", + "ł", + "g", + "b", + "h", + "ą", + "ę", + "ó", + ], + "Spanish": [ + "e", + "a", + "o", + "n", + "s", + "r", + "i", + "l", + "d", + "t", + "c", + "u", + "m", + "p", + "b", + "g", + "v", + "f", + "y", + "ó", + "h", + "q", + "í", + "j", + "z", + "á", + ], + "Russian": [ + "о", + "а", + "е", + "и", + "н", + "с", + "т", + "р", + "в", + "л", + "к", + "м", + "д", + "п", + "у", + "г", + "я", + "ы", + "з", + "б", + "й", + "ь", + "ч", + "х", + "ж", + "ц", + ], + # Jap-Kanji + "Japanese": [ + "人", + "一", + "大", + "亅", + "丁", + "丨", + "竹", + "笑", + "口", + "日", + "今", + "二", + "彳", + "行", + "十", + "土", + "丶", + "寸", + "寺", + "時", + "乙", + "丿", + "乂", + "气", + "気", + "冂", + "巾", + "亠", + "市", + "目", + "儿", + "見", + "八", + "小", + "凵", + "県", + "月", + "彐", + "門", + "間", + "木", + "東", + "山", + "出", + "本", + "中", + "刀", + "分", + "耳", + "又", + "取", + "最", + "言", + "田", + "心", + "思", + "刂", + "前", + "京", + "尹", + "事", + "生", + "厶", + "云", + "会", + "未", + "来", + "白", + "冫", + "楽", + "灬", + "馬", + "尸", + "尺", + "駅", + "明", + "耂", + "者", + "了", + "阝", + "都", + "高", + "卜", + "占", + "厂", + "广", + "店", + "子", + "申", + "奄", + "亻", + "俺", + "上", + "方", + "冖", + "学", + "衣", + "艮", + "食", + "自", + ], + # Jap-Katakana + "Japanese—": [ + "ー", + "ン", + "ス", + "・", + "ル", + "ト", + "リ", + "イ", + "ア", + "ラ", + "ッ", + "ク", + "ド", + "シ", + "レ", + "ジ", + "タ", + "フ", + "ロ", + "カ", + "テ", + "マ", + "ィ", + "グ", + "バ", + "ム", + "プ", + "オ", + "コ", + "デ", + "ニ", + "ウ", + "メ", + "サ", + "ビ", + "ナ", + "ブ", + "ャ", + "エ", + "ュ", + "チ", + "キ", + "ズ", + "ダ", + "パ", + "ミ", + "ェ", + "ョ", + "ハ", + "セ", + "ベ", + "ガ", + "モ", + "ツ", + "ネ", + "ボ", + "ソ", + "ノ", + "ァ", + "ヴ", + "ワ", + "ポ", + "ペ", + "ピ", + "ケ", + "ゴ", + "ギ", + "ザ", + "ホ", + "ゲ", + "ォ", + "ヤ", + "ヒ", + "ユ", + "ヨ", + "ヘ", + "ゼ", + "ヌ", + "ゥ", + "ゾ", + "ヶ", + "ヂ", + "ヲ", + "ヅ", + "ヵ", + "ヱ", + "ヰ", + "ヮ", + "ヽ", + "゠", + "ヾ", + "ヷ", + "ヿ", + "ヸ", + "ヹ", + "ヺ", + ], + # Jap-Hiragana + "Japanese——": [ + "の", + "に", + "る", + "た", + "と", + "は", + "し", + "い", + "を", + "で", + "て", + "が", + "な", + "れ", + "か", + "ら", + "さ", + "っ", + "り", + "す", + "あ", + "も", + "こ", + "ま", + "う", + "く", + "よ", + "き", + "ん", + "め", + "お", + "け", + "そ", + "つ", + "だ", + "や", + "え", + "ど", + "わ", + "ち", + "み", + "せ", + "じ", + "ば", + "へ", + "び", + "ず", + "ろ", + "ほ", + "げ", + "む", + "べ", + "ひ", + "ょ", + "ゆ", + "ぶ", + "ご", + "ゃ", + "ね", + "ふ", + "ぐ", + "ぎ", + "ぼ", + "ゅ", + "づ", + "ざ", + "ぞ", + "ぬ", + "ぜ", + "ぱ", + "ぽ", + "ぷ", + "ぴ", + "ぃ", + "ぁ", + "ぇ", + "ぺ", + "ゞ", + "ぢ", + "ぉ", + "ぅ", + "ゐ", + "ゝ", + "ゑ", + "゛", + "゜", + "ゎ", + "ゔ", + "゚", + "ゟ", + "゙", + "ゕ", + "ゖ", + ], + "Portuguese": [ + "a", + "e", + "o", + "s", + "i", + "r", + "d", + "n", + "t", + "m", + "u", + "c", + "l", + "p", + "g", + "v", + "b", + "f", + "h", + "ã", + "q", + "é", + "ç", + "á", + "z", + "í", + ], + "Swedish": [ + "e", + "a", + "n", + "r", + "t", + "s", + "i", + "l", + "d", + "o", + "m", + "k", + "g", + "v", + "h", + "f", + "u", + "p", + "ä", + "c", + "b", + "ö", + "å", + "y", + "j", + "x", + ], + "Chinese": [ + "的", + "一", + "是", + "不", + "了", + "在", + "人", + "有", + "我", + "他", + "这", + "个", + "们", + "中", + "来", + "上", + "大", + "为", + "和", + "国", + "地", + "到", + "以", + "说", + "时", + "要", + "就", + "出", + "会", + "可", + "也", + "你", + "对", + "生", + "能", + "而", + "子", + "那", + "得", + "于", + "着", + "下", + "自", + "之", + "年", + "过", + "发", + "后", + "作", + "里", + "用", + "道", + "行", + "所", + "然", + "家", + "种", + "事", + "成", + "方", + "多", + "经", + "么", + "去", + "法", + "学", + "如", + "都", + "同", + "现", + "当", + "没", + "动", + "面", + "起", + "看", + "定", + "天", + "分", + "还", + "进", + "好", + "小", + "部", + "其", + "些", + "主", + "样", + "理", + "心", + "她", + "本", + "前", + "开", + "但", + "因", + "只", + "从", + "想", + "实", + ], + "Ukrainian": [ + "о", + "а", + "н", + "і", + "и", + "р", + "в", + "т", + "е", + "с", + "к", + "л", + "у", + "д", + "м", + "п", + "з", + "я", + "ь", + "б", + "г", + "й", + "ч", + "х", + "ц", + "ї", + ], + "Norwegian": [ + "e", + "r", + "n", + "t", + "a", + "s", + "i", + "o", + "l", + "d", + "g", + "k", + "m", + "v", + "f", + "p", + "u", + "b", + "h", + "å", + "y", + "j", + "ø", + "c", + "æ", + "w", + ], + "Finnish": [ + "a", + "i", + "n", + "t", + "e", + "s", + "l", + "o", + "u", + "k", + "ä", + "m", + "r", + "v", + "j", + "h", + "p", + "y", + "d", + "ö", + "g", + "c", + "b", + "f", + "w", + "z", + ], + "Vietnamese": [ + "n", + "h", + "t", + "i", + "c", + "g", + "a", + "o", + "u", + "m", + "l", + "r", + "à", + "đ", + "s", + "e", + "v", + "p", + "b", + "y", + "ư", + "d", + "á", + "k", + "ộ", + "ế", + ], + "Czech": [ + "o", + "e", + "a", + "n", + "t", + "s", + "i", + "l", + "v", + "r", + "k", + "d", + "u", + "m", + "p", + "í", + "c", + "h", + "z", + "á", + "y", + "j", + "b", + "ě", + "é", + "ř", + ], + "Hungarian": [ + "e", + "a", + "t", + "l", + "s", + "n", + "k", + "r", + "i", + "o", + "z", + "á", + "é", + "g", + "m", + "b", + "y", + "v", + "d", + "h", + "u", + "p", + "j", + "ö", + "f", + "c", + ], + "Korean": [ + "이", + "다", + "에", + "의", + "는", + "로", + "하", + "을", + "가", + "고", + "지", + "서", + "한", + "은", + "기", + "으", + "년", + "대", + "사", + "시", + "를", + "리", + "도", + "인", + "스", + "일", + ], + "Indonesian": [ + "a", + "n", + "e", + "i", + "r", + "t", + "u", + "s", + "d", + "k", + "m", + "l", + "g", + "p", + "b", + "o", + "h", + "y", + "j", + "c", + "w", + "f", + "v", + "z", + "x", + "q", + ], + "Turkish": [ + "a", + "e", + "i", + "n", + "r", + "l", + "ı", + "k", + "d", + "t", + "s", + "m", + "y", + "u", + "o", + "b", + "ü", + "ş", + "v", + "g", + "z", + "h", + "c", + "p", + "ç", + "ğ", + ], + "Romanian": [ + "e", + "i", + "a", + "r", + "n", + "t", + "u", + "l", + "o", + "c", + "s", + "d", + "p", + "m", + "ă", + "f", + "v", + "î", + "g", + "b", + "ș", + "ț", + "z", + "h", + "â", + "j", + ], + "Farsi": [ + "ا", + "ی", + "ر", + "د", + "ن", + "ه", + "و", + "م", + "ت", + "ب", + "س", + "ل", + "ک", + "ش", + "ز", + "ف", + "گ", + "ع", + "خ", + "ق", + "ج", + "آ", + "پ", + "ح", + "ط", + "ص", + ], + "Arabic": [ + "ا", + "ل", + "ي", + "م", + "و", + "ن", + "ر", + "ت", + "ب", + "ة", + "ع", + "د", + "س", + "ف", + "ه", + "ك", + "ق", + "أ", + "ح", + "ج", + "ش", + "ط", + "ص", + "ى", + "خ", + "إ", + ], + "Danish": [ + "e", + "r", + "n", + "t", + "a", + "i", + "s", + "d", + "l", + "o", + "g", + "m", + "k", + "f", + "v", + "u", + "b", + "h", + "p", + "å", + "y", + "ø", + "æ", + "c", + "j", + "w", + ], + "Serbian": [ + "а", + "и", + "о", + "е", + "н", + "р", + "с", + "у", + "т", + "к", + "ј", + "в", + "д", + "м", + "п", + "л", + "г", + "з", + "б", + "a", + "i", + "e", + "o", + "n", + "ц", + "ш", + ], + "Lithuanian": [ + "i", + "a", + "s", + "o", + "r", + "e", + "t", + "n", + "u", + "k", + "m", + "l", + "p", + "v", + "d", + "j", + "g", + "ė", + "b", + "y", + "ų", + "š", + "ž", + "c", + "ą", + "į", + ], + "Slovene": [ + "e", + "a", + "i", + "o", + "n", + "r", + "s", + "l", + "t", + "j", + "v", + "k", + "d", + "p", + "m", + "u", + "z", + "b", + "g", + "h", + "č", + "c", + "š", + "ž", + "f", + "y", + ], + "Slovak": [ + "o", + "a", + "e", + "n", + "i", + "r", + "v", + "t", + "s", + "l", + "k", + "d", + "m", + "p", + "u", + "c", + "h", + "j", + "b", + "z", + "á", + "y", + "ý", + "í", + "č", + "é", + ], + "Hebrew": [ + "י", + "ו", + "ה", + "ל", + "ר", + "ב", + "ת", + "מ", + "א", + "ש", + "נ", + "ע", + "ם", + "ד", + "ק", + "ח", + "פ", + "ס", + "כ", + "ג", + "ט", + "צ", + "ן", + "ז", + "ך", + ], + "Bulgarian": [ + "а", + "и", + "о", + "е", + "н", + "т", + "р", + "с", + "в", + "л", + "к", + "д", + "п", + "м", + "з", + "г", + "я", + "ъ", + "у", + "б", + "ч", + "ц", + "й", + "ж", + "щ", + "х", + ], + "Croatian": [ + "a", + "i", + "o", + "e", + "n", + "r", + "j", + "s", + "t", + "u", + "k", + "l", + "v", + "d", + "m", + "p", + "g", + "z", + "b", + "c", + "č", + "h", + "š", + "ž", + "ć", + "f", + ], + "Hindi": [ + "क", + "र", + "स", + "न", + "त", + "म", + "ह", + "प", + "य", + "ल", + "व", + "ज", + "द", + "ग", + "ब", + "श", + "ट", + "अ", + "ए", + "थ", + "भ", + "ड", + "च", + "ध", + "ष", + "इ", + ], + "Estonian": [ + "a", + "i", + "e", + "s", + "t", + "l", + "u", + "n", + "o", + "k", + "r", + "d", + "m", + "v", + "g", + "p", + "j", + "h", + "ä", + "b", + "õ", + "ü", + "f", + "c", + "ö", + "y", + ], + "Thai": [ + "า", + "น", + "ร", + "อ", + "ก", + "เ", + "ง", + "ม", + "ย", + "ล", + "ว", + "ด", + "ท", + "ส", + "ต", + "ะ", + "ป", + "บ", + "ค", + "ห", + "แ", + "จ", + "พ", + "ช", + "ข", + "ใ", + ], + "Greek": [ + "α", + "τ", + "ο", + "ι", + "ε", + "ν", + "ρ", + "σ", + "κ", + "η", + "π", + "ς", + "υ", + "μ", + "λ", + "ί", + "ό", + "ά", + "γ", + "έ", + "δ", + "ή", + "ω", + "χ", + "θ", + "ύ", + ], + "Tamil": [ + "க", + "த", + "ப", + "ட", + "ர", + "ம", + "ல", + "ன", + "வ", + "ற", + "ய", + "ள", + "ச", + "ந", + "இ", + "ண", + "அ", + "ஆ", + "ழ", + "ங", + "எ", + "உ", + "ஒ", + "ஸ", + ], + "Kazakh": [ + "а", + "ы", + "е", + "н", + "т", + "р", + "л", + "і", + "д", + "с", + "м", + "қ", + "к", + "о", + "б", + "и", + "у", + "ғ", + "ж", + "ң", + "з", + "ш", + "й", + "п", + "г", + "ө", + ], +} + +LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES) diff --git a/contrib/python/charset-normalizer/charset_normalizer/md.py b/contrib/python/charset-normalizer/charset_normalizer/md.py index 13aa062e71e..a6d9350c8b2 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/md.py +++ b/contrib/python/charset-normalizer/charset_normalizer/md.py @@ -9,7 +9,6 @@ from .constant import ( ) from .utils import ( is_accentuated, - is_ascii, is_case_variable, is_cjk, is_emoticon, @@ -419,7 +418,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin): return - if self._current_ascii_only is True and is_ascii(character) is False: + if self._current_ascii_only is True and character.isascii() is False: self._current_ascii_only = False if self._last_alpha_seen is not None: diff --git a/contrib/python/charset-normalizer/charset_normalizer/models.py b/contrib/python/charset-normalizer/charset_normalizer/models.py index 7f8ca389050..f3f7bcc8f9a 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/models.py +++ b/contrib/python/charset-normalizer/charset_normalizer/models.py @@ -54,16 +54,16 @@ class CharsetMatch: # Below 1% difference --> Use Coherence if chaos_difference < 0.01 and coherence_difference > 0.02: - # When having a tough decision, use the result that decoded as many multi-byte as possible. - if chaos_difference == 0.0 and self.coherence == other.coherence: - return self.multi_byte_usage > other.multi_byte_usage return self.coherence > other.coherence + elif chaos_difference < 0.01 and coherence_difference <= 0.02: + # When having a difficult decision, use the result that decoded as many multi-byte as possible. + return self.multi_byte_usage > other.multi_byte_usage return self.chaos < other.chaos @property def multi_byte_usage(self) -> float: - return 1.0 - len(str(self)) / len(self.raw) + return 1.0 - (len(str(self)) / len(self.raw)) def __str__(self) -> str: # Lazy Str Loading diff --git a/contrib/python/charset-normalizer/charset_normalizer/utils.py b/contrib/python/charset-normalizer/charset_normalizer/utils.py index bf2767a0e60..45a402e42f2 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/utils.py +++ b/contrib/python/charset-normalizer/charset_normalizer/utils.py @@ -70,15 +70,6 @@ def is_latin(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) -def is_ascii(character: str) -> bool: - try: - character.encode("ascii") - except UnicodeEncodeError: - return False - return True - - -@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_punctuation(character: str) -> bool: character_category: str = unicodedata.category(character) @@ -133,12 +124,6 @@ def is_case_variable(character: str) -> bool: return character.islower() != character.isupper() -def is_private_use_only(character: str) -> bool: - character_category: str = unicodedata.category(character) - - return character_category == "Co" - - @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_cjk(character: str) -> bool: try: @@ -205,7 +190,7 @@ def is_unprintable(character: str) -> bool: ) -def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]: +def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]: """ Extract using ASCII-only decoder any specified encoding in the first n-bytes. """ diff --git a/contrib/python/charset-normalizer/charset_normalizer/version.py b/contrib/python/charset-normalizer/charset_normalizer/version.py index 5eed49a42ab..db1ff57a1d4 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/version.py +++ b/contrib/python/charset-normalizer/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "3.2.0" +__version__ = "3.3.0" VERSION = __version__.split(".") diff --git a/contrib/python/charset-normalizer/ya.make b/contrib/python/charset-normalizer/ya.make index b8f141e28ba..c7d65632b06 100644 --- a/contrib/python/charset-normalizer/ya.make +++ b/contrib/python/charset-normalizer/ya.make @@ -2,7 +2,7 @@ PY3_LIBRARY() -VERSION(3.2.0) +VERSION(3.3.0) LICENSE(MIT) @@ -11,11 +11,11 @@ NO_LINT() PY_SRCS( TOP_LEVEL charset_normalizer/__init__.py + charset_normalizer/__main__.py charset_normalizer/api.py - charset_normalizer/assets/__init__.py charset_normalizer/cd.py charset_normalizer/cli/__init__.py - charset_normalizer/cli/normalizer.py + charset_normalizer/cli/__main__.py charset_normalizer/constant.py charset_normalizer/legacy.py charset_normalizer/md.py |