diff options
author | arcadia-devtools <arcadia-devtools@yandex-team.ru> | 2022-06-20 18:39:30 +0300 |
---|---|---|
committer | arcadia-devtools <arcadia-devtools@yandex-team.ru> | 2022-06-20 18:39:30 +0300 |
commit | 798d25a291578fceb2223382b508fba1723fef4a (patch) | |
tree | 227b9a24000c40ae3354f4321ff9fe19143423f7 /contrib/python/charset-normalizer/charset_normalizer/utils.py | |
parent | d934aec555f13784eabe2d7682211050918e6cf5 (diff) | |
download | ydb-798d25a291578fceb2223382b508fba1723fef4a.tar.gz |
intermediate changes
ref:ac842eacda5e614f20cf9d3985d932732f92beab
Diffstat (limited to 'contrib/python/charset-normalizer/charset_normalizer/utils.py')
-rw-r--r-- | contrib/python/charset-normalizer/charset_normalizer/utils.py | 118 |
1 files changed, 95 insertions, 23 deletions
diff --git a/contrib/python/charset-normalizer/charset_normalizer/utils.py b/contrib/python/charset-normalizer/charset_normalizer/utils.py index dcb14dfee1..17eaee0408 100644 --- a/contrib/python/charset-normalizer/charset_normalizer/utils.py +++ b/contrib/python/charset-normalizer/charset_normalizer/utils.py @@ -1,4 +1,6 @@ try: + # WARNING: unicodedata2 support is going to be removed in 3.0 + # Python is quickly catching up. import unicodedata2 as unicodedata except ImportError: import unicodedata # type: ignore[no-redef] @@ -9,7 +11,7 @@ from codecs import IncrementalDecoder from encodings.aliases import aliases from functools import lru_cache from re import findall -from typing import List, Optional, Set, Tuple, Union +from typing import Generator, List, Optional, Set, Tuple, Union from _multibytecodec import MultibyteIncrementalDecoder # type: ignore @@ -26,7 +28,7 @@ from .constant import ( @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_accentuated(character: str) -> bool: try: - description = unicodedata.name(character) # type: str + description: str = unicodedata.name(character) except ValueError: return False return ( @@ -41,11 +43,11 @@ def is_accentuated(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def remove_accent(character: str) -> str: - decomposed = unicodedata.decomposition(character) # type: str + decomposed: str = unicodedata.decomposition(character) if not decomposed: return character - codes = decomposed.split(" ") # type: List[str] + codes: List[str] = decomposed.split(" ") return chr(int(codes[0], 16)) @@ -55,7 +57,7 @@ def unicode_range(character: str) -> Optional[str]: """ Retrieve the Unicode range official name from a single character. """ - character_ord = ord(character) # type: int + character_ord: int = ord(character) for range_name, ord_range in UNICODE_RANGES_COMBINED.items(): if character_ord in ord_range: @@ -67,12 +69,13 @@ def unicode_range(character: str) -> Optional[str]: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_latin(character: str) -> bool: try: - description = unicodedata.name(character) # type: str + description: str = unicodedata.name(character) except ValueError: return False return "LATIN" in description +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_ascii(character: str) -> bool: try: character.encode("ascii") @@ -83,12 +86,12 @@ def is_ascii(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_punctuation(character: str) -> bool: - character_category = unicodedata.category(character) # type: str + character_category: str = unicodedata.category(character) if "P" in character_category: return True - character_range = unicode_range(character) # type: Optional[str] + character_range: Optional[str] = unicode_range(character) if character_range is None: return False @@ -98,12 +101,12 @@ def is_punctuation(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_symbol(character: str) -> bool: - character_category = unicodedata.category(character) # type: str + character_category: str = unicodedata.category(character) if "S" in character_category or "N" in character_category: return True - character_range = unicode_range(character) # type: Optional[str] + character_range: Optional[str] = unicode_range(character) if character_range is None: return False @@ -113,7 +116,7 @@ def is_symbol(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_emoticon(character: str) -> bool: - character_range = unicode_range(character) # type: Optional[str] + character_range: Optional[str] = unicode_range(character) if character_range is None: return False @@ -126,7 +129,7 @@ def is_separator(character: str) -> bool: if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}: return True - character_category = unicodedata.category(character) # type: str + character_category: str = unicodedata.category(character) return "Z" in character_category @@ -137,7 +140,7 @@ def is_case_variable(character: str) -> bool: def is_private_use_only(character: str) -> bool: - character_category = unicodedata.category(character) # type: str + character_category: str = unicodedata.category(character) return character_category == "Co" @@ -197,6 +200,17 @@ def is_unicode_range_secondary(range_name: str) -> bool: return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD) +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_unprintable(character: str) -> bool: + return ( + character.isspace() is False # includes \n \t \r \v + and character.isprintable() is False + and character != "\x1A" # Why? Its the ASCII substitute character. + and character != b"\xEF\xBB\xBF".decode("utf_8") # bug discovered in Python, + # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space. + ) + + def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]: """ Extract using ASCII-only decoder any specified encoding in the first n-bytes. @@ -204,12 +218,12 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional if not isinstance(sequence, bytes): raise TypeError - seq_len = len(sequence) # type: int + seq_len: int = len(sequence) - results = findall( + results: List[str] = findall( RE_POSSIBLE_ENCODING_INDICATION, sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"), - ) # type: List[str] + ) if len(results) == 0: return None @@ -253,7 +267,7 @@ def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]: """ for iana_encoding in ENCODING_MARKS: - marks = ENCODING_MARKS[iana_encoding] # type: Union[bytes, List[bytes]] + marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding] if isinstance(marks, bytes): marks = [marks] @@ -283,10 +297,10 @@ def iana_name(cp_name: str, strict: bool = True) -> str: def range_scan(decoded_sequence: str) -> List[str]: - ranges = set() # type: Set[str] + ranges: Set[str] = set() for character in decoded_sequence: - character_range = unicode_range(character) # type: Optional[str] + character_range: Optional[str] = unicode_range(character) if character_range is None: continue @@ -304,13 +318,13 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float: decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore - id_a = decoder_a(errors="ignore") # type: IncrementalDecoder - id_b = decoder_b(errors="ignore") # type: IncrementalDecoder + id_a: IncrementalDecoder = decoder_a(errors="ignore") + id_b: IncrementalDecoder = decoder_b(errors="ignore") - character_match_count = 0 # type: int + character_match_count: int = 0 for i in range(255): - to_be_decoded = bytes([i]) # type: bytes + to_be_decoded: bytes = bytes([i]) if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded): character_match_count += 1 @@ -340,3 +354,61 @@ def set_logging_handler( handler = logging.StreamHandler() handler.setFormatter(logging.Formatter(format_string)) logger.addHandler(handler) + + +def cut_sequence_chunks( + sequences: bytes, + encoding_iana: str, + offsets: range, + chunk_size: int, + bom_or_sig_available: bool, + strip_sig_or_bom: bool, + sig_payload: bytes, + is_multi_byte_decoder: bool, + decoded_payload: Optional[str] = None, +) -> Generator[str, None, None]: + + if decoded_payload and is_multi_byte_decoder is False: + for i in offsets: + chunk = decoded_payload[i : i + chunk_size] + if not chunk: + break + yield chunk + else: + for i in offsets: + chunk_end = i + chunk_size + if chunk_end > len(sequences) + 8: + continue + + cut_sequence = sequences[i : i + chunk_size] + + if bom_or_sig_available and strip_sig_or_bom is False: + cut_sequence = sig_payload + cut_sequence + + chunk = cut_sequence.decode( + encoding_iana, + errors="ignore" if is_multi_byte_decoder else "strict", + ) + + # multi-byte bad cutting detector and adjustment + # not the cleanest way to perform that fix but clever enough for now. + if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80: + + chunk_partial_size_chk: int = min(chunk_size, 16) + + if ( + decoded_payload + and chunk[:chunk_partial_size_chk] not in decoded_payload + ): + for j in range(i, i - 4, -1): + cut_sequence = sequences[j:chunk_end] + + if bom_or_sig_available and strip_sig_or_bom is False: + cut_sequence = sig_payload + cut_sequence + + chunk = cut_sequence.decode(encoding_iana, errors="ignore") + + if chunk[:chunk_partial_size_chk] in decoded_payload: + break + + yield chunk |