"""URL / email matching helpers for the GFM autolink extension. Ported from the Rust ``gfm_autolinks`` crate. """ from __future__ import annotations import unicodedata # --------------------------------------------------------------------------- # Character classification helpers # --------------------------------------------------------------------------- _VALID_PREV_CHARS = frozenset(" \t\r\n*_~(") def check_prev(ch: str) -> bool: """Return ``True`` if *ch* is a valid preceding character for an autolink.""" return ch in _VALID_PREV_CHARS def _is_valid_hostchar(ch: str) -> bool: """Return ``True`` if *ch* is valid inside a domain label (not whitespace/punctuation).""" if ch.isspace(): return False cat = unicodedata.category(ch) # Unicode punctuation categories: Pc, Pd, Pe, Pf, Pi, Po, Ps return not cat.startswith("P") # Characters that terminate a URL (before autolink_delim trimming). _SPACE_CHARS = frozenset(" \t\r\n\x00\x0b\x0c") def _isspace(ch: str) -> bool: return ch in _SPACE_CHARS _LINK_END_ASSORTMENT = frozenset("?!.,:*_~'\"[]") def _autolink_delim(data: str, link_end: int) -> int: """Trim trailing punctuation from a URL according to GFM rules.""" # Truncate at first '<' for i, ch in enumerate(data[:link_end]): if ch == "<": link_end = i break while link_end > 0: cclose = data[link_end - 1] copen = "(" if cclose == ")" else None if cclose in _LINK_END_ASSORTMENT: link_end -= 1 elif cclose == ";": new_end = link_end - 2 while new_end > 0 and data[new_end].isalpha(): new_end -= 1 if new_end < link_end - 2 and data[new_end] == "&": link_end = new_end else: link_end -= 1 elif copen is not None: opening = data[:link_end].count(copen) closing = data[:link_end].count(cclose) if closing <= opening: break link_end -= 1 else: break return link_end # --------------------------------------------------------------------------- # Domain validation # --------------------------------------------------------------------------- def _check_domain(data: str, allow_short: bool) -> int | None: """Validate a domain name and return the length consumed, or ``None``.""" if not data: return None np = 0 uscore1 = 0 uscore2 = 0 for i, ch in enumerate(data): if ch == "_": uscore2 += 1 elif ch == ".": uscore1 = uscore2 uscore2 = 0 np += 1 elif not _is_valid_hostchar(ch) and ch != "-": if uscore1 == 0 and uscore2 == 0 and (allow_short or np > 0): return i return None # else: valid hostchar or '-' if (uscore1 > 0 or uscore2 > 0) and np <= 10: return None if allow_short or np > 0: return len(data) return None # --------------------------------------------------------------------------- # www matching # --------------------------------------------------------------------------- _EMAIL_OK = frozenset(".+-_") def match_www(text: str) -> tuple[str, int] | None: """Match a bare ``www.`` URL at the start of *text*. Returns ``(url_with_scheme, char_count)`` or ``None``. """ if not text.startswith("www."): return None link_end = _check_domain(text[4:], False) if link_end is None: return None # link_end is offset from position 4 link_end += 4 # extend to the end of non-space characters while link_end < len(text) and not _isspace(text[link_end]): link_end += 1 link_end = _autolink_delim(text, link_end) matched = text[:link_end] url = "http://" + matched return url, len(matched) # --------------------------------------------------------------------------- # http(s):// matching # --------------------------------------------------------------------------- def match_http(text: str) -> tuple[str, int] | None: """Match an ``http://`` or ``https://`` URL at the start of *text*. Returns ``(url, char_count)`` or ``None``. """ if text.startswith("http://"): prefix_len = 7 elif text.startswith("https://"): prefix_len = 8 else: return None link_end = _check_domain(text[prefix_len:], True) if link_end is None: return None link_end += prefix_len while link_end < len(text) and not _isspace(text[link_end]): link_end += 1 link_end = _autolink_delim(text, link_end) url = text[:link_end] return url, len(url) # --------------------------------------------------------------------------- # Email matching # --------------------------------------------------------------------------- def match_email(text: str) -> tuple[str, int] | None: """Match an email address (optionally prefixed by ``mailto:``/``xmpp:``).""" pos = 0 protocol: str | None = None if text.startswith("mailto:"): protocol = "mailto" pos = 7 elif text.startswith("xmpp:"): protocol = "xmpp" pos = 5 return match_any_email(text, pos, protocol) def match_any_email( text: str, pos: int, protocol: str | None ) -> tuple[str, int] | None: """Match an email address in *text* starting the local-part scan at *pos*. *protocol* is ``"mailto"``, ``"xmpp"``, or ``None`` (bare address). Returns ``(url, char_count)`` or ``None``. """ size = len(text) # scan local part (before @) start_pos = pos while pos < size: ch = text[pos] if ch.isascii() and (ch.isalnum() or ch in _EMAIL_OK): pos += 1 continue if ch == "@": break return None if pos == start_pos: return None # scan domain (after @) link_end = pos + 1 np = 0 num_slash = 0 while link_end < size: ch = text[link_end] if ch.isascii() and ch.isalnum(): pass elif ch == "@": if protocol != "xmpp": return None elif ( ch == "." and link_end < size - 1 and text[link_end + 1].isascii() and text[link_end + 1].isalnum() ): np += 1 elif ch == "/" and protocol == "xmpp" and num_slash == 0: num_slash += 1 elif ch != "-" and ch != "_": break link_end += 1 if link_end < 2 or np == 0: return None last_ch = text[link_end - 1] if not (last_ch.isascii() and last_ch.isalpha()) and last_ch != ".": return None url = "mailto:" + text[:link_end] if protocol is None else text[:link_end] return url, link_end