summaryrefslogtreecommitdiffstats
path: root/contrib/python/mdit-py-plugins/mdit_py_plugins/gfm_autolink/index.py
blob: 48de4b728f0c611678da766cf1fdf9ae500a5e6a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
"""GFM autolink extension rules.

Three inline scanners are registered:

- **gfm_autolink_www** (char ``w``): bare ``www.`` URLs.
  Uses ``add_terminator_char("w")`` so the text scanner interrupts at ``w``.
- **gfm_autolink_protocol** (char ``:``): ``http://``, ``https://``,
  ``mailto:``, ``xmpp:`` URLs via back-scanning ``pending``.
- **gfm_autolink_email** (char ``@``): bare email addresses via
  back-scanning ``pending``.

Since ``:`` and ``@`` are already default terminator characters in
markdown-it-py, the protocol and email rules are invoked at every occurrence
of those characters. They use a *back-scanning* approach: looking backwards
through ``state.pending`` for a protocol prefix or email local-part that was
accumulated by the text rule. This means every ``:`` and ``@`` in the
document incurs a (cheap) regex check or character scan of pending text.

The trade-off vs. a **core-rule** (post-processing) approach — which would
walk the final token stream, find autolink patterns in text tokens, and
split them — is:

- **Inline approach** (current): simpler, integrates naturally with
  ``state.linkLevel`` to suppress matching inside links, but relies on the
  prefix being present in ``state.pending`` (if a prior inline rule consumed
  part of the prefix, matching would fail — unlikely in practice).
- **Core-rule approach**: guaranteed to find all autolinks regardless of
  inline rule ordering, but requires token-stream surgery (splitting text
  tokens and inserting link tokens) and cannot easily interact with nesting
  guards like ``linkLevel``.

The ``w`` terminator is the only *new* terminator added. It causes the text
rule to interrupt at every ``w``, which is a minor performance cost for
documents heavy in that letter, but necessary since ``www.`` must be matched
from the start of the URL.

Specification: https://github.github.com/gfm/#autolinks-extension-

.. versionadded:: 0.5.0

Requires markdown-it-py ≥ 4.1.0.
"""

from __future__ import annotations

import re

from markdown_it import MarkdownIt
from markdown_it.rules_inline import StateInline

from ._match import check_prev, match_any_email, match_http, match_www

# Regex to back-scan pending text for a protocol name ending at the current
# position (the colon character).  Group 1 captures the protocol name
# (``http``, ``https``, ``mailto`` or ``xmpp``) and the trailing ``$`` anchors
# it to the end of the pending text.  The leading ``(?:^|.)`` accepts either
# the start of the text or any single character (``re.DOTALL``), so it places
# no restriction on what precedes the name; that check is performed separately
# by ``_preceding_ok`` inside ``_protocol_rule``.
_PROTO_RE = re.compile(r"(?:^|.)(https?|mailto|xmpp)$", re.DOTALL)


def gfm_autolink_plugin(md: MarkdownIt) -> None:
    """Register the GFM autolink inline rules on *md*.

    Three rules are appended to the inline ruler, in this order:
    ``gfm_autolink_www``, ``gfm_autolink_protocol`` and
    ``gfm_autolink_email``.  Together they recognise bare ``www.`` URLs,
    ``http(s)://`` URLs, ``mailto:``/``xmpp:`` links, and bare email
    addresses.  ``w`` is additionally registered as a terminator character.

    Requires markdown-it-py ≥ 4.1.0; a ``RuntimeError`` is raised when
    ``md.inline`` does not provide ``add_terminator_char``.
    """
    inline = md.inline
    if not hasattr(inline, "add_terminator_char"):
        raise RuntimeError("gfm_autolink_plugin requires markdown-it-py >= 4.1.0")

    inline.add_terminator_char("w")

    # Registration order matters: each rule is pushed to the end of the ruler.
    rules = (
        ("gfm_autolink_www", _www_inline_rule),
        ("gfm_autolink_protocol", _protocol_rule),
        ("gfm_autolink_email", _email_rule),
    )
    for rule_name, rule in rules:
        inline.ruler.push(rule_name, rule)


# ---------------------------------------------------------------------------
# Helpers (inline rules)
# ---------------------------------------------------------------------------


def _preceding_ok(state: StateInline, bscan_len: int) -> bool:
    """Tell whether an autolink may begin *bscan_len* characters before ``state.pos``.

    A candidate that begins at (or before) the start of ``state.src`` is
    always accepted.  Otherwise the source character immediately before the
    candidate is handed to ``check_prev`` and its verdict is returned.
    """
    start = state.pos - bscan_len
    if start > 0:
        return check_prev(state.src[start - 1])
    # Nothing precedes the candidate, so there is no character to reject.
    return True


def _create_autolink(
    state: StateInline,
    bscan_len: int,
    total_len: int,
    url: str,
    text: str,
) -> bool:
    """Emit ``link_open`` / ``text`` / ``link_close`` tokens.

    *url* is normalised with ``state.md.normalizeLink`` and checked with
    ``state.md.validateLink``.  If the link is rejected, ``False`` is
    returned and *state* is left completely untouched, so the caller can
    report "no match" without having lost any pending text.

    On success *bscan_len* characters are trimmed from the end of
    ``state.pending`` (the back-scanned protocol or local part), the three
    tokens are pushed, the parser position is advanced by
    ``total_len - bscan_len`` characters, and ``True`` is returned.
    """
    full_url = state.md.normalizeLink(url)
    # Validate *before* mutating ``state.pending``: a rejected link must not
    # swallow the back-scanned characters.
    if not state.md.validateLink(full_url):
        return False

    if bscan_len:
        # The back-scanned prefix becomes part of the link text instead of
        # plain pending text.
        state.pending = state.pending[:-bscan_len]

    token = state.push("link_open", "a", 1)
    token.attrs = {"href": full_url}
    token.markup = "autolink"
    token.info = "auto"

    token = state.push("text", "", 0)
    token.content = state.md.normalizeLinkText(text)

    token = state.push("link_close", "a", -1)
    token.markup = "autolink"
    token.info = "auto"

    # Only the part after the back-scanned prefix lies ahead of ``state.pos``.
    state.pos += total_len - bscan_len
    return True


# ---------------------------------------------------------------------------
# www inline rule  (requires add_terminator_char("w") — markdown-it-py >= 4.1.0)
# ---------------------------------------------------------------------------


def _www_inline_rule(state: StateInline, silent: bool) -> bool:
    """Match ``www.`` autolinks as an inline rule (trigger char: ``w``).

    The rule never matches while ``state.linkLevel`` is positive.  The text
    at ``state.pos`` must start with ``www.``, the preceding character (if
    any) must be accepted by ``check_prev``, ``match_www`` must recognise a
    URL, and the normalised URL must pass ``state.md.validateLink``.

    On a match ``state.pos`` is advanced past the link and ``True`` is
    returned.  Tokens are only pushed when *silent* is false; in silent
    (validation) mode the position is still advanced, so both modes agree on
    the outcome and a successful rule always consumes input.
    """
    if state.linkLevel > 0:
        return False

    pos = state.pos
    src = state.src

    # Quick check: the text between pos and posMax must begin with "www.".
    if not src.startswith("www.", pos, state.posMax):
        return False

    # Check preceding character (from pending text or the source itself).
    if state.pending:
        if not check_prev(state.pending[-1]):
            return False
    elif pos > 0:
        if not check_prev(src[pos - 1]):
            return False

    result = match_www(src[pos : state.posMax])
    if result is None:
        return False

    url, length = result

    # Validate in both modes so silent and normal runs return the same answer.
    full_url = state.md.normalizeLink(url)
    if not state.md.validateLink(full_url):
        return False

    if not silent:
        token = state.push("link_open", "a", 1)
        token.attrs = {"href": full_url}
        token.markup = "autolink"
        token.info = "auto"

        token = state.push("text", "", 0)
        token.content = state.md.normalizeLinkText(src[pos : pos + length])

        token = state.push("link_close", "a", -1)
        token.markup = "autolink"
        token.info = "auto"

    # A matching inline rule must consume input even in silent mode.
    state.pos += length
    return True


# ---------------------------------------------------------------------------
# Protocol scanner  (trigger char: ':')
# ---------------------------------------------------------------------------


def _protocol_rule(state: StateInline, silent: bool) -> bool:
    """Match ``http(s)://``, ``mailto:`` and ``xmpp:`` autolinks (trigger char: ``:``).

    The protocol name is not ahead of ``state.pos`` but at the end of
    ``state.pending``; it is found with ``_PROTO_RE`` and glued back onto the
    remaining source text before matching.  ``mailto``/``xmpp`` candidates go
    through ``match_any_email``, everything else through ``match_http``.

    On a match ``True`` is returned and ``state.pos`` is advanced past the
    part of the link that lies after the back-scanned protocol name.  Tokens
    are only emitted (via ``_create_autolink``) when *silent* is false; in
    silent mode the link is still validated and the position still advanced
    so that both modes agree.
    """
    if state.linkLevel > 0:
        return False

    remaining = state.src[state.pos : state.posMax]

    # Must start with ':' and have at least 3 more characters.
    if len(remaining) < 4 or remaining[0] != ":":
        return False

    # Back-scan pending text for a known protocol name.
    m = _PROTO_RE.search(state.pending)
    if m is None:
        return False

    proto = m.group(1)
    bscan_len = len(proto)

    if not _preceding_ok(state, bscan_len):
        return False

    # Combine back-scanned protocol with the remaining text.
    combined = proto + remaining

    if proto in ("mailto", "xmpp"):
        # ``bscan_len + 1`` is the index just past the colon in ``combined``.
        result = match_any_email(combined, bscan_len + 1, proto)
    else:
        result = match_http(combined)

    if result is None:
        return False

    full_url, total_len = result

    if silent:
        # Mirror the validation done by ``_create_autolink`` and consume the
        # same amount of input, without touching ``pending`` or pushing tokens.
        if not state.md.validateLink(state.md.normalizeLink(full_url)):
            return False
        state.pos += total_len - bscan_len
        return True

    label = combined[:total_len]
    return _create_autolink(state, bscan_len, total_len, full_url, label)


# ---------------------------------------------------------------------------
# Bare email scanner  (trigger char: '@')
# ---------------------------------------------------------------------------


def _email_rule(state: StateInline, silent: bool) -> bool:
    """Match bare email addresses (trigger char: ``@``).

    The local part is back-scanned from the end of ``state.pending`` (ASCII
    alphanumerics plus ``.``, ``+``, ``-``, ``_``); the domain is scanned
    forward from the character after ``@``.  The domain must contain at
    least one ``.`` that is followed by an ASCII alphanumeric, must not
    contain a second ``@``, and must not end in ``-`` or ``_``.

    On a match ``True`` is returned and ``state.pos`` is advanced past the
    ``@`` and the domain.  Tokens are only emitted (via
    ``_create_autolink``, with a ``mailto:`` URL) when *silent* is false; in
    silent mode the link is still validated and the position still advanced
    so that both modes agree.
    """
    if state.linkLevel > 0:
        return False

    pos = state.pos
    if pos >= state.posMax or state.src[pos] != "@":
        return False
    # Need at least one character after '@'.
    if pos + 1 >= state.posMax:
        return False

    # Back-scan pending text for the local part of the email.
    pending = state.pending
    local_len = 0
    for ch in reversed(pending):
        if not (ch.isascii() and (ch.isalnum() or ch in ".+-_")):
            break
        local_len += 1

    if local_len == 0:
        return False

    if not _preceding_ok(state, local_len):
        return False

    # Forward-scan for the domain part.
    after_at = state.src[pos + 1 : state.posMax]
    domain_len = 0
    num_period = 0
    for i, ch in enumerate(after_at):
        if ch.isascii() and ch.isalnum():
            pass
        elif ch == "@":
            # A second '@' disqualifies the whole candidate.
            return False
        elif (
            ch == "."
            and i + 1 < len(after_at)
            and after_at[i + 1].isascii()
            and after_at[i + 1].isalnum()
        ):
            num_period += 1
        elif ch != "-" and ch != "_":
            # Any other character (including a '.' not followed by an
            # alphanumeric) ends the domain.
            break
        domain_len += 1

    if domain_len == 0 or num_period == 0:
        return False

    last_ch = after_at[domain_len - 1]
    if not (last_ch.isascii() and last_ch.isalnum()) and last_ch != ".":
        return False

    local_part = pending[-local_len:]
    email_text = local_part + state.src[pos : pos + 1 + domain_len]
    total_len = local_len + 1 + domain_len
    url = "mailto:" + email_text

    if silent:
        # Mirror the validation done by ``_create_autolink`` and consume the
        # same amount of input, without touching ``pending`` or pushing tokens.
        if not state.md.validateLink(state.md.normalizeLink(url)):
            return False
        state.pos += total_len - local_len
        return True

    return _create_autolink(state, local_len, total_len, url, email_text)