contrib/python/markdown-it-py/markdown_it/rules_core/replacements.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

"""Simple typographic replacements

* ``(c)``, ``(C)`` → ©
* ``(tm)``, ``(TM)`` → ™
* ``(r)``, ``(R)`` → ®
* ``+-`` → ±
* ``...`` → …
* ``?....`` → ?..
* ``!....`` → !..
* ``????????`` → ???
* ``!!!!!`` → !!!
* ``,,,`` → ,
* ``--`` → – (en dash)
* ``---`` → — (em dash)
"""

from __future__ import annotations

import logging
import re

from ..token import Token
from .state_core import StateCore

LOGGER = logging.getLogger(__name__)

# TODO:
# - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾
# - multiplication 2 x 4 -> 2 × 4

# Cheap pre-check: matches if the text contains any sequence the "rare"
# replacements care about (``+-``, ``..``, ``????``, ``!!!!``, ``,,``, ``--``).
# Used to skip the individual substitutions below when nothing can match.
RARE_RE = re.compile(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--")

# Workaround for phantomjs - need regex without /g flag,
# or root check will fail every second time
# SCOPED_ABBR_TEST_RE = r"\((c|tm|r)\)"
# NOTE(review): the ``/g`` flag is JavaScript regex syntax; this note does not
# appear to apply to Python's ``re`` module - confirm before removing.

# ``(c)``, ``(tm)``, ``(r)`` in any letter case; group 1 is the abbreviation.
SCOPED_ABBR_RE = re.compile(r"\((c|tm|r)\)", flags=re.IGNORECASE)

# Literal ``+-``.
PLUS_MINUS_RE = re.compile(r"\+-")

# Two or more consecutive dots.
ELLIPSIS_RE = re.compile(r"\.{2,}")

# ``?`` or ``!`` immediately followed by the ellipsis character; group 1 is
# the punctuation mark.
ELLIPSIS_QUESTION_EXCLAMATION_RE = re.compile(r"([?!])…")

# A run of four or more ``?``/``!`` characters (the run may mix both).
# Because the group is repeated, group 1 holds the last character of the run.
QUESTION_EXCLAMATION_RE = re.compile(r"([?!]){4,}")

# Two or more consecutive commas.
COMMA_RE = re.compile(r",{2,}")

# Exactly three hyphens: preceded by a line start or a non-hyphen (captured
# as group 1 so the substitution can re-emit it) and followed by a non-hyphen
# or a line end.
EM_DASH_RE = re.compile(r"(^|[^-])---(?=[^-]|$)", flags=re.MULTILINE)

# ``--`` preceded by a line start or whitespace (group 1) and followed by
# whitespace or a line end.
EN_DASH_RE = re.compile(r"(^|\s)--(?=\s|$)", flags=re.MULTILINE)

# ``--`` preceded by a line start or a character that is neither a hyphen nor
# whitespace (group 1), and followed by such a character or a line end.
EN_DASH_INDENT_RE = re.compile(r"(^|[^-\s])--(?=[^-\s]|$)", flags=re.MULTILINE)


# Lower-cased abbreviation (group 1 of SCOPED_ABBR_RE) -> replacement symbol.
SCOPED_ABBR = {"c": "©", "r": "®", "tm": "™"}


def replaceFn(match: re.Match[str]) -> str:
    """Return the symbol for a matched scoped abbreviation.

    Group 1 of ``match`` is lower-cased and looked up in ``SCOPED_ABBR``, so
    ``(C)`` and ``(c)`` both yield the same symbol.
    """
    abbreviation = match.group(1)
    symbol = SCOPED_ABBR[abbreviation.lower()]
    return symbol


def replace_scoped(inlineTokens: list[Token]) -> None:
    """Replace ``(c)``, ``(tm)`` and ``(r)`` in text tokens, in place.

    Text tokens located between a ``link_open`` and a ``link_close`` token
    whose ``info`` is ``"auto"`` are left untouched.
    """
    # Counts currently open auto-links; only "zero vs. non-zero" is ever
    # tested, so the direction of counting does not affect the outcome.
    autolink_depth = 0

    for tok in inlineTokens:
        kind = tok.type
        if kind == "text":
            if autolink_depth == 0:
                tok.content = SCOPED_ABBR_RE.sub(replaceFn, tok.content)
        elif kind in ("link_open", "link_close") and tok.info == "auto":
            autolink_depth += 1 if kind == "link_open" else -1


def replace_rare(inlineTokens: list[Token]) -> None:
    """Apply the punctuation replacements to text tokens, in place.

    A text token is rewritten only if it is outside an auto-link (a
    ``link_open``/``link_close`` pair whose ``info`` is ``"auto"``) and its
    content matches ``RARE_RE``.
    """
    # Applied strictly in this order: the "?…"/"!…" rule relies on the
    # ellipsis rule having already turned runs of dots into "…".
    substitutions = (
        # +- -> ±
        (PLUS_MINUS_RE, "±"),
        # .., ..., ....... -> …
        (ELLIPSIS_RE, "…"),
        # but ?..... & !..... -> ?.. & !..
        (ELLIPSIS_QUESTION_EXCLAMATION_RE, "\\1.."),
        (QUESTION_EXCLAMATION_RE, "\\1\\1\\1"),
        # ,,  ,,,  ,,,, -> ,
        (COMMA_RE, ","),
        # em-dash
        (EM_DASH_RE, "\\1\u2014"),
        # en-dash
        (EN_DASH_RE, "\\1\u2013"),
        (EN_DASH_INDENT_RE, "\\1\u2013"),
    )

    # Counts currently open auto-links; only "zero vs. non-zero" is ever
    # tested, so the direction of counting does not affect the outcome.
    autolink_depth = 0

    for tok in inlineTokens:
        kind = tok.type
        if kind == "text":
            if autolink_depth == 0 and RARE_RE.search(tok.content):
                text = tok.content
                for pattern, replacement in substitutions:
                    text = pattern.sub(replacement, text)
                tok.content = text
        elif kind in ("link_open", "link_close") and tok.info == "auto":
            autolink_depth += 1 if kind == "link_open" else -1


def replace(state: StateCore) -> None:
    """Run the typographic replacements over every inline token's children.

    Does nothing unless ``state.md.options.typographer`` is truthy. For each
    token of type ``"inline"`` that has children, each replacement pass is
    run only when its trigger pattern is found in the token's own content.
    """
    if not state.md.options.typographer:
        return

    # (trigger pattern, pass to run on the children), in execution order.
    passes = (
        (SCOPED_ABBR_RE, replace_scoped),
        (RARE_RE, replace_rare),
    )

    for block_token in state.tokens:
        if block_token.type != "inline" or block_token.children is None:
            continue

        children = block_token.children
        for trigger, handler in passes:
            if trigger.search(block_token.content):
                handler(children)