aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/python/hypothesis/py3/hypothesis/extra/lark.py
blob: 9bab2c65ceec48b3bcea8c9caddd5b7b1ba576cf (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

"""
----------------
hypothesis[lark]
----------------

This extra can be used to generate strings matching any context-free grammar,
using the `Lark parser library <https://github.com/lark-parser/lark>`_.

It currently only supports Lark's native EBNF syntax, but we plan to extend
this to support other common syntaxes such as ANTLR and :rfc:`5234` ABNF.
Lark already `supports loading grammars
<https://lark-parser.readthedocs.io/en/latest/nearley.html>`_
from `nearley.js <https://nearley.js.org/>`_, so you may not have to write
your own at all.
"""

from inspect import signature
from typing import Dict, Optional

import lark
from lark.grammar import NonTerminal, Terminal

from hypothesis import strategies as st
from hypothesis.errors import InvalidArgument
from hypothesis.internal.conjecture.utils import calc_label_from_name
from hypothesis.internal.validation import check_type
from hypothesis.strategies._internal.utils import cacheable, defines_strategy

__all__ = ["from_lark"]


def get_terminal_names(terminals, rules, ignore_names):
    """Get names of all terminals in the grammar.

    The arguments are the results of calling ``Lark.grammar.compile()``,
    so you would think that the ``terminals`` and ``ignore_names`` would
    have it all... but they omit terminals created with ``@declare``,
    which appear only in the expansion(s) of nonterminals.
    """
    names = {t.name for t in terminals} | set(ignore_names)
    for rule in rules:
        names |= {t.name for t in rule.expansion if isinstance(t, Terminal)}
    return names


class LarkStrategy(st.SearchStrategy):
    """Low-level strategy implementation wrapping a Lark grammar.

    See ``from_lark`` for details.
    """

    def __init__(self, grammar, start, explicit):
        assert isinstance(grammar, lark.lark.Lark)
        if start is None:
            start = grammar.options.start
        if not isinstance(start, list):
            start = [start]
        self.grammar = grammar

        # This is a total hack, but working around the changes is a nicer user
        # experience than breaking for anyone who doesn't instantly update their
        # installation of Lark alongside Hypothesis.
        compile_args = signature(grammar.grammar.compile).parameters
        if "terminals_to_keep" in compile_args:
            terminals, rules, ignore_names = grammar.grammar.compile(start, ())
        elif "start" in compile_args:  # pragma: no cover
            # Support lark <= 0.10.0, without the terminals_to_keep argument.
            terminals, rules, ignore_names = grammar.grammar.compile(start)
        else:  # pragma: no cover
            # This branch is to support lark <= 0.7.1, without the start argument.
            terminals, rules, ignore_names = grammar.grammar.compile()

        self.names_to_symbols = {}

        for r in rules:
            t = r.origin
            self.names_to_symbols[t.name] = t

        for t in terminals:
            self.names_to_symbols[t.name] = Terminal(t.name)

        self.start = st.sampled_from([self.names_to_symbols[s] for s in start])

        self.ignored_symbols = tuple(self.names_to_symbols[n] for n in ignore_names)

        self.terminal_strategies = {
            t.name: st.from_regex(t.pattern.to_regexp(), fullmatch=True)
            for t in terminals
        }
        unknown_explicit = set(explicit) - get_terminal_names(
            terminals, rules, ignore_names
        )
        if unknown_explicit:
            raise InvalidArgument(
                "The following arguments were passed as explicit_strategies, "
                "but there is no such terminal production in this grammar: "
                + repr(sorted(unknown_explicit))
            )
        self.terminal_strategies.update(explicit)

        nonterminals = {}

        for rule in rules:
            nonterminals.setdefault(rule.origin.name, []).append(tuple(rule.expansion))

        for v in nonterminals.values():
            v.sort(key=len)

        self.nonterminal_strategies = {
            k: st.sampled_from(v) for k, v in nonterminals.items()
        }

        self.__rule_labels = {}

    def do_draw(self, data):
        state = []
        start = data.draw(self.start)
        self.draw_symbol(data, start, state)
        return "".join(state)

    def rule_label(self, name):
        try:
            return self.__rule_labels[name]
        except KeyError:
            return self.__rule_labels.setdefault(
                name, calc_label_from_name(f"LARK:{name}")
            )

    def draw_symbol(self, data, symbol, draw_state):
        if isinstance(symbol, Terminal):
            try:
                strategy = self.terminal_strategies[symbol.name]
            except KeyError:
                raise InvalidArgument(
                    "Undefined terminal %r. Generation does not currently support "
                    "use of %%declare unless you pass `explicit`, a dict of "
                    'names-to-strategies, such as `{%r: st.just("")}`'
                    % (symbol.name, symbol.name)
                ) from None
            draw_state.append(data.draw(strategy))
        else:
            assert isinstance(symbol, NonTerminal)
            data.start_example(self.rule_label(symbol.name))
            expansion = data.draw(self.nonterminal_strategies[symbol.name])
            for e in expansion:
                self.draw_symbol(data, e, draw_state)
                self.gen_ignore(data, draw_state)
            data.stop_example()

    def gen_ignore(self, data, draw_state):
        if self.ignored_symbols and data.draw_bits(2) == 3:
            emit = data.draw(st.sampled_from(self.ignored_symbols))
            self.draw_symbol(data, emit, draw_state)

    def calc_has_reusable_values(self, recur):
        return True


def check_explicit(name):
    def inner(value):
        check_type(str, value, "value drawn from " + name)
        return value

    return inner


@cacheable
@defines_strategy(force_reusable_values=True)
def from_lark(
    grammar: lark.lark.Lark,
    *,
    start: Optional[str] = None,
    explicit: Optional[Dict[str, st.SearchStrategy[str]]] = None,
) -> st.SearchStrategy[str]:
    """A strategy for strings accepted by the given context-free grammar.

    ``grammar`` must be a ``Lark`` object, which wraps an EBNF specification.
    The Lark EBNF grammar reference can be found
    `here <https://lark-parser.readthedocs.io/en/latest/grammar.html>`_.

    ``from_lark`` will automatically generate strings matching the
    nonterminal ``start`` symbol in the grammar, which was supplied as an
    argument to the Lark class.  To generate strings matching a different
    symbol, including terminals, you can override this by passing the
    ``start`` argument to ``from_lark``.  Note that Lark may remove unreachable
    productions when the grammar is compiled, so you should probably pass the
    same value for ``start`` to both.

    Currently ``from_lark`` does not support grammars that need custom lexing.
    Any lexers will be ignored, and any undefined terminals from the use of
    ``%declare`` will result in generation errors.  To define strategies for
    such terminals, pass a dictionary mapping their name to a corresponding
    strategy as the ``explicit`` argument.

    The :pypi:`hypothesmith` project includes a strategy for Python source,
    based on a grammar and careful post-processing.
    """
    check_type(lark.lark.Lark, grammar, "grammar")
    if explicit is None:
        explicit = {}
    else:
        check_type(dict, explicit, "explicit")
        explicit = {
            k: v.map(check_explicit(f"explicit[{k!r}]={v!r}"))
            for k, v in explicit.items()
        }
    return LarkStrategy(grammar, start, explicit)