aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/tools/cython/Cython/Plex/Lexicons.py
blob: eaacbc9c544530ae497be719e26c1fb5835a48da (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#=======================================================================
#
#   Python Lexical Analyser
#
#   Lexical Analyser Specification
#
#=======================================================================

from __future__ import absolute_import

import types

from . import Actions
from . import DFA
from . import Errors
from . import Machines
from . import Regexps

# debug_flags for Lexicon constructor
DUMP_NFA = 1
DUMP_DFA = 2


class State(object):
    """
    Marks a group of token definitions as belonging to a named,
    user-defined scanner state inside a Plex.Lexicon specification.

    Constructor:

       State(name, token_specifications)

    |name| is the string naming the state; |token_specifications| is
    the list of (pattern, action) token definitions active in it.
    """

    # Class-level defaults so both attributes exist even on a bare instance.
    name = None
    tokens = None

    def __init__(self, name, tokens):
        self.tokens = tokens
        self.name = name

 
class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

       1) A token definition, which is a tuple:

             (pattern, action)

          The |pattern| is a regular expression built using the
          constructors defined in the Plex module.

          The |action| is the action to be performed when this pattern
          is recognised (see below).

       2) A state definition:

             State(name, tokens)

          where |name| is a character string naming the state,
          and |tokens| is a list of token definitions as
          above. The meaning and usage of states is described
          below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

       1) A function, which is called as follows:

             function(scanner, text)

          where |scanner| is the relevant Scanner instance, and |text|
          is the matched text. If the function returns anything
          other than None, that value is returned as the value of the
          token. If it returns None, scanning continues as if the IGNORE
          action were specified (see below).

        2) One of the following special actions:

           IGNORE means that the recognised characters will be treated as
                  white space and ignored. Scanning will continue until
                  the next non-ignored token is recognised before returning.

           TEXT   causes the scanned text itself to be returned as the
                  value of the token.

        3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

       1) Using Begin(state_name) as the action of a token.

       2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
    """

    machine = None  # the DFA produced by DFA.nfa_to_dfa in __init__
    tables = None   # StateTableMachine

    def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
        """
        Build the analyser's DFA from |specifications|.

        specifications -- list of (pattern, action) tuples and/or State
                          instances (see class docstring).
        debug          -- optional writable file-like object; when set, the
                          NFA and/or DFA are dumped to it per |debug_flags|.
        debug_flags    -- bitmask: bit 0 (DUMP_NFA) dumps the NFA, bit 1
                          (DUMP_DFA) dumps the DFA; the default 7 enables
                          both.
        timings        -- optional writable file-like object that receives
                          construction timing figures.

        Raises Errors.InvalidScanner if |specifications| is not a list, and
        Errors.InvalidToken for items that are neither tuples nor States.
        """
        if not isinstance(specifications, list):
            raise Errors.InvalidScanner("Scanner definition is not a list")
        if timings:
            # Imported lazily so the Timing helper is only needed when
            # timing output was actually requested.
            from .Timing import time

            total_time = 0.0
            time1 = time()
        nfa = Machines.Machine()
        default_initial_state = nfa.new_initial_state('')
        # token_number is 1-based and shared across all states; it also
        # determines match priority (earlier definitions win — see
        # add_token_to_machine, which uses priority=-token_number).
        token_number = 1
        for spec in specifications:
            if isinstance(spec, State):
                # A State contributes its own initial NFA state holding
                # all of that state's token definitions.
                user_initial_state = nfa.new_initial_state(spec.name)
                for token in spec.tokens:
                    self.add_token_to_machine(
                        nfa, user_initial_state, token, token_number)
                    token_number += 1
            elif isinstance(spec, tuple):
                # Bare (pattern, action) tuples belong to the default
                # (empty-string-named) state.
                self.add_token_to_machine(
                    nfa, default_initial_state, spec, token_number)
                token_number += 1
            else:
                raise Errors.InvalidToken(
                    token_number,
                    "Expected a token definition (tuple) or State instance")
        if timings:
            time2 = time()
            total_time = total_time + (time2 - time1)
            time3 = time()
        if debug and (debug_flags & 1):
            debug.write("\n============= NFA ===========\n")
            nfa.dump(debug)
        # Pass the debug stream through only when BOTH dump bits are set.
        dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug)
        if timings:
            time4 = time()
            total_time = total_time + (time4 - time3)
        if debug and (debug_flags & 2):
            debug.write("\n============= DFA ===========\n")
            dfa.dump(debug)
        if timings:
            timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
            timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
            timings.write("TOTAL            : %5.2f\n" % total_time)
        self.machine = dfa

    def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
        """
        Compile one (pattern, action) token definition into |machine|,
        attaching it to |initial_state|.

        The action spec is normalised to an Actions.Action: Action
        instances pass through, callables become Actions.Call, and any
        other value becomes Actions.Return. The token's final state gets
        priority -token_number so earlier definitions beat later ones.

        Re-raises any PlexError with the token number prepended.
        """
        try:
            (re, action_spec) = self.parse_token_definition(token_spec)
            # Disabled this -- matching empty strings can be useful
            #if re.nullable:
            #  raise Errors.InvalidToken(
            #    token_number, "Pattern can match 0 input symbols")
            if isinstance(action_spec, Actions.Action):
                action = action_spec
            else:
                # EAFP callable check: anything with __call__ is wrapped
                # as a Call action, everything else as a Return action.
                try:
                    action_spec.__call__
                except AttributeError:
                    action = Actions.Return(action_spec)
                else:
                    action = Actions.Call(action_spec)
            final_state = machine.new_state()
            re.build_machine(machine, initial_state, final_state,
                             match_bol=1, nocase=0)
            final_state.set_action(action, priority=-token_number)
        except Errors.PlexError as e:
            # NOTE(review): this assumes every PlexError subclass raised in
            # the try body can be constructed from a single message string,
            # yet Errors.InvalidToken is called with TWO arguments
            # (token_number, message) in __init__ above — verify against
            # Errors.py that this rewrap cannot itself raise a TypeError.
            raise e.__class__("Token number %d: %s" % (token_number, e))

    def parse_token_definition(self, token_spec):
        """
        Validate a raw token spec and return it as a (pattern, action)
        pair.

        Raises Errors.InvalidToken if the spec is not a 2-tuple or its
        pattern is not a Regexps.RE instance.
        """
        # NOTE(review): InvalidToken is invoked with a single argument
        # throughout this method, but with (token_number, message) in
        # __init__ — one of the two call styles is inconsistent with
        # Errors.InvalidToken's signature; confirm against Errors.py.
        if not isinstance(token_spec, tuple):
            raise Errors.InvalidToken("Token definition is not a tuple")
        if len(token_spec) != 2:
            raise Errors.InvalidToken("Wrong number of items in token definition")
        pattern, action = token_spec
        if not isinstance(pattern, Regexps.RE):
            raise Errors.InvalidToken("Pattern is not an RE instance")
        return (pattern, action)

    def get_initial_state(self, name):
        """Return the DFA's initial state for the scanner state |name|
        ('' is the default state)."""
        return self.machine.get_initial_state(name)