aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/tools/cython/Cython/Plex/Scanners.py
blob: 88f7e2da3ba8f5cae4a6f3d85f7d828bc732636a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
# cython: auto_pickle=False
#=======================================================================
#
#   Python Lexical Analyser
#
#
#   Scanning an input stream
#
#=======================================================================

from __future__ import absolute_import

import cython

# Declare these module-level names as generic Python objects for the Cython
# compiler (this only matters when the module is compiled; the names
# themselves are bound by the import and assignment below).
cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object)

from . import Errors
from .Regexps import BOL, EOL, EOF

# Unique sentinel used as the default for the transition lookup in
# Scanner.run_machine_inlined(), so that "no entry for this character" can be
# told apart from any value actually stored in a state.
NOT_FOUND = object()


class Scanner(object):
    """
    A Scanner is used to read tokens from a stream of characters
    using the token set specified by a Plex.Lexicon.

    Constructor:

      Scanner(lexicon, stream, name = '', initial_pos = None)

        See the docstring of the __init__ method for details.

    Methods:

      See the docstrings of the individual methods for more
      information.

      read() --> (value, text)
        Reads the next lexical token from the stream.

      position() --> (name, line, col)
        Returns the position of the last token read using the
        read() method.

      begin(state_name)
        Causes scanner to change state.

      produce(value [, text])
        Causes return of a token value to the caller of the
        Scanner.

    """

    #  lexicon = None        # Lexicon
    #  stream = None         # file-like object
    #  name = ''
    #  buffer = ''
    #  buf_start_pos = 0     # position in input of start of buffer
    #  next_pos = 0          # position in input of next char to read
    #  cur_pos = 0           # position in input of current char
    #  cur_line = 1          # line number of current char
    #  cur_line_start = 0    # position in input of start of current line
    #  start_pos = 0         # position in input of start of token
    #  start_line = 0        # line number of start of token
    #  start_col = 0         # position in line of start of token
    #  text = None           # text of last token read
    #  initial_state = None  # Node
    #  state_name = ''       # Name of initial state
    #  queue = None          # list of tokens to be returned
    #  trace = 0

    def __init__(self, lexicon, stream, name='', initial_pos=None):
        """
        Scanner(lexicon, stream, name = '', initial_pos = None)

          |lexicon| is the Plex.Lexicon instance that defines which
          tokens are recognised.

          |stream| is a file object, or any object offering a
          compatible read() method, supplying the characters to scan.

          |name| is an optional identifying string, typically the name
          of the file being scanned.

          |initial_pos| is optional.  When given, its element [1] is
          used as the starting line number and its element [2] as the
          column of the first character (the same layout as the tuple
          returned by position()).
        """
        self.trace = 0

        # Buffer window and read cursors.
        self.buffer = u''
        self.buf_start_pos = self.next_pos = self.cur_pos = 0
        self.cur_line = 1

        # Location and text of the most recently scanned token.
        self.start_pos = self.start_line = self.start_col = 0
        self.text = None
        self.state_name = None

        self.lexicon, self.stream, self.name = lexicon, stream, name
        self.queue = []
        self.initial_state = None

        # Select the default ('') lexicon state, then establish the input
        # cursor at the very beginning of the stream.
        self.begin('')
        self.next_pos = self.cur_pos = self.cur_line_start = 0
        self.cur_char = BOL
        self.input_state = 1

        if initial_pos is not None:
            line, col = initial_pos[1], initial_pos[2]
            line_start = -col
            # A negative line start makes start_col come out as col for the
            # first character (start_col = cur_pos - cur_line_start).
            self.cur_line = line
            self.cur_line_start = line_start

    def read(self):
        """
        Read the next lexical token from the stream and return a
        tuple (value, text), where |value| is the value associated with
        the token as specified by the Lexicon, and |text| is the actual
        string read from the stream. Returns (None, '') on end of file.
        """
        queue = self.queue
        while not queue:
            self.text, action = self.scan_a_token()
            if action is None:
                self.produce(None)
                self.eof()
            else:
                value = action.perform(self, self.text)
                if value is not None:
                    self.produce(value)
        result = queue[0]
        del queue[0]
        return result

    def scan_a_token(self):
        """
        Read the next input sequence recognised by the machine
        and return (text, action). Returns ('', None) on end of
        file.
        """
        self.start_pos = self.cur_pos
        self.start_line = self.cur_line
        self.start_col = self.cur_pos - self.cur_line_start
        action = self.run_machine_inlined()
        if action is not None:
            if self.trace:
                print("Scanner: read: Performing %s %d:%d" % (
                    action, self.start_pos, self.cur_pos))
            text = self.buffer[
                self.start_pos - self.buf_start_pos:
                self.cur_pos - self.buf_start_pos]
            return (text, action)
        else:
            if self.cur_pos == self.start_pos:
                if self.cur_char is EOL:
                    self.next_char()
                if self.cur_char is None or self.cur_char is EOF:
                    return (u'', None)
            raise Errors.UnrecognizedInput(self, self.state_name)

    def run_machine_inlined(self):
        """
        Inlined version of run_machine for speed.

        Runs the state machine from |self.initial_state| over the input,
        beginning at the scanner's current position, until no transition
        is available for the current input symbol.

        Returns the action of the last state passed through that had a
        non-None 'action' entry, with the scanner position restored to
        the point at which that state was entered.  Returns None if no
        such state was seen; the scanner position is then left where the
        machine became blocked.

        The scanner's position attributes are copied into locals for the
        duration of the loop and written back to |self| at the end.  The
        buffer and its start position are written back immediately
        whenever the buffer is refilled from the stream.

        |input_state| sequences the synthetic symbols around line ends
        and end of input:

          1  read the next character; a newline or end of input is
             reported as EOL first
          2  EOL was reported for a newline; report the newline itself
          3  newline consumed; bump the line counter and report BOL
          4  EOL was reported at end of input; report EOF
          5  EOF was reported; report u'' from now on
        """
        state = self.initial_state
        cur_pos = self.cur_pos
        cur_line = self.cur_line
        cur_line_start = self.cur_line_start
        cur_char = self.cur_char
        input_state = self.input_state
        next_pos = self.next_pos
        buffer = self.buffer
        buf_start_pos = self.buf_start_pos
        buf_len = len(buffer)
        # Backup slot: the most recent accepting action together with the
        # complete input position at the moment it was seen.
        b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
            None, 0, 0, 0, u'', 0, 0
        trace = self.trace
        while 1:
            if trace:  #TRACE#
                print("State %d, %d/%d:%s -->" % (  #TRACE#
                    state['number'], input_state, cur_pos, repr(cur_char)))  #TRACE#
            # Begin inlined self.save_for_backup()
            #action = state.action #@slow
            action = state['action']  #@fast
            if action is not None:
                # This state accepts: remember it so we can return here if
                # the machine later gets stuck further along the input.
                b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
                    action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
            # End inlined self.save_for_backup()
            c = cur_char
            #new_state = state.new_state(c) #@slow
            new_state = state.get(c, NOT_FOUND)  #@fast
            if new_state is NOT_FOUND:  #@fast
                # No explicit transition for c.  A falsy c (the u'' reported
                # after EOF) yields a falsy new_state and blocks the machine;
                # otherwise fall back to the state's 'else' entry, if any.
                new_state = c and state.get('else')  #@fast
            if new_state:
                if trace:  #TRACE#
                    print("State %d" % new_state['number'])  #TRACE#
                state = new_state
                # Begin inlined: self.next_char()
                if input_state == 1:
                    cur_pos = next_pos
                    # Begin inlined: c = self.read_char()
                    buf_index = next_pos - buf_start_pos
                    if buf_index < buf_len:
                        c = buffer[buf_index]
                        next_pos += 1
                    else:
                        # Buffer exhausted: discard everything before the
                        # start of the current token and append the next
                        # chunk read from the stream.
                        discard = self.start_pos - buf_start_pos
                        data = self.stream.read(0x1000)
                        buffer = self.buffer[discard:] + data
                        self.buffer = buffer
                        buf_start_pos += discard
                        self.buf_start_pos = buf_start_pos
                        buf_len = len(buffer)
                        buf_index -= discard
                        if data:
                            c = buffer[buf_index]
                            next_pos += 1
                        else:
                            # Nothing more to read: signal end of input.
                            c = u''
                    # End inlined: c = self.read_char()
                    if c == u'\n':
                        # Report EOL before the newline character itself.
                        cur_char = EOL
                        input_state = 2
                    elif not c:
                        # End of input is reported as EOL, then EOF.
                        cur_char = EOL
                        input_state = 4
                    else:
                        cur_char = c
                elif input_state == 2:
                    cur_char = u'\n'
                    input_state = 3
                elif input_state == 3:
                    # The newline has been consumed: a new line starts at
                    # the next unread position.
                    cur_line += 1
                    cur_line_start = cur_pos = next_pos
                    cur_char = BOL
                    input_state = 1
                elif input_state == 4:
                    cur_char = EOF
                    input_state = 5
                else:  # input_state = 5
                    cur_char = u''
                    # End inlined self.next_char()
            else:  # not new_state
                if trace:  #TRACE#
                    print("blocked")  #TRACE#
                # Begin inlined: action = self.back_up()
                if b_action is not None:
                    # Rewind to the last accepting state that was recorded.
                    (action, cur_pos, cur_line, cur_line_start,
                     cur_char, input_state, next_pos) = \
                        (b_action, b_cur_pos, b_cur_line, b_cur_line_start,
                         b_cur_char, b_input_state, b_next_pos)
                else:
                    action = None
                break  # while 1
                # End inlined: action = self.back_up()
        # Publish the (possibly rewound) position back onto the scanner.
        self.cur_pos = cur_pos
        self.cur_line = cur_line
        self.cur_line_start = cur_line_start
        self.cur_char = cur_char
        self.input_state = input_state
        self.next_pos = next_pos
        if trace:  #TRACE#
            if action is not None:  #TRACE#
                print("Doing %s" % action)  #TRACE#
        return action

    def next_char(self):
        input_state = self.input_state
        if self.trace:
            print("Scanner: next: %s [%d] %d" % (" " * 20, input_state, self.cur_pos))
        if input_state == 1:
            self.cur_pos = self.next_pos
            c = self.read_char()
            if c == u'\n':
                self.cur_char = EOL
                self.input_state = 2
            elif not c:
                self.cur_char = EOL
                self.input_state = 4
            else:
                self.cur_char = c
        elif input_state == 2:
            self.cur_char = u'\n'
            self.input_state = 3
        elif input_state == 3:
            self.cur_line += 1
            self.cur_line_start = self.cur_pos = self.next_pos
            self.cur_char = BOL
            self.input_state = 1
        elif input_state == 4:
            self.cur_char = EOF
            self.input_state = 5
        else:  # input_state = 5
            self.cur_char = u''
        if self.trace:
            print("--> [%d] %d %r" % (input_state, self.cur_pos, self.cur_char))

    def position(self):
        """
        Return a tuple (name, line, col) representing the location of
        the last token read using the read() method. |name| is the
        name that was provided to the Scanner constructor; |line|
        is the line number in the stream (1-based); |col| is the
        position within the line of the first character of the token
        (0-based).
        """
        return (self.name, self.start_line, self.start_col)

    def get_position(self):
        """Python accessible wrapper around position(), only for error reporting.
        """
        return self.position()

    def begin(self, state_name):
        """Set the current state of the scanner to the named state."""
        self.initial_state = (
            self.lexicon.get_initial_state(state_name))
        self.state_name = state_name

    def produce(self, value, text=None):
        """
        Called from an action procedure, causes |value| to be returned
        as the token value from read(). If |text| is supplied, it is
        returned in place of the scanned text.

        produce() can be called more than once during a single call to an action
        procedure, in which case the tokens are queued up and returned one
        at a time by subsequent calls to read(), until the queue is empty,
        whereupon scanning resumes.
        """
        if text is None:
            text = self.text
        self.queue.append((value, text))

    def eof(self):
        """
        Override this method if you want something to be done at
        end of file.
        """