author    alexv-smirnov <alex@ydb.tech>  2023-03-15 19:59:12 +0300
committer alexv-smirnov <alex@ydb.tech>  2023-03-15 19:59:12 +0300
commit    056bb284ccf8dd6793ec3a54ffa36c4fb2b9ad11 (patch)
tree      4740980126f32e3af7937ba0ca5f83e59baa4ab0 /contrib/tools/cython/Cython/Plex
parent    269126dcced1cc8b53eb4398b4a33e5142f10290 (diff)
download  ydb-056bb284ccf8dd6793ec3a54ffa36c4fb2b9ad11.tar.gz

add library/cpp/actors, ymake build to ydb oss export
Diffstat (limited to 'contrib/tools/cython/Cython/Plex')
-rw-r--r--  contrib/tools/cython/Cython/Plex/Actions.pxd      25
-rw-r--r--  contrib/tools/cython/Cython/Plex/Actions.py      110
-rw-r--r--  contrib/tools/cython/Cython/Plex/DFA.py          164
-rw-r--r--  contrib/tools/cython/Cython/Plex/Errors.py        54
-rw-r--r--  contrib/tools/cython/Cython/Plex/Lexicons.py     200
-rw-r--r--  contrib/tools/cython/Cython/Plex/Machines.py     255
-rw-r--r--  contrib/tools/cython/Cython/Plex/Regexps.py      576
-rw-r--r--  contrib/tools/cython/Cython/Plex/Scanners.pxd     50
-rw-r--r--  contrib/tools/cython/Cython/Plex/Scanners.py     338
-rw-r--r--  contrib/tools/cython/Cython/Plex/Timing.py        23
-rw-r--r--  contrib/tools/cython/Cython/Plex/Traditional.py  158
-rw-r--r--  contrib/tools/cython/Cython/Plex/Transitions.py  251
-rw-r--r--  contrib/tools/cython/Cython/Plex/__init__.py      39
13 files changed, 2243 insertions, 0 deletions
diff --git a/contrib/tools/cython/Cython/Plex/Actions.pxd b/contrib/tools/cython/Cython/Plex/Actions.pxd
new file mode 100644
index 0000000000..34660a2d9b
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Actions.pxd
@@ -0,0 +1,25 @@
+
+cdef class Action:
+ cdef perform(self, token_stream, text)
+ cpdef same_as(self, other)
+
+cdef class Return(Action):
+ cdef object value
+ cdef perform(self, token_stream, text)
+ cpdef same_as(self, other)
+
+cdef class Call(Action):
+ cdef object function
+ cdef perform(self, token_stream, text)
+ cpdef same_as(self, other)
+
+cdef class Begin(Action):
+ cdef object state_name
+ cdef perform(self, token_stream, text)
+ cpdef same_as(self, other)
+
+cdef class Ignore(Action):
+ cdef perform(self, token_stream, text)
+
+cdef class Text(Action):
+ cdef perform(self, token_stream, text)
diff --git a/contrib/tools/cython/Cython/Plex/Actions.py b/contrib/tools/cython/Cython/Plex/Actions.py
new file mode 100644
index 0000000000..c88176e716
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Actions.py
@@ -0,0 +1,110 @@
+# cython: auto_pickle=False
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+# Actions for use in token specifications
+#
+#=======================================================================
+
+class Action(object):
+ def perform(self, token_stream, text):
+ pass # abstract
+
+ def same_as(self, other):
+ return self is other
+
+
+class Return(Action):
+ """
+ Internal Plex action which causes |value| to
+ be returned as the value of the associated token
+ """
+
+ def __init__(self, value):
+ self.value = value
+
+ def perform(self, token_stream, text):
+ return self.value
+
+ def same_as(self, other):
+ return isinstance(other, Return) and self.value == other.value
+
+ def __repr__(self):
+ return "Return(%s)" % repr(self.value)
+
+
+class Call(Action):
+ """
+ Internal Plex action which causes a function to be called.
+ """
+
+ def __init__(self, function):
+ self.function = function
+
+ def perform(self, token_stream, text):
+ return self.function(token_stream, text)
+
+ def __repr__(self):
+ return "Call(%s)" % self.function.__name__
+
+ def same_as(self, other):
+ return isinstance(other, Call) and self.function is other.function
+
+
+class Begin(Action):
+ """
+ Begin(state_name) is a Plex action which causes the Scanner to
+ enter the state |state_name|. See the docstring of Plex.Lexicon
+ for more information.
+ """
+
+ def __init__(self, state_name):
+ self.state_name = state_name
+
+ def perform(self, token_stream, text):
+ token_stream.begin(self.state_name)
+
+ def __repr__(self):
+ return "Begin(%s)" % self.state_name
+
+ def same_as(self, other):
+ return isinstance(other, Begin) and self.state_name == other.state_name
+
+
+class Ignore(Action):
+ """
+ IGNORE is a Plex action which causes its associated token
+ to be ignored. See the docstring of Plex.Lexicon for more
+ information.
+ """
+
+ def perform(self, token_stream, text):
+ return None
+
+ def __repr__(self):
+ return "IGNORE"
+
+
+IGNORE = Ignore()
+#IGNORE.__doc__ = Ignore.__doc__
+
+
+class Text(Action):
+ """
+ TEXT is a Plex action which causes the text of a token to
+ be returned as the value of the token. See the docstring of
+ Plex.Lexicon for more information.
+ """
+
+ def perform(self, token_stream, text):
+ return text
+
+ def __repr__(self):
+ return "TEXT"
+
+
+TEXT = Text()
+#TEXT.__doc__ = Text.__doc__
+
+
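
The three action kinds above are normally reached through a Lexicon token
table rather than instantiated directly: a plain value is wrapped in Return,
a callable in Call, and the IGNORE/TEXT singletons are used as-is. A minimal
sketch of each (token names are illustrative; Lexicon, Scanner and the
pattern constructors appear later in this diff):

    from io import StringIO
    from Cython.Plex import Lexicon, Scanner, Str, Any, Rep1, TEXT, IGNORE

    lexicon = Lexicon([
        (Rep1(Any("0123456789")), TEXT),      # Text: token value is the matched text
        (Str("let"), 'LET'),                  # plain value: wrapped in Return('LET')
        (Str("#"), lambda s, text: None),     # callable: wrapped in Call; None produces no token
        (Rep1(Any(" \t\n")), IGNORE),         # Ignore: matched text is skipped
    ])

    scanner = Scanner(lexicon, StringIO(u"let 42"), name='example')
    print(scanner.read())   # ('LET', 'let')
    print(scanner.read())   # ('42', '42')
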
diff --git a/contrib/tools/cython/Cython/Plex/DFA.py b/contrib/tools/cython/Cython/Plex/DFA.py
new file mode 100644
index 0000000000..76324621fc
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/DFA.py
@@ -0,0 +1,164 @@
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+# Converting NFA to DFA
+#
+#=======================================================================
+
+from __future__ import absolute_import
+
+from . import Machines
+from .Machines import LOWEST_PRIORITY
+from .Transitions import TransitionMap
+
+
+def nfa_to_dfa(old_machine, debug=None):
+ """
+ Given a nondeterministic Machine, return a new equivalent
+ Machine which is deterministic.
+ """
+ # We build a new machine whose states correspond to sets of states
+ # in the old machine. Initially we add a new state corresponding to
+ # the epsilon-closure of each initial old state. Then we give transitions
+ # to each new state which are the union of all transitions out of any
+ # of the corresponding old states. The new state reached on a given
+ # character is the one corresponding to the set of states reachable
+ # on that character from any of the old states. As new combinations of
+ # old states are created, new states are added as needed until closure
+ # is reached.
+ new_machine = Machines.FastMachine()
+ state_map = StateMap(new_machine)
+ # Seed the process using the initial states of the old machine.
+ # Make the corresponding new states into initial states of the new
+ # machine with the same names.
+ for (key, old_state) in old_machine.initial_states.items():
+ new_state = state_map.old_to_new(epsilon_closure(old_state))
+ new_machine.make_initial_state(key, new_state)
+ # Tricky bit here: we add things to the end of this list while we're
+ # iterating over it. The iteration stops when closure is achieved.
+ for new_state in new_machine.states:
+ transitions = TransitionMap()
+ for old_state in state_map.new_to_old(new_state):
+ for event, old_target_states in old_state.transitions.items():
+ if event and old_target_states:
+ transitions.add_set(event, set_epsilon_closure(old_target_states))
+ for event, old_states in transitions.items():
+ new_machine.add_transitions(new_state, event, state_map.old_to_new(old_states))
+ if debug:
+ debug.write("\n===== State Mapping =====\n")
+ state_map.dump(debug)
+ return new_machine
+
+
+def set_epsilon_closure(state_set):
+ """
+ Given a set of states, return the union of the epsilon
+ closures of its member states.
+ """
+ result = {}
+ for state1 in state_set:
+ for state2 in epsilon_closure(state1):
+ result[state2] = 1
+ return result
+
+
+def epsilon_closure(state):
+ """
+ Return the set of states reachable from the given state
+ by epsilon moves.
+ """
+ # Cache the result
+ result = state.epsilon_closure
+ if result is None:
+ result = {}
+ state.epsilon_closure = result
+ add_to_epsilon_closure(result, state)
+ return result
+
+
+def add_to_epsilon_closure(state_set, state):
+ """
+ Recursively add to |state_set| states reachable from the given state
+ by epsilon moves.
+ """
+ if not state_set.get(state, 0):
+ state_set[state] = 1
+ state_set_2 = state.transitions.get_epsilon()
+ if state_set_2:
+ for state2 in state_set_2:
+ add_to_epsilon_closure(state_set, state2)
+
+
+class StateMap(object):
+ """
+ Helper class used by nfa_to_dfa() to map back and forth between
+ sets of states from the old machine and states of the new machine.
+ """
+ new_machine = None # Machine
+ old_to_new_dict = None # {(old_state,...) : new_state}
+ new_to_old_dict = None # {id(new_state) : old_state_set}
+
+ def __init__(self, new_machine):
+ self.new_machine = new_machine
+ self.old_to_new_dict = {}
+ self.new_to_old_dict = {}
+
+ def old_to_new(self, old_state_set):
+ """
+ Return the state of the new machine corresponding to the
+ set of old machine states represented by |state_set|. A new
+ state will be created if necessary. If any of the old states
+ are accepting states, the new state will be an accepting state
+ with the highest priority action from the old states.
+ """
+ key = self.make_key(old_state_set)
+ new_state = self.old_to_new_dict.get(key, None)
+ if not new_state:
+ action = self.highest_priority_action(old_state_set)
+ new_state = self.new_machine.new_state(action)
+ self.old_to_new_dict[key] = new_state
+ self.new_to_old_dict[id(new_state)] = old_state_set
+ #for old_state in old_state_set.keys():
+ #new_state.merge_actions(old_state)
+ return new_state
+
+ def highest_priority_action(self, state_set):
+ best_action = None
+ best_priority = LOWEST_PRIORITY
+ for state in state_set:
+ priority = state.action_priority
+ if priority > best_priority:
+ best_action = state.action
+ best_priority = priority
+ return best_action
+
+ # def old_to_new_set(self, old_state_set):
+ # """
+ # Return the new state corresponding to a set of old states as
+ # a singleton set.
+ # """
+ # return {self.old_to_new(old_state_set):1}
+
+ def new_to_old(self, new_state):
+ """Given a new state, return a set of corresponding old states."""
+ return self.new_to_old_dict[id(new_state)]
+
+ def make_key(self, state_set):
+ """
+ Convert a set of states into a uniquified
+ sorted tuple suitable for use as a dictionary key.
+ """
+ lst = list(state_set)
+ lst.sort()
+ return tuple(lst)
+
+ def dump(self, file):
+ from .Transitions import state_set_str
+
+ for new_state in self.new_machine.states:
+ old_state_set = self.new_to_old_dict[id(new_state)]
+ file.write(" State %s <-- %s\n" % (
+ new_state['number'], state_set_str(old_state_set)))
+
+
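
nfa_to_dfa() is normally driven by Lexicon, but it can also be applied to a
hand-built machine. A sketch of the subset construction for the two tokens
"a" and "ab" (state layout and token values are illustrative):

    import sys
    from Cython.Plex import Actions, DFA, Machines

    nfa = Machines.Machine()
    start = nfa.new_initial_state('')
    s1 = nfa.new_state()
    s2 = nfa.new_state()
    start.add_transition((ord('a'), ord('a') + 1), s1)
    s1.set_action(Actions.Return('A'), priority=1)    # accept "a"
    s1.add_transition((ord('b'), ord('b') + 1), s2)
    s2.set_action(Actions.Return('AB'), priority=2)   # accept "ab"

    # Each state of the resulting FastMachine corresponds to an
    # epsilon-closed set of NFA states; passing debug dumps the mapping.
    dfa = DFA.nfa_to_dfa(nfa, debug=sys.stdout)
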
diff --git a/contrib/tools/cython/Cython/Plex/Errors.py b/contrib/tools/cython/Cython/Plex/Errors.py
new file mode 100644
index 0000000000..f460100d77
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Errors.py
@@ -0,0 +1,54 @@
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+# Exception classes
+#
+#=======================================================================
+
+
+class PlexError(Exception):
+ message = ""
+
+
+class PlexTypeError(PlexError, TypeError):
+ pass
+
+
+class PlexValueError(PlexError, ValueError):
+ pass
+
+
+class InvalidRegex(PlexError):
+ pass
+
+
+class InvalidToken(PlexError):
+ def __init__(self, token_number, message):
+ PlexError.__init__(self, "Token number %d: %s" % (token_number, message))
+
+
+class InvalidScanner(PlexError):
+ pass
+
+
+class AmbiguousAction(PlexError):
+ message = "Two tokens with different actions can match the same string"
+
+ def __init__(self):
+ pass
+
+
+class UnrecognizedInput(PlexError):
+ scanner = None
+ position = None
+ state_name = None
+
+ def __init__(self, scanner, state_name):
+ self.scanner = scanner
+ self.position = scanner.get_position()
+ self.state_name = state_name
+
+ def __str__(self):
+ return ("'%s', line %d, char %d: Token not recognised in state %r" % (
+ self.position + (self.state_name,)))
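
UnrecognizedInput is the only exception here that carries scanner state; it
is raised during read() when no pattern of the current state matches. A
self-contained sketch (the lexicon is illustrative):

    from io import StringIO
    from Cython.Plex import Any, Errors, IGNORE, Lexicon, Rep1, Scanner, TEXT

    lexicon = Lexicon([
        (Rep1(Any("abcdefghijklmnopqrstuvwxyz")), TEXT),
        (Rep1(Any(" \t\n")), IGNORE),
    ])
    scanner = Scanner(lexicon, StringIO(u"hi ?"), name='bad.txt')
    try:
        while scanner.read()[0] is not None:
            pass
    except Errors.UnrecognizedInput as e:
        print(e)   # 'bad.txt', line 1, char 3: Token not recognised in state ''
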
diff --git a/contrib/tools/cython/Cython/Plex/Lexicons.py b/contrib/tools/cython/Cython/Plex/Lexicons.py
new file mode 100644
index 0000000000..787f5854b8
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Lexicons.py
@@ -0,0 +1,200 @@
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+# Lexical Analyser Specification
+#
+#=======================================================================
+
+from __future__ import absolute_import
+
+import types
+
+from . import Actions
+from . import DFA
+from . import Errors
+from . import Machines
+from . import Regexps
+
+# debug_flags for Lexicon constructor
+DUMP_NFA = 1
+DUMP_DFA = 2
+
+
+class State(object):
+ """
+ This class is used as part of a Plex.Lexicon specification to
+ introduce a user-defined state.
+
+ Constructor:
+
+ State(name, token_specifications)
+ """
+
+ name = None
+ tokens = None
+
+ def __init__(self, name, tokens):
+ self.name = name
+ self.tokens = tokens
+
+
+class Lexicon(object):
+ """
+ Lexicon(specification) builds a lexical analyser from the given
+ |specification|. The specification consists of a list of
+ specification items. Each specification item may be either:
+
+ 1) A token definition, which is a tuple:
+
+ (pattern, action)
+
+ The |pattern| is a regular expression built using the
+ constructors defined in the Plex module.
+
+ The |action| is the action to be performed when this pattern
+ is recognised (see below).
+
+ 2) A state definition:
+
+ State(name, tokens)
+
+ where |name| is a character string naming the state,
+ and |tokens| is a list of token definitions as
+ above. The meaning and usage of states is described
+ below.
+
+ Actions
+ -------
+
+ The |action| in a token specification may be one of three things:
+
+ 1) A function, which is called as follows:
+
+ function(scanner, text)
+
+ where |scanner| is the relevant Scanner instance, and |text|
+ is the matched text. If the function returns anything
+ other than None, that value is returned as the value of the
+ token. If it returns None, scanning continues as if the IGNORE
+ action were specified (see below).
+
+ 2) One of the following special actions:
+
+ IGNORE means that the recognised characters will be treated as
+ white space and ignored. Scanning continues, and nothing is
+ returned, until the next non-ignored token is recognised.
+
+ TEXT causes the scanned text itself to be returned as the
+ value of the token.
+
+ 3) Any other value, which is returned as the value of the token.
+
+ States
+ ------
+
+ At any given time, the scanner is in one of a number of states.
+ Associated with each state is a set of possible tokens. When scanning,
+ only tokens associated with the current state are recognised.
+
+ There is a default state, whose name is the empty string. Token
+ definitions which are not inside any State definition belong to
+ the default state.
+
+ The initial state of the scanner is the default state. The state can
+ be changed in one of two ways:
+
+ 1) Using Begin(state_name) as the action of a token.
+
+ 2) Calling the begin(state_name) method of the Scanner.
+
+ To change back to the default state, use '' as the state name.
+ """
+
+ machine = None # Machine
+ tables = None # StateTableMachine
+
+ def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
+ if not isinstance(specifications, list):
+ raise Errors.InvalidScanner("Scanner definition is not a list")
+ if timings:
+ from .Timing import time
+
+ total_time = 0.0
+ time1 = time()
+ nfa = Machines.Machine()
+ default_initial_state = nfa.new_initial_state('')
+ token_number = 1
+ for spec in specifications:
+ if isinstance(spec, State):
+ user_initial_state = nfa.new_initial_state(spec.name)
+ for token in spec.tokens:
+ self.add_token_to_machine(
+ nfa, user_initial_state, token, token_number)
+ token_number += 1
+ elif isinstance(spec, tuple):
+ self.add_token_to_machine(
+ nfa, default_initial_state, spec, token_number)
+ token_number += 1
+ else:
+ raise Errors.InvalidToken(
+ token_number,
+ "Expected a token definition (tuple) or State instance")
+ if timings:
+ time2 = time()
+ total_time = total_time + (time2 - time1)
+ time3 = time()
+ if debug and (debug_flags & 1):
+ debug.write("\n============= NFA ===========\n")
+ nfa.dump(debug)
+ dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug)
+ if timings:
+ time4 = time()
+ total_time = total_time + (time4 - time3)
+ if debug and (debug_flags & 2):
+ debug.write("\n============= DFA ===========\n")
+ dfa.dump(debug)
+ if timings:
+ timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
+ timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
+ timings.write("TOTAL : %5.2f\n" % total_time)
+ self.machine = dfa
+
+ def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
+ try:
+ (re, action_spec) = self.parse_token_definition(token_spec)
+ # Disabled this -- matching empty strings can be useful
+ #if re.nullable:
+ # raise Errors.InvalidToken(
+ # token_number, "Pattern can match 0 input symbols")
+ if isinstance(action_spec, Actions.Action):
+ action = action_spec
+ else:
+ try:
+ action_spec.__call__
+ except AttributeError:
+ action = Actions.Return(action_spec)
+ else:
+ action = Actions.Call(action_spec)
+ final_state = machine.new_state()
+ re.build_machine(machine, initial_state, final_state,
+ match_bol=1, nocase=0)
+ final_state.set_action(action, priority=-token_number)
+ except Errors.PlexError as e:
+ raise e.__class__("Token number %d: %s" % (token_number, e))
+
+ def parse_token_definition(self, token_spec):
+ if not isinstance(token_spec, tuple):
+ raise Errors.InvalidToken("Token definition is not a tuple")
+ if len(token_spec) != 2:
+ raise Errors.InvalidToken("Wrong number of items in token definition")
+ pattern, action = token_spec
+ if not isinstance(pattern, Regexps.RE):
+ raise Errors.InvalidToken("Pattern is not an RE instance")
+ return (pattern, action)
+
+ def get_initial_state(self, name):
+ return self.machine.get_initial_state(name)
+
+
+
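
A compact sketch tying the docstring together -- token tuples, the action
kinds, and a user-defined state entered via Begin (identifiers and token
values are illustrative, not part of Plex):

    from io import StringIO
    from Cython.Plex import (Lexicon, Scanner, State, Str, Any, AnyBut,
                             Rep1, Begin, TEXT, IGNORE)

    letters = "abcdefghijklmnopqrstuvwxyz"
    lexicon = Lexicon([
        (Rep1(Any(letters)), TEXT),        # default state: bare words
        (Str('"'), Begin('string')),       # enter the 'string' state
        (Rep1(Any(" \t\n")), IGNORE),
        State('string', [
            (Str('"'), Begin('')),         # back to the default state
            (Rep1(AnyBut('"')), 'STR'),    # constant token value
        ]),
    ])

    scanner = Scanner(lexicon, StringIO(u'hi "there"'))
    while True:
        value, text = scanner.read()
        if value is None:
            break
        print(value, repr(text))
    # prints: hi 'hi'  then  STR 'there'
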
diff --git a/contrib/tools/cython/Cython/Plex/Machines.py b/contrib/tools/cython/Cython/Plex/Machines.py
new file mode 100644
index 0000000000..398850976b
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Machines.py
@@ -0,0 +1,255 @@
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+# Classes for building NFAs and DFAs
+#
+#=======================================================================
+
+from __future__ import absolute_import
+
+import sys
+
+from .Transitions import TransitionMap
+
+try:
+ from sys import maxsize as maxint
+except ImportError:
+ from sys import maxint
+
+try:
+ unichr
+except NameError:
+ unichr = chr
+
+LOWEST_PRIORITY = -maxint
+
+
+class Machine(object):
+ """A collection of Nodes representing an NFA or DFA."""
+ states = None # [Node]
+ next_state_number = 1
+ initial_states = None # {(name, bol): Node}
+
+ def __init__(self):
+ self.states = []
+ self.initial_states = {}
+
+ def __del__(self):
+ #print "Destroying", self ###
+ for state in self.states:
+ state.destroy()
+
+ def new_state(self):
+ """Add a new state to the machine and return it."""
+ s = Node()
+ n = self.next_state_number
+ self.next_state_number = n + 1
+ s.number = n
+ self.states.append(s)
+ return s
+
+ def new_initial_state(self, name):
+ state = self.new_state()
+ self.make_initial_state(name, state)
+ return state
+
+ def make_initial_state(self, name, state):
+ self.initial_states[name] = state
+
+ def get_initial_state(self, name):
+ return self.initial_states[name]
+
+ def dump(self, file):
+ file.write("Plex.Machine:\n")
+ if self.initial_states is not None:
+ file.write(" Initial states:\n")
+ for (name, state) in sorted(self.initial_states.items()):
+ file.write(" '%s': %d\n" % (name, state.number))
+ for s in self.states:
+ s.dump(file)
+
+
+class Node(object):
+ """A state of an NFA or DFA."""
+ transitions = None # TransitionMap
+ action = None # Action
+ action_priority = None # integer
+ number = 0 # for debug output
+ epsilon_closure = None # used by nfa_to_dfa()
+
+ def __init__(self):
+ # Preinitialise the list of empty transitions, because
+ # the nfa-to-dfa algorithm needs it
+ #self.transitions = {'':[]}
+ self.transitions = TransitionMap()
+ self.action_priority = LOWEST_PRIORITY
+
+ def destroy(self):
+ #print "Destroying", self ###
+ self.transitions = None
+ self.action = None
+ self.epsilon_closure = None
+
+ def add_transition(self, event, new_state):
+ self.transitions.add(event, new_state)
+
+ def link_to(self, state):
+ """Add an epsilon-move from this state to another state."""
+ self.add_transition('', state)
+
+ def set_action(self, action, priority):
+ """Make this an accepting state with the given action. If
+ there is already an action, choose the action with highest
+ priority."""
+ if priority > self.action_priority:
+ self.action = action
+ self.action_priority = priority
+
+ def get_action(self):
+ return self.action
+
+ def get_action_priority(self):
+ return self.action_priority
+
+ def is_accepting(self):
+ return self.action is not None
+
+ def __str__(self):
+ return "State %d" % self.number
+
+ def dump(self, file):
+ # Header
+ file.write(" State %d:\n" % self.number)
+ # Transitions
+ # self.dump_transitions(file)
+ self.transitions.dump(file)
+ # Action
+ action = self.action
+ priority = self.action_priority
+ if action is not None:
+ file.write(" %s [priority %d]\n" % (action, priority))
+
+ def __lt__(self, other):
+ return self.number < other.number
+
+
+class FastMachine(object):
+ """
+ FastMachine is a deterministic machine represented in a way that
+ allows fast scanning.
+ """
+ initial_states = None # {state_name:state}
+ states = None # [state] where state = {event:state, 'else':state, 'action':Action}
+ next_number = 1 # for debugging
+
+ new_state_template = {
+ '': None, 'bol': None, 'eol': None, 'eof': None, 'else': None
+ }
+
+ def __init__(self):
+ self.initial_states = {}
+ self.states = []
+
+ def __del__(self):
+ for state in self.states:
+ state.clear()
+
+ def new_state(self, action=None):
+ number = self.next_number
+ self.next_number = number + 1
+ result = self.new_state_template.copy()
+ result['number'] = number
+ result['action'] = action
+ self.states.append(result)
+ return result
+
+ def make_initial_state(self, name, state):
+ self.initial_states[name] = state
+
+ def add_transitions(self, state, event, new_state, maxint=maxint):
+ if type(event) is tuple:
+ code0, code1 = event
+ if code0 == -maxint:
+ state['else'] = new_state
+ elif code1 != maxint:
+ while code0 < code1:
+ state[unichr(code0)] = new_state
+ code0 += 1
+ else:
+ state[event] = new_state
+
+ def get_initial_state(self, name):
+ return self.initial_states[name]
+
+ def dump(self, file):
+ file.write("Plex.FastMachine:\n")
+ file.write(" Initial states:\n")
+ for name, state in sorted(self.initial_states.items()):
+ file.write(" %s: %s\n" % (repr(name), state['number']))
+ for state in self.states:
+ self.dump_state(state, file)
+
+ def dump_state(self, state, file):
+ # Header
+ file.write(" State %d:\n" % state['number'])
+ # Transitions
+ self.dump_transitions(state, file)
+ # Action
+ action = state['action']
+ if action is not None:
+ file.write(" %s\n" % action)
+
+ def dump_transitions(self, state, file):
+ chars_leading_to_state = {}
+ special_to_state = {}
+ for (c, s) in state.items():
+ if len(c) == 1:
+ chars = chars_leading_to_state.get(id(s), None)
+ if chars is None:
+ chars = []
+ chars_leading_to_state[id(s)] = chars
+ chars.append(c)
+ elif len(c) <= 4:
+ special_to_state[c] = s
+ ranges_to_state = {}
+ for state in self.states:
+ char_list = chars_leading_to_state.get(id(state), None)
+ if char_list:
+ ranges = self.chars_to_ranges(char_list)
+ ranges_to_state[ranges] = state
+ ranges_list = list(ranges_to_state.keys())  # dict views have no sort() on Python 3
+ ranges_list.sort()
+ for ranges in ranges_list:
+ key = self.ranges_to_string(ranges)
+ state = ranges_to_state[ranges]
+ file.write(" %s --> State %d\n" % (key, state['number']))
+ for key in ('bol', 'eol', 'eof', 'else'):
+ state = special_to_state.get(key, None)
+ if state:
+ file.write(" %s --> State %d\n" % (key, state['number']))
+
+ def chars_to_ranges(self, char_list):
+ char_list.sort()
+ i = 0
+ n = len(char_list)
+ result = []
+ while i < n:
+ c1 = ord(char_list[i])
+ c2 = c1
+ i += 1
+ while i < n and ord(char_list[i]) == c2 + 1:
+ i += 1
+ c2 += 1
+ result.append((chr(c1), chr(c2)))
+ return tuple(result)
+
+ def ranges_to_string(self, range_list):
+ return ','.join(map(self.range_to_string, range_list))
+
+ def range_to_string(self, range_tuple):
+ (c1, c2) = range_tuple
+ if c1 == c2:
+ return repr(c1)
+ else:
+ return "%s..%s" % (repr(c1), repr(c2))
diff --git a/contrib/tools/cython/Cython/Plex/Regexps.py b/contrib/tools/cython/Cython/Plex/Regexps.py
new file mode 100644
index 0000000000..41816c939a
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Regexps.py
@@ -0,0 +1,576 @@
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+# Regular Expressions
+#
+#=======================================================================
+
+from __future__ import absolute_import
+
+import types
+try:
+ from sys import maxsize as maxint
+except ImportError:
+ from sys import maxint
+
+from . import Errors
+
+#
+# Constants
+#
+
+BOL = 'bol'
+EOL = 'eol'
+EOF = 'eof'
+
+nl_code = ord('\n')
+
+
+#
+# Helper functions
+#
+
+def chars_to_ranges(s):
+ """
+ Return a list of character codes consisting of pairs
+ [code1a, code1b, code2a, code2b,...] which cover all
+ the characters in |s|.
+ """
+ char_list = list(s)
+ char_list.sort()
+ i = 0
+ n = len(char_list)
+ result = []
+ while i < n:
+ code1 = ord(char_list[i])
+ code2 = code1 + 1
+ i += 1
+ while i < n and code2 >= ord(char_list[i]):
+ code2 += 1
+ i += 1
+ result.append(code1)
+ result.append(code2)
+ return result
+
+
+def uppercase_range(code1, code2):
+ """
+ If the range of characters from code1 to code2-1 includes any
+ lower case letters, return the corresponding upper case range.
+ """
+ code3 = max(code1, ord('a'))
+ code4 = min(code2, ord('z') + 1)
+ if code3 < code4:
+ d = ord('A') - ord('a')
+ return (code3 + d, code4 + d)
+ else:
+ return None
+
+
+def lowercase_range(code1, code2):
+ """
+ If the range of characters from code1 to code2-1 includes any
+ upper case letters, return the corresponding lower case range.
+ """
+ code3 = max(code1, ord('A'))
+ code4 = min(code2, ord('Z') + 1)
+ if code3 < code4:
+ d = ord('a') - ord('A')
+ return (code3 + d, code4 + d)
+ else:
+ return None
+
+
+def CodeRanges(code_list):
+ """
+ Given a list of codes as returned by chars_to_ranges, return
+ an RE which will match a character in any of the ranges.
+ """
+ re_list = [CodeRange(code_list[i], code_list[i + 1]) for i in range(0, len(code_list), 2)]
+ return Alt(*re_list)
+
+
+def CodeRange(code1, code2):
+ """
+ CodeRange(code1, code2) is an RE which matches any character
+ with a code |c| in the range |code1| <= |c| < |code2|.
+ """
+ if code1 <= nl_code < code2:
+ return Alt(RawCodeRange(code1, nl_code),
+ RawNewline,
+ RawCodeRange(nl_code + 1, code2))
+ else:
+ return RawCodeRange(code1, code2)
+
+
+#
+# Abstract classes
+#
+
+class RE(object):
+ """RE is the base class for regular expression constructors.
+ The following operators are defined on REs:
+
+ re1 + re2 is an RE which matches |re1| followed by |re2|
+ re1 | re2 is an RE which matches either |re1| or |re2|
+ """
+
+ nullable = 1 # True if this RE can match 0 input symbols
+ match_nl = 1 # True if this RE can match a string ending with '\n'
+ str = None # Set to a string to override the class's __str__ result
+
+ def build_machine(self, machine, initial_state, final_state,
+ match_bol, nocase):
+ """
+ This method should add states to |machine| to implement this
+ RE, starting at |initial_state| and ending at |final_state|.
+ If |match_bol| is true, the RE must be able to match at the
+ beginning of a line. If nocase is true, upper and lower case
+ letters should be treated as equivalent.
+ """
+ raise NotImplementedError("%s.build_machine not implemented" %
+ self.__class__.__name__)
+
+ def build_opt(self, m, initial_state, c):
+ """
+ Given a state |initial_state| of machine |m|, return a new state
+ reachable from it on character |c| or epsilon.
+ """
+ s = m.new_state()
+ initial_state.link_to(s)
+ initial_state.add_transition(c, s)
+ return s
+
+ def __add__(self, other):
+ return Seq(self, other)
+
+ def __or__(self, other):
+ return Alt(self, other)
+
+ def __str__(self):
+ if self.str:
+ return self.str
+ else:
+ return self.calc_str()
+
+ def check_re(self, num, value):
+ if not isinstance(value, RE):
+ self.wrong_type(num, value, "Plex.RE instance")
+
+ def check_string(self, num, value):
+ if type(value) != type(''):
+ self.wrong_type(num, value, "string")
+
+ def check_char(self, num, value):
+ self.check_string(num, value)
+ if len(value) != 1:
+ raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s. "
+ "Expected a string of length 1, got: %s" % (
+ num, self.__class__.__name__, repr(value)))
+
+ def wrong_type(self, num, value, expected):
+ if hasattr(types, 'InstanceType') and type(value) == types.InstanceType:  # old-style instances, Python 2 only
+ got = "%s.%s instance" % (
+ value.__class__.__module__, value.__class__.__name__)
+ else:
+ got = type(value).__name__
+ raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "
+ "(expected %s, got %s" % (
+ num, self.__class__.__name__, expected, got))
+
+#
+# Primitive RE constructors
+# -------------------------
+#
+# These are the basic REs from which all others are built.
+#
+
+## class Char(RE):
+## """
+## Char(c) is an RE which matches the character |c|.
+## """
+
+## nullable = 0
+
+## def __init__(self, char):
+## self.char = char
+## self.match_nl = char == '\n'
+
+## def build_machine(self, m, initial_state, final_state, match_bol, nocase):
+## c = self.char
+## if match_bol and c != BOL:
+## s1 = self.build_opt(m, initial_state, BOL)
+## else:
+## s1 = initial_state
+## if c == '\n' or c == EOF:
+## s1 = self.build_opt(m, s1, EOL)
+## if len(c) == 1:
+## code = ord(self.char)
+## s1.add_transition((code, code+1), final_state)
+## if nocase and is_letter_code(code):
+## code2 = other_case_code(code)
+## s1.add_transition((code2, code2+1), final_state)
+## else:
+## s1.add_transition(c, final_state)
+
+## def calc_str(self):
+## return "Char(%s)" % repr(self.char)
+
+
+def Char(c):
+ """
+ Char(c) is an RE which matches the character |c|.
+ """
+ if len(c) == 1:
+ result = CodeRange(ord(c), ord(c) + 1)
+ else:
+ result = SpecialSymbol(c)
+ result.str = "Char(%s)" % repr(c)
+ return result
+
+
+class RawCodeRange(RE):
+ """
+ RawCodeRange(code1, code2) is a low-level RE which matches any character
+ with a code |c| in the range |code1| <= |c| < |code2|, where the range
+ does not include newline. For internal use only.
+ """
+ nullable = 0
+ match_nl = 0
+ range = None # (code, code)
+ uppercase_range = None # (code, code) or None
+ lowercase_range = None # (code, code) or None
+
+ def __init__(self, code1, code2):
+ self.range = (code1, code2)
+ self.uppercase_range = uppercase_range(code1, code2)
+ self.lowercase_range = lowercase_range(code1, code2)
+
+ def build_machine(self, m, initial_state, final_state, match_bol, nocase):
+ if match_bol:
+ initial_state = self.build_opt(m, initial_state, BOL)
+ initial_state.add_transition(self.range, final_state)
+ if nocase:
+ if self.uppercase_range:
+ initial_state.add_transition(self.uppercase_range, final_state)
+ if self.lowercase_range:
+ initial_state.add_transition(self.lowercase_range, final_state)
+
+ def calc_str(self):
+ return "CodeRange(%d,%d)" % (self.code1, self.code2)
+
+
+class _RawNewline(RE):
+ """
+ RawNewline is a low-level RE which matches a newline character.
+ For internal use only.
+ """
+ nullable = 0
+ match_nl = 1
+
+ def build_machine(self, m, initial_state, final_state, match_bol, nocase):
+ if match_bol:
+ initial_state = self.build_opt(m, initial_state, BOL)
+ s = self.build_opt(m, initial_state, EOL)
+ s.add_transition((nl_code, nl_code + 1), final_state)
+
+
+RawNewline = _RawNewline()
+
+
+class SpecialSymbol(RE):
+ """
+ SpecialSymbol(sym) is an RE which matches the special input
+ symbol |sym|, which is one of BOL, EOL or EOF.
+ """
+ nullable = 0
+ match_nl = 0
+ sym = None
+
+ def __init__(self, sym):
+ self.sym = sym
+
+ def build_machine(self, m, initial_state, final_state, match_bol, nocase):
+ # Sequences 'bol bol' and 'bol eof' are impossible, so only need
+ # to allow for bol if sym is eol
+ if match_bol and self.sym == EOL:
+ initial_state = self.build_opt(m, initial_state, BOL)
+ initial_state.add_transition(self.sym, final_state)
+
+
+class Seq(RE):
+ """Seq(re1, re2, re3...) is an RE which matches |re1| followed by
+ |re2| followed by |re3|..."""
+
+ def __init__(self, *re_list):
+ nullable = 1
+ for i, re in enumerate(re_list):
+ self.check_re(i, re)
+ nullable = nullable and re.nullable
+ self.re_list = re_list
+ self.nullable = nullable
+ i = len(re_list)
+ match_nl = 0
+ while i:
+ i -= 1
+ re = re_list[i]
+ if re.match_nl:
+ match_nl = 1
+ break
+ if not re.nullable:
+ break
+ self.match_nl = match_nl
+
+ def build_machine(self, m, initial_state, final_state, match_bol, nocase):
+ re_list = self.re_list
+ if len(re_list) == 0:
+ initial_state.link_to(final_state)
+ else:
+ s1 = initial_state
+ n = len(re_list)
+ for i, re in enumerate(re_list):
+ if i < n - 1:
+ s2 = m.new_state()
+ else:
+ s2 = final_state
+ re.build_machine(m, s1, s2, match_bol, nocase)
+ s1 = s2
+ match_bol = re.match_nl or (match_bol and re.nullable)
+
+ def calc_str(self):
+ return "Seq(%s)" % ','.join(map(str, self.re_list))
+
+
+class Alt(RE):
+ """Alt(re1, re2, re3...) is an RE which matches either |re1| or
+ |re2| or |re3|..."""
+
+ def __init__(self, *re_list):
+ self.re_list = re_list
+ nullable = 0
+ match_nl = 0
+ nullable_res = []
+ non_nullable_res = []
+ i = 1
+ for re in re_list:
+ self.check_re(i, re)
+ if re.nullable:
+ nullable_res.append(re)
+ nullable = 1
+ else:
+ non_nullable_res.append(re)
+ if re.match_nl:
+ match_nl = 1
+ i += 1
+ self.nullable_res = nullable_res
+ self.non_nullable_res = non_nullable_res
+ self.nullable = nullable
+ self.match_nl = match_nl
+
+ def build_machine(self, m, initial_state, final_state, match_bol, nocase):
+ for re in self.nullable_res:
+ re.build_machine(m, initial_state, final_state, match_bol, nocase)
+ if self.non_nullable_res:
+ if match_bol:
+ initial_state = self.build_opt(m, initial_state, BOL)
+ for re in self.non_nullable_res:
+ re.build_machine(m, initial_state, final_state, 0, nocase)
+
+ def calc_str(self):
+ return "Alt(%s)" % ','.join(map(str, self.re_list))
+
+
+class Rep1(RE):
+ """Rep1(re) is an RE which matches one or more repetitions of |re|."""
+
+ def __init__(self, re):
+ self.check_re(1, re)
+ self.re = re
+ self.nullable = re.nullable
+ self.match_nl = re.match_nl
+
+ def build_machine(self, m, initial_state, final_state, match_bol, nocase):
+ s1 = m.new_state()
+ s2 = m.new_state()
+ initial_state.link_to(s1)
+ self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase)
+ s2.link_to(s1)
+ s2.link_to(final_state)
+
+ def calc_str(self):
+ return "Rep1(%s)" % self.re
+
+
+class SwitchCase(RE):
+ """
+ SwitchCase(re, nocase) is an RE which matches the same strings as |re|,
+ but treating upper and lower case letters according to |nocase|. If
+ |nocase| is true, case is ignored, otherwise it is not.
+ """
+ re = None
+ nocase = None
+
+ def __init__(self, re, nocase):
+ self.re = re
+ self.nocase = nocase
+ self.nullable = re.nullable
+ self.match_nl = re.match_nl
+
+ def build_machine(self, m, initial_state, final_state, match_bol, nocase):
+ self.re.build_machine(m, initial_state, final_state, match_bol,
+ self.nocase)
+
+ def calc_str(self):
+ if self.nocase:
+ name = "NoCase"
+ else:
+ name = "Case"
+ return "%s(%s)" % (name, self.re)
+
+#
+# Composite RE constructors
+# -------------------------
+#
+# These REs are defined in terms of the primitive REs.
+#
+
+Empty = Seq()
+Empty.__doc__ = \
+ """
+ Empty is an RE which matches the empty string.
+ """
+Empty.str = "Empty"
+
+
+def Str1(s):
+ """
+ Str1(s) is an RE which matches the literal string |s|.
+ """
+ result = Seq(*tuple(map(Char, s)))
+ result.str = "Str(%s)" % repr(s)
+ return result
+
+
+def Str(*strs):
+ """
+ Str(s) is an RE which matches the literal string |s|.
+ Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|...
+ """
+ if len(strs) == 1:
+ return Str1(strs[0])
+ else:
+ result = Alt(*tuple(map(Str1, strs)))
+ result.str = "Str(%s)" % ','.join(map(repr, strs))
+ return result
+
+
+def Any(s):
+ """
+ Any(s) is an RE which matches any character in the string |s|.
+ """
+ #result = apply(Alt, tuple(map(Char, s)))
+ result = CodeRanges(chars_to_ranges(s))
+ result.str = "Any(%s)" % repr(s)
+ return result
+
+
+def AnyBut(s):
+ """
+ AnyBut(s) is an RE which matches any character (including
+ newline) which is not in the string |s|.
+ """
+ ranges = chars_to_ranges(s)
+ ranges.insert(0, -maxint)
+ ranges.append(maxint)
+ result = CodeRanges(ranges)
+ result.str = "AnyBut(%s)" % repr(s)
+ return result
+
+
+AnyChar = AnyBut("")
+AnyChar.__doc__ = \
+ """
+ AnyChar is an RE which matches any single character (including a newline).
+ """
+AnyChar.str = "AnyChar"
+
+
+def Range(s1, s2=None):
+ """
+ Range(c1, c2) is an RE which matches any single character in the range
+ |c1| to |c2| inclusive.
+ Range(s) where |s| is a string of even length is an RE which matches
+ any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,...
+ """
+ if s2:
+ result = CodeRange(ord(s1), ord(s2) + 1)
+ result.str = "Range(%s,%s)" % (s1, s2)
+ else:
+ ranges = []
+ for i in range(0, len(s1), 2):
+ ranges.append(CodeRange(ord(s1[i]), ord(s1[i + 1]) + 1))
+ result = Alt(*ranges)
+ result.str = "Range(%s)" % repr(s1)
+ return result
+
+
+def Opt(re):
+ """
+ Opt(re) is an RE which matches either |re| or the empty string.
+ """
+ result = Alt(re, Empty)
+ result.str = "Opt(%s)" % re
+ return result
+
+
+def Rep(re):
+ """
+ Rep(re) is an RE which matches zero or more repetitions of |re|.
+ """
+ result = Opt(Rep1(re))
+ result.str = "Rep(%s)" % re
+ return result
+
+
+def NoCase(re):
+ """
+ NoCase(re) is an RE which matches the same strings as |re|, but treating
+ upper and lower case letters as equivalent.
+ """
+ return SwitchCase(re, nocase=1)
+
+
+def Case(re):
+ """
+ Case(re) is an RE which matches the same strings as |re|, but treating
+ upper and lower case letters as distinct, i.e. it cancels the effect
+ of any enclosing NoCase().
+ """
+ return SwitchCase(re, nocase=0)
+
+#
+# RE Constants
+#
+
+Bol = Char(BOL)
+Bol.__doc__ = \
+ """
+ Bol is an RE which matches the beginning of a line.
+ """
+Bol.str = "Bol"
+
+Eol = Char(EOL)
+Eol.__doc__ = \
+ """
+ Eol is an RE which matches the end of a line.
+ """
+Eol.str = "Eol"
+
+Eof = Char(EOF)
+Eof.__doc__ = \
+ """
+ Eof is an RE which matches the end of the file.
+ """
+Eof.str = "Eof"
+
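
The constructors compose into plain RE trees, with __add__ and __or__ on
the RE base class mapping to Seq and Alt. A short sketch (pattern names
are illustrative):

    from Cython.Plex import Str, Any, Rep, Rep1, Opt, NoCase

    digit = Any("0123456789")
    number = Opt(Any("+-")) + Rep1(digit) + Opt(Str(".") + Rep(digit))
    keyword = NoCase(Str("if") | Str("else"))   # '|' builds an Alt
    print(number)   # calc_str() renders the Seq/Alt/Rep structure
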
diff --git a/contrib/tools/cython/Cython/Plex/Scanners.pxd b/contrib/tools/cython/Cython/Plex/Scanners.pxd
new file mode 100644
index 0000000000..6e75f55e61
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Scanners.pxd
@@ -0,0 +1,50 @@
+from __future__ import absolute_import
+
+import cython
+
+from Cython.Plex.Actions cimport Action
+
+cdef class Scanner:
+
+ cdef public lexicon
+ cdef public stream
+ cdef public name
+ cdef public unicode buffer
+ cdef public Py_ssize_t buf_start_pos
+ cdef public Py_ssize_t next_pos
+ cdef public Py_ssize_t cur_pos
+ cdef public Py_ssize_t cur_line
+ cdef public Py_ssize_t cur_line_start
+ cdef public Py_ssize_t start_pos
+ cdef public Py_ssize_t start_line
+ cdef public Py_ssize_t start_col
+ cdef public text
+ cdef public initial_state # int?
+ cdef public state_name
+ cdef public list queue
+ cdef public bint trace
+ cdef public cur_char
+ cdef public long input_state
+
+ cdef public level
+
+ @cython.final
+ @cython.locals(input_state=long)
+ cdef next_char(self)
+ @cython.locals(action=Action)
+ cpdef tuple read(self)
+ @cython.final
+ cdef tuple scan_a_token(self)
+ ##cdef tuple position(self) # used frequently by Parsing.py
+
+ @cython.final
+ @cython.locals(cur_pos=Py_ssize_t, cur_line=Py_ssize_t, cur_line_start=Py_ssize_t,
+ input_state=long, next_pos=Py_ssize_t, state=dict,
+ buf_start_pos=Py_ssize_t, buf_len=Py_ssize_t, buf_index=Py_ssize_t,
+ trace=bint, discard=Py_ssize_t, data=unicode, buffer=unicode)
+ cdef run_machine_inlined(self)
+
+ @cython.final
+ cdef begin(self, state)
+ @cython.final
+ cdef produce(self, value, text = *)
diff --git a/contrib/tools/cython/Cython/Plex/Scanners.py b/contrib/tools/cython/Cython/Plex/Scanners.py
new file mode 100644
index 0000000000..88f7e2da3b
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Scanners.py
@@ -0,0 +1,338 @@
+# cython: auto_pickle=False
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+#
+# Scanning an input stream
+#
+#=======================================================================
+
+from __future__ import absolute_import
+
+import cython
+
+cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object)
+
+from . import Errors
+from .Regexps import BOL, EOL, EOF
+
+NOT_FOUND = object()
+
+
+class Scanner(object):
+ """
+ A Scanner is used to read tokens from a stream of characters
+ using the token set specified by a Plex.Lexicon.
+
+ Constructor:
+
+ Scanner(lexicon, stream, name = '')
+
+ See the docstring of the __init__ method for details.
+
+ Methods:
+
+ See the docstrings of the individual methods for more
+ information.
+
+ read() --> (value, text)
+ Reads the next lexical token from the stream.
+
+ position() --> (name, line, col)
+ Returns the position of the last token read using the
+ read() method.
+
+ begin(state_name)
+ Causes scanner to change state.
+
+ produce(value [, text])
+ Causes return of a token value to the caller of the
+ Scanner.
+
+ """
+
+ # lexicon = None # Lexicon
+ # stream = None # file-like object
+ # name = ''
+ # buffer = ''
+ # buf_start_pos = 0 # position in input of start of buffer
+ # next_pos = 0 # position in input of next char to read
+ # cur_pos = 0 # position in input of current char
+ # cur_line = 1 # line number of current char
+ # cur_line_start = 0 # position in input of start of current line
+ # start_pos = 0 # position in input of start of token
+ # start_line = 0 # line number of start of token
+ # start_col = 0 # position in line of start of token
+ # text = None # text of last token read
+ # initial_state = None # Node
+ # state_name = '' # Name of initial state
+ # queue = None # list of tokens to be returned
+ # trace = 0
+
+ def __init__(self, lexicon, stream, name='', initial_pos=None):
+ """
+ Scanner(lexicon, stream, name = '')
+
+ |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
+ to be recognised.
+
+ |stream| can be a file object or anything which implements a
+ compatible read() method.
+
+ |name| is optional, and may be the name of the file being
+ scanned or any other identifying string.
+ """
+ self.trace = 0
+
+ self.buffer = u''
+ self.buf_start_pos = 0
+ self.next_pos = 0
+ self.cur_pos = 0
+ self.cur_line = 1
+ self.start_pos = 0
+ self.start_line = 0
+ self.start_col = 0
+ self.text = None
+ self.state_name = None
+
+ self.lexicon = lexicon
+ self.stream = stream
+ self.name = name
+ self.queue = []
+ self.initial_state = None
+ self.begin('')
+ self.next_pos = 0
+ self.cur_pos = 0
+ self.cur_line_start = 0
+ self.cur_char = BOL
+ self.input_state = 1
+ if initial_pos is not None:
+ self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]
+
+ def read(self):
+ """
+ Read the next lexical token from the stream and return a
+ tuple (value, text), where |value| is the value associated with
+ the token as specified by the Lexicon, and |text| is the actual
+ string read from the stream. Returns (None, '') on end of file.
+ """
+ queue = self.queue
+ while not queue:
+ self.text, action = self.scan_a_token()
+ if action is None:
+ self.produce(None)
+ self.eof()
+ else:
+ value = action.perform(self, self.text)
+ if value is not None:
+ self.produce(value)
+ result = queue[0]
+ del queue[0]
+ return result
+
+ def scan_a_token(self):
+ """
+ Read the next input sequence recognised by the machine
+ and return (text, action). Returns ('', None) on end of
+ file.
+ """
+ self.start_pos = self.cur_pos
+ self.start_line = self.cur_line
+ self.start_col = self.cur_pos - self.cur_line_start
+ action = self.run_machine_inlined()
+ if action is not None:
+ if self.trace:
+ print("Scanner: read: Performing %s %d:%d" % (
+ action, self.start_pos, self.cur_pos))
+ text = self.buffer[
+ self.start_pos - self.buf_start_pos:
+ self.cur_pos - self.buf_start_pos]
+ return (text, action)
+ else:
+ if self.cur_pos == self.start_pos:
+ if self.cur_char is EOL:
+ self.next_char()
+ if self.cur_char is None or self.cur_char is EOF:
+ return (u'', None)
+ raise Errors.UnrecognizedInput(self, self.state_name)
+
+ def run_machine_inlined(self):
+ """
+ Inlined version of run_machine for speed.
+ """
+ state = self.initial_state
+ cur_pos = self.cur_pos
+ cur_line = self.cur_line
+ cur_line_start = self.cur_line_start
+ cur_char = self.cur_char
+ input_state = self.input_state
+ next_pos = self.next_pos
+ buffer = self.buffer
+ buf_start_pos = self.buf_start_pos
+ buf_len = len(buffer)
+ b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
+ None, 0, 0, 0, u'', 0, 0
+ trace = self.trace
+ while 1:
+ if trace: #TRACE#
+ print("State %d, %d/%d:%s -->" % ( #TRACE#
+ state['number'], input_state, cur_pos, repr(cur_char))) #TRACE#
+ # Begin inlined self.save_for_backup()
+ #action = state.action #@slow
+ action = state['action'] #@fast
+ if action is not None:
+ b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
+ action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
+ # End inlined self.save_for_backup()
+ c = cur_char
+ #new_state = state.new_state(c) #@slow
+ new_state = state.get(c, NOT_FOUND) #@fast
+ if new_state is NOT_FOUND: #@fast
+ new_state = c and state.get('else') #@fast
+ if new_state:
+ if trace: #TRACE#
+ print("State %d" % new_state['number']) #TRACE#
+ state = new_state
+ # Begin inlined: self.next_char()
+ if input_state == 1:
+ cur_pos = next_pos
+ # Begin inlined: c = self.read_char()
+ buf_index = next_pos - buf_start_pos
+ if buf_index < buf_len:
+ c = buffer[buf_index]
+ next_pos += 1
+ else:
+ discard = self.start_pos - buf_start_pos
+ data = self.stream.read(0x1000)
+ buffer = self.buffer[discard:] + data
+ self.buffer = buffer
+ buf_start_pos += discard
+ self.buf_start_pos = buf_start_pos
+ buf_len = len(buffer)
+ buf_index -= discard
+ if data:
+ c = buffer[buf_index]
+ next_pos += 1
+ else:
+ c = u''
+ # End inlined: c = self.read_char()
+ if c == u'\n':
+ cur_char = EOL
+ input_state = 2
+ elif not c:
+ cur_char = EOL
+ input_state = 4
+ else:
+ cur_char = c
+ elif input_state == 2:
+ cur_char = u'\n'
+ input_state = 3
+ elif input_state == 3:
+ cur_line += 1
+ cur_line_start = cur_pos = next_pos
+ cur_char = BOL
+ input_state = 1
+ elif input_state == 4:
+ cur_char = EOF
+ input_state = 5
+ else: # input_state = 5
+ cur_char = u''
+ # End inlined self.next_char()
+ else: # not new_state
+ if trace: #TRACE#
+ print("blocked") #TRACE#
+ # Begin inlined: action = self.back_up()
+ if b_action is not None:
+ (action, cur_pos, cur_line, cur_line_start,
+ cur_char, input_state, next_pos) = \
+ (b_action, b_cur_pos, b_cur_line, b_cur_line_start,
+ b_cur_char, b_input_state, b_next_pos)
+ else:
+ action = None
+ break # while 1
+ # End inlined: action = self.back_up()
+ self.cur_pos = cur_pos
+ self.cur_line = cur_line
+ self.cur_line_start = cur_line_start
+ self.cur_char = cur_char
+ self.input_state = input_state
+ self.next_pos = next_pos
+ if trace: #TRACE#
+ if action is not None: #TRACE#
+ print("Doing %s" % action) #TRACE#
+ return action
+
+ def next_char(self):
+ input_state = self.input_state
+ if self.trace:
+ print("Scanner: next: %s [%d] %d" % (" " * 20, input_state, self.cur_pos))
+ if input_state == 1:
+ self.cur_pos = self.next_pos
+ c = self.read_char()
+ if c == u'\n':
+ self.cur_char = EOL
+ self.input_state = 2
+ elif not c:
+ self.cur_char = EOL
+ self.input_state = 4
+ else:
+ self.cur_char = c
+ elif input_state == 2:
+ self.cur_char = u'\n'
+ self.input_state = 3
+ elif input_state == 3:
+ self.cur_line += 1
+ self.cur_line_start = self.cur_pos = self.next_pos
+ self.cur_char = BOL
+ self.input_state = 1
+ elif input_state == 4:
+ self.cur_char = EOF
+ self.input_state = 5
+ else: # input_state = 5
+ self.cur_char = u''
+ if self.trace:
+ print("--> [%d] %d %r" % (input_state, self.cur_pos, self.cur_char))
+
+ def position(self):
+ """
+ Return a tuple (name, line, col) representing the location of
+ the last token read using the read() method. |name| is the
+ name that was provided to the Scanner constructor; |line|
+ is the line number in the stream (1-based); |col| is the
+ position within the line of the first character of the token
+ (0-based).
+ """
+ return (self.name, self.start_line, self.start_col)
+
+ def get_position(self):
+ """Python accessible wrapper around position(), only for error reporting.
+ """
+ return self.position()
+
+ def begin(self, state_name):
+ """Set the current state of the scanner to the named state."""
+ self.initial_state = (
+ self.lexicon.get_initial_state(state_name))
+ self.state_name = state_name
+
+ def produce(self, value, text=None):
+ """
+ Called from an action procedure, causes |value| to be returned
+ as the token value from read(). If |text| is supplied, it is
+ returned in place of the scanned text.
+
+ produce() can be called more than once during a single call to an action
+ procedure, in which case the tokens are queued up and returned one
+ at a time by subsequent calls to read(), until the queue is empty,
+ whereupon scanning resumes.
+ """
+ if text is None:
+ text = self.text
+ self.queue.append((value, text))
+
+ def eof(self):
+ """
+ Override this method if you want something to be done at
+ end of file.
+ """
diff --git a/contrib/tools/cython/Cython/Plex/Timing.py b/contrib/tools/cython/Cython/Plex/Timing.py
new file mode 100644
index 0000000000..5c3692693b
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Timing.py
@@ -0,0 +1,23 @@
+#
+# Get time in platform-dependent way
+#
+
+from __future__ import absolute_import
+
+import os
+from sys import platform, exit, stderr
+
+if platform == 'mac':
+ import MacOS
+ def time():
+ return MacOS.GetTicks() / 60.0
+ timekind = "real"
+elif hasattr(os, 'times'):
+ def time():
+ t = os.times()
+ return t[0] + t[1]
+ timekind = "cpu"
+else:
+ stderr.write(
+ "Don't know how to get time on platform %s\n" % repr(platform))
+ exit(1)
diff --git a/contrib/tools/cython/Cython/Plex/Traditional.py b/contrib/tools/cython/Cython/Plex/Traditional.py
new file mode 100644
index 0000000000..ec7252daed
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Traditional.py
@@ -0,0 +1,158 @@
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+# Traditional Regular Expression Syntax
+#
+#=======================================================================
+
+from __future__ import absolute_import
+
+from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
+from .Errors import PlexError
+
+
+class RegexpSyntaxError(PlexError):
+ pass
+
+
+def re(s):
+ """
+ Convert traditional string representation of regular expression |s|
+ into Plex representation.
+ """
+ return REParser(s).parse_re()
+
+
+class REParser(object):
+ def __init__(self, s):
+ self.s = s
+ self.i = -1
+ self.end = 0
+ self.next()
+
+ def parse_re(self):
+ re = self.parse_alt()
+ if not self.end:
+ self.error("Unexpected %s" % repr(self.c))
+ return re
+
+ def parse_alt(self):
+ """Parse a set of alternative regexps."""
+ re = self.parse_seq()
+ if self.c == '|':
+ re_list = [re]
+ while self.c == '|':
+ self.next()
+ re_list.append(self.parse_seq())
+ re = Alt(*re_list)
+ return re
+
+ def parse_seq(self):
+ """Parse a sequence of regexps."""
+ re_list = []
+ while not self.end and not self.c in "|)":
+ re_list.append(self.parse_mod())
+ return Seq(*re_list)
+
+ def parse_mod(self):
+ """Parse a primitive regexp followed by *, +, ? modifiers."""
+ re = self.parse_prim()
+ while not self.end and self.c in "*+?":
+ if self.c == '*':
+ re = Rep(re)
+ elif self.c == '+':
+ re = Rep1(re)
+ else: # self.c == '?'
+ re = Opt(re)
+ self.next()
+ return re
+
+ def parse_prim(self):
+ """Parse a primitive regexp."""
+ c = self.get()
+ if c == '.':
+ re = AnyBut("\n")
+ elif c == '^':
+ re = Bol
+ elif c == '$':
+ re = Eol
+ elif c == '(':
+ re = self.parse_alt()
+ self.expect(')')
+ elif c == '[':
+ re = self.parse_charset()
+ self.expect(']')
+ else:
+ if c == '\\':
+ c = self.get()
+ re = Char(c)
+ return re
+
+ def parse_charset(self):
+ """Parse a charset. Does not include the surrounding []."""
+ char_list = []
+ invert = 0
+ if self.c == '^':
+ invert = 1
+ self.next()
+ if self.c == ']':
+ char_list.append(']')
+ self.next()
+ while not self.end and self.c != ']':
+ c1 = self.get()
+ if self.c == '-' and self.lookahead(1) != ']':
+ self.next()
+ c2 = self.get()
+ for a in range(ord(c1), ord(c2) + 1):
+ char_list.append(chr(a))
+ else:
+ char_list.append(c1)
+ chars = ''.join(char_list)
+ if invert:
+ return AnyBut(chars)
+ else:
+ return Any(chars)
+
+ def next(self):
+ """Advance to the next char."""
+ s = self.s
+ i = self.i = self.i + 1
+ if i < len(s):
+ self.c = s[i]
+ else:
+ self.c = ''
+ self.end = 1
+
+ def get(self):
+ if self.end:
+ self.error("Premature end of string")
+ c = self.c
+ self.next()
+ return c
+
+ def lookahead(self, n):
+ """Look ahead n chars."""
+ j = self.i + n
+ if j < len(self.s):
+ return self.s[j]
+ else:
+ return ''
+
+ def expect(self, c):
+ """
+ Expect to find character |c| at current position.
+ Raises an exception otherwise.
+ """
+ if self.c == c:
+ self.next()
+ else:
+ self.error("Missing %s" % repr(c))
+
+ def error(self, mess):
+ """Raise exception to signal syntax error in regexp."""
+ raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
+ repr(self.s), self.i, mess))
+
+
+
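
re() gives a traditional shorthand for the constructor-based patterns in
Regexps.py. A sketch (the alias avoids shadowing the stdlib re module; the
lexicon is illustrative):

    from io import StringIO
    from Cython.Plex import Lexicon, Scanner, TEXT, IGNORE
    from Cython.Plex.Traditional import re as plex_re

    ident = plex_re("[A-Za-z_][A-Za-z0-9_]*")   # parsed into Seq/Rep/Any nodes
    lexicon = Lexicon([(ident, TEXT), (plex_re("[ \t\n]+"), IGNORE)])
    scanner = Scanner(lexicon, StringIO(u"foo bar"))
    print(scanner.read())   # ('foo', 'foo')
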
diff --git a/contrib/tools/cython/Cython/Plex/Transitions.py b/contrib/tools/cython/Cython/Plex/Transitions.py
new file mode 100644
index 0000000000..3833817946
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/Transitions.py
@@ -0,0 +1,251 @@
+#
+# Plex - Transition Maps
+#
+# This version represents state sets directly as dicts for speed.
+#
+
+from __future__ import absolute_import
+
+try:
+ from sys import maxsize as maxint
+except ImportError:
+ from sys import maxint
+
+
+class TransitionMap(object):
+ """
+ A TransitionMap maps an input event to a set of states.
+ An input event is one of: a range of character codes,
+ the empty string (representing an epsilon move), or one
+ of the special symbols BOL, EOL, EOF.
+
+ For characters, this implementation compactly represents
+ the map by means of a list:
+
+ [code_0, states_0, code_1, states_1, code_2, states_2,
+ ..., code_n-1, states_n-1, code_n]
+
+ where |code_i| is a character code, and |states_i| is a
+ set of states corresponding to characters with codes |c|
+ in the range |code_i| <= |c| <= |code_i+1|.
+
+ The following invariants hold:
+ n >= 1
+ code_0 == -maxint
+ code_n == maxint
+ code_i < code_i+1 for i in 0..n-1
+ states_0 == states_n-1
+
+ Mappings for the special events '', BOL, EOL, EOF are
+ kept separately in a dictionary.
+ """
+
+ map = None # The list of codes and states
+ special = None # Mapping for special events
+
+ def __init__(self, map=None, special=None):
+ if not map:
+ map = [-maxint, {}, maxint]
+ if not special:
+ special = {}
+ self.map = map
+ self.special = special
+ #self.check() ###
+
+ def add(self, event, new_state,
+ TupleType=tuple):
+ """
+ Add transition to |new_state| on |event|.
+ """
+ if type(event) is TupleType:
+ code0, code1 = event
+ i = self.split(code0)
+ j = self.split(code1)
+ map = self.map
+ while i < j:
+ map[i + 1][new_state] = 1
+ i += 2
+ else:
+ self.get_special(event)[new_state] = 1
+
+ def add_set(self, event, new_set,
+ TupleType=tuple):
+ """
+ Add transitions to the states in |new_set| on |event|.
+ """
+ if type(event) is TupleType:
+ code0, code1 = event
+ i = self.split(code0)
+ j = self.split(code1)
+ map = self.map
+ while i < j:
+ map[i + 1].update(new_set)
+ i += 2
+ else:
+ self.get_special(event).update(new_set)
+
+ def get_epsilon(self,
+ none=None):
+ """
+ Return the mapping for epsilon, or None.
+ """
+ return self.special.get('', none)
+
+ def iteritems(self,
+ len=len):
+ """
+ Return the mapping as an iterable of ((code1, code2), state_set) and
+ (special_event, state_set) pairs.
+ """
+ result = []
+ map = self.map
+ else_set = map[1]
+ i = 0
+ n = len(map) - 1
+ code0 = map[0]
+ while i < n:
+ set = map[i + 1]
+ code1 = map[i + 2]
+ if set or else_set:
+ result.append(((code0, code1), set))
+ code0 = code1
+ i += 2
+ for event, set in self.special.items():
+ if set:
+ result.append((event, set))
+ return iter(result)
+
+ items = iteritems
+
+ # ------------------- Private methods --------------------
+
+ def split(self, code,
+ len=len, maxint=maxint):
+ """
+ Search the list for the position of the split point for |code|,
+ inserting a new split point if necessary. Returns index |i| such
+ that |code| == |map[i]|.
+ """
+ # We use a funky variation on binary search.
+ map = self.map
+ hi = len(map) - 1
+ # Special case: code == map[-1]
+ if code == maxint:
+ return hi
+ # General case
+ lo = 0
+ # loop invariant: map[lo] <= code < map[hi] and hi - lo >= 2
+ while hi - lo >= 4:
+ # Find midpoint truncated to even index
+ mid = ((lo + hi) // 2) & ~1
+ if code < map[mid]:
+ hi = mid
+ else:
+ lo = mid
+ # map[lo] <= code < map[hi] and hi - lo == 2
+ if map[lo] == code:
+ return lo
+ else:
+ map[hi:hi] = [code, map[hi - 1].copy()]
+ #self.check() ###
+ return hi
+
+ def get_special(self, event):
+ """
+ Get state set for special event, adding a new entry if necessary.
+ """
+ special = self.special
+ set = special.get(event, None)
+ if not set:
+ set = {}
+ special[event] = set
+ return set
+
+ # --------------------- Conversion methods -----------------------
+
+ def __str__(self):
+ map_strs = []
+ map = self.map
+ n = len(map)
+ i = 0
+ while i < n:
+ code = map[i]
+ if code == -maxint:
+ code_str = "-inf"
+ elif code == maxint:
+ code_str = "inf"
+ else:
+ code_str = str(code)
+ map_strs.append(code_str)
+ i += 1
+ if i < n:
+ map_strs.append(state_set_str(map[i]))
+ i += 1
+ special_strs = {}
+ for event, set in self.special.items():
+ special_strs[event] = state_set_str(set)
+ return "[%s]+%s" % (
+ ','.join(map_strs),
+ special_strs
+ )
+
+ # --------------------- Debugging methods -----------------------
+
+ def check(self):
+ """Check data structure integrity."""
+ if not self.map[-3] < self.map[-1]:
+ print(self)
+ assert 0
+
+ def dump(self, file):
+ map = self.map
+ i = 0
+ n = len(map) - 1
+ while i < n:
+ self.dump_range(map[i], map[i + 2], map[i + 1], file)
+ i += 2
+ for event, set in self.special.items():
+ if set:
+ if not event:
+ event = 'empty'
+ self.dump_trans(event, set, file)
+
+ def dump_range(self, code0, code1, set, file):
+ if set:
+ if code0 == -maxint:
+ if code1 == maxint:
+ k = "any"
+ else:
+ k = "< %s" % self.dump_char(code1)
+ elif code1 == maxint:
+ k = "> %s" % self.dump_char(code0 - 1)
+ elif code0 == code1 - 1:
+ k = self.dump_char(code0)
+ else:
+ k = "%s..%s" % (self.dump_char(code0),
+ self.dump_char(code1 - 1))
+ self.dump_trans(k, set, file)
+
+ def dump_char(self, code):
+ if 0 <= code <= 255:
+ return repr(chr(code))
+ else:
+ return "chr(%d)" % code
+
+ def dump_trans(self, key, set, file):
+ file.write(" %s --> %s\n" % (key, self.dump_set(set)))
+
+ def dump_set(self, set):
+ return state_set_str(set)
+
+
+#
+# State set manipulation functions
+#
+
+#def merge_state_sets(set1, set2):
+# for state in set2.keys():
+# set1[state] = 1
+
+def state_set_str(set):
+ return "[%s]" % ','.join(["S%d" % state.number for state in set])
diff --git a/contrib/tools/cython/Cython/Plex/__init__.py b/contrib/tools/cython/Cython/Plex/__init__.py
new file mode 100644
index 0000000000..81a066f782
--- /dev/null
+++ b/contrib/tools/cython/Cython/Plex/__init__.py
@@ -0,0 +1,39 @@
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+#=======================================================================
+
+"""
+The Plex module provides lexical analysers with similar capabilities
+to GNU Flex. The following classes and functions are exported;
+see the attached docstrings for more information.
+
+ Scanner For scanning a character stream under the
+ direction of a Lexicon.
+
+ Lexicon For constructing a lexical definition
+ to be used by a Scanner.
+
+ Str, Any, AnyBut, AnyChar, Seq, Alt, Opt, Rep, Rep1,
+ Bol, Eol, Eof, Empty
+
+ Regular expression constructors, for building pattern
+ definitions for a Lexicon.
+
+ State For defining scanner states when creating a
+ Lexicon.
+
+ TEXT, IGNORE, Begin
+
+ Actions for associating with patterns when
+ creating a Lexicon.
+"""
+
+from __future__ import absolute_import
+
+from .Actions import TEXT, IGNORE, Begin
+from .Lexicons import Lexicon, State
+from .Regexps import RE, Seq, Alt, Rep1, Empty, Str, Any, AnyBut, AnyChar, Range
+from .Regexps import Opt, Rep, Bol, Eol, Eof, Case, NoCase
+from .Scanners import Scanner