author    | alexv-smirnov <alex@ydb.tech> | 2023-03-15 19:59:12 +0300
committer | alexv-smirnov <alex@ydb.tech> | 2023-03-15 19:59:12 +0300
commit    | 056bb284ccf8dd6793ec3a54ffa36c4fb2b9ad11 (patch)
tree      | 4740980126f32e3af7937ba0ca5f83e59baa4ab0 /contrib/tools/cython/Cython/Plex
parent    | 269126dcced1cc8b53eb4398b4a33e5142f10290 (diff)
download  | ydb-056bb284ccf8dd6793ec3a54ffa36c4fb2b9ad11.tar.gz
add library/cpp/actors, ymake build to ydb oss export
Diffstat (limited to 'contrib/tools/cython/Cython/Plex')
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Actions.pxd | 25
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Actions.py | 110
-rw-r--r-- | contrib/tools/cython/Cython/Plex/DFA.py | 164
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Errors.py | 54
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Lexicons.py | 200
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Machines.py | 255
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Regexps.py | 576
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Scanners.pxd | 50
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Scanners.py | 338
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Timing.py | 23
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Traditional.py | 158
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Transitions.py | 251
-rw-r--r-- | contrib/tools/cython/Cython/Plex/__init__.py | 39
13 files changed, 2243 insertions, 0 deletions
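
The diff below vendors Plex, the lexical-analysis package used by Cython's parser: regexp constructors (Regexps.py) are compiled through an NFA (Machines.py) and an NFA-to-DFA pass (DFA.py) into tables driven by a Scanner (Scanners.py) under a Lexicon specification (Lexicons.py). As orientation before the diff itself, here is a minimal usage sketch (editorial, not part of the commit; the token names and patterns are illustrative only):

# A minimal sketch of the Plex API added in this commit.
# Token names ('ident', 'int') and patterns are illustrative, not from the diff.
from io import StringIO

from Cython.Plex import Lexicon, Scanner, Range, Rep, Rep1, Any, IGNORE

letter = Range("azAZ") | Any("_")               # like [a-zA-Z_]
digit = Range("09")
lexicon = Lexicon([
    (letter + Rep(letter | digit), 'ident'),    # a plain value becomes a Return action
    (Rep1(digit),                  'int'),
    (Any(" \t\n"),                 IGNORE),     # whitespace is skipped
])

scanner = Scanner(lexicon, StringIO(u"spam 42 eggs"), "example")
while True:
    value, text = scanner.read()                # returns (None, '') at end of file
    if value is None:
        break
    print(value, repr(text))                    # ident 'spam', int '42', ident 'eggs'

Earlier rules win ties: Lexicon assigns each token priority -token_number, and Node.set_action keeps the highest-priority action, so listing keywords before a general identifier rule resolves the overlap.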
diff --git a/contrib/tools/cython/Cython/Plex/Actions.pxd b/contrib/tools/cython/Cython/Plex/Actions.pxd new file mode 100644 index 0000000000..34660a2d9b --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Actions.pxd @@ -0,0 +1,25 @@ + +cdef class Action: + cdef perform(self, token_stream, text) + cpdef same_as(self, other) + +cdef class Return(Action): + cdef object value + cdef perform(self, token_stream, text) + cpdef same_as(self, other) + +cdef class Call(Action): + cdef object function + cdef perform(self, token_stream, text) + cpdef same_as(self, other) + +cdef class Begin(Action): + cdef object state_name + cdef perform(self, token_stream, text) + cpdef same_as(self, other) + +cdef class Ignore(Action): + cdef perform(self, token_stream, text) + +cdef class Text(Action): + cdef perform(self, token_stream, text) diff --git a/contrib/tools/cython/Cython/Plex/Actions.py b/contrib/tools/cython/Cython/Plex/Actions.py new file mode 100644 index 0000000000..c88176e716 --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Actions.py @@ -0,0 +1,110 @@ +# cython: auto_pickle=False +#======================================================================= +# +# Python Lexical Analyser +# +# Actions for use in token specifications +# +#======================================================================= + +class Action(object): + def perform(self, token_stream, text): + pass # abstract + + def same_as(self, other): + return self is other + + +class Return(Action): + """ + Internal Plex action which causes |value| to + be returned as the value of the associated token + """ + + def __init__(self, value): + self.value = value + + def perform(self, token_stream, text): + return self.value + + def same_as(self, other): + return isinstance(other, Return) and self.value == other.value + + def __repr__(self): + return "Return(%s)" % repr(self.value) + + +class Call(Action): + """ + Internal Plex action which causes a function to be called. + """ + + def __init__(self, function): + self.function = function + + def perform(self, token_stream, text): + return self.function(token_stream, text) + + def __repr__(self): + return "Call(%s)" % self.function.__name__ + + def same_as(self, other): + return isinstance(other, Call) and self.function is other.function + + +class Begin(Action): + """ + Begin(state_name) is a Plex action which causes the Scanner to + enter the state |state_name|. See the docstring of Plex.Lexicon + for more information. + """ + + def __init__(self, state_name): + self.state_name = state_name + + def perform(self, token_stream, text): + token_stream.begin(self.state_name) + + def __repr__(self): + return "Begin(%s)" % self.state_name + + def same_as(self, other): + return isinstance(other, Begin) and self.state_name == other.state_name + + +class Ignore(Action): + """ + IGNORE is a Plex action which causes its associated token + to be ignored. See the docstring of Plex.Lexicon for more + information. + """ + + def perform(self, token_stream, text): + return None + + def __repr__(self): + return "IGNORE" + + +IGNORE = Ignore() +#IGNORE.__doc__ = Ignore.__doc__ + + +class Text(Action): + """ + TEXT is a Plex action which causes the text of a token to + be returned as the value of the token. See the docstring of + Plex.Lexicon for more information. 
+ """ + + def perform(self, token_stream, text): + return text + + def __repr__(self): + return "TEXT" + + +TEXT = Text() +#TEXT.__doc__ = Text.__doc__ + + diff --git a/contrib/tools/cython/Cython/Plex/DFA.py b/contrib/tools/cython/Cython/Plex/DFA.py new file mode 100644 index 0000000000..76324621fc --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/DFA.py @@ -0,0 +1,164 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Converting NFA to DFA +# +#======================================================================= + +from __future__ import absolute_import + +from . import Machines +from .Machines import LOWEST_PRIORITY +from .Transitions import TransitionMap + + +def nfa_to_dfa(old_machine, debug=None): + """ + Given a nondeterministic Machine, return a new equivalent + Machine which is deterministic. + """ + # We build a new machine whose states correspond to sets of states + # in the old machine. Initially we add a new state corresponding to + # the epsilon-closure of each initial old state. Then we give transitions + # to each new state which are the union of all transitions out of any + # of the corresponding old states. The new state reached on a given + # character is the one corresponding to the set of states reachable + # on that character from any of the old states. As new combinations of + # old states are created, new states are added as needed until closure + # is reached. + new_machine = Machines.FastMachine() + state_map = StateMap(new_machine) + # Seed the process using the initial states of the old machine. + # Make the corresponding new states into initial states of the new + # machine with the same names. + for (key, old_state) in old_machine.initial_states.items(): + new_state = state_map.old_to_new(epsilon_closure(old_state)) + new_machine.make_initial_state(key, new_state) + # Tricky bit here: we add things to the end of this list while we're + # iterating over it. The iteration stops when closure is achieved. + for new_state in new_machine.states: + transitions = TransitionMap() + for old_state in state_map.new_to_old(new_state): + for event, old_target_states in old_state.transitions.items(): + if event and old_target_states: + transitions.add_set(event, set_epsilon_closure(old_target_states)) + for event, old_states in transitions.items(): + new_machine.add_transitions(new_state, event, state_map.old_to_new(old_states)) + if debug: + debug.write("\n===== State Mapping =====\n") + state_map.dump(debug) + return new_machine + + +def set_epsilon_closure(state_set): + """ + Given a set of states, return the union of the epsilon + closures of its member states. + """ + result = {} + for state1 in state_set: + for state2 in epsilon_closure(state1): + result[state2] = 1 + return result + + +def epsilon_closure(state): + """ + Return the set of states reachable from the given state + by epsilon moves. + """ + # Cache the result + result = state.epsilon_closure + if result is None: + result = {} + state.epsilon_closure = result + add_to_epsilon_closure(result, state) + return result + + +def add_to_epsilon_closure(state_set, state): + """ + Recursively add to |state_set| states reachable from the given state + by epsilon moves. 
+ """ + if not state_set.get(state, 0): + state_set[state] = 1 + state_set_2 = state.transitions.get_epsilon() + if state_set_2: + for state2 in state_set_2: + add_to_epsilon_closure(state_set, state2) + + +class StateMap(object): + """ + Helper class used by nfa_to_dfa() to map back and forth between + sets of states from the old machine and states of the new machine. + """ + new_machine = None # Machine + old_to_new_dict = None # {(old_state,...) : new_state} + new_to_old_dict = None # {id(new_state) : old_state_set} + + def __init__(self, new_machine): + self.new_machine = new_machine + self.old_to_new_dict = {} + self.new_to_old_dict = {} + + def old_to_new(self, old_state_set): + """ + Return the state of the new machine corresponding to the + set of old machine states represented by |state_set|. A new + state will be created if necessary. If any of the old states + are accepting states, the new state will be an accepting state + with the highest priority action from the old states. + """ + key = self.make_key(old_state_set) + new_state = self.old_to_new_dict.get(key, None) + if not new_state: + action = self.highest_priority_action(old_state_set) + new_state = self.new_machine.new_state(action) + self.old_to_new_dict[key] = new_state + self.new_to_old_dict[id(new_state)] = old_state_set + #for old_state in old_state_set.keys(): + #new_state.merge_actions(old_state) + return new_state + + def highest_priority_action(self, state_set): + best_action = None + best_priority = LOWEST_PRIORITY + for state in state_set: + priority = state.action_priority + if priority > best_priority: + best_action = state.action + best_priority = priority + return best_action + + # def old_to_new_set(self, old_state_set): + # """ + # Return the new state corresponding to a set of old states as + # a singleton set. + # """ + # return {self.old_to_new(old_state_set):1} + + def new_to_old(self, new_state): + """Given a new state, return a set of corresponding old states.""" + return self.new_to_old_dict[id(new_state)] + + def make_key(self, state_set): + """ + Convert a set of states into a uniquified + sorted tuple suitable for use as a dictionary key. 
+ """ + lst = list(state_set) + lst.sort() + return tuple(lst) + + def dump(self, file): + from .Transitions import state_set_str + + for new_state in self.new_machine.states: + old_state_set = self.new_to_old_dict[id(new_state)] + file.write(" State %s <-- %s\n" % ( + new_state['number'], state_set_str(old_state_set))) + + diff --git a/contrib/tools/cython/Cython/Plex/Errors.py b/contrib/tools/cython/Cython/Plex/Errors.py new file mode 100644 index 0000000000..f460100d77 --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Errors.py @@ -0,0 +1,54 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Exception classes +# +#======================================================================= + + +class PlexError(Exception): + message = "" + + +class PlexTypeError(PlexError, TypeError): + pass + + +class PlexValueError(PlexError, ValueError): + pass + + +class InvalidRegex(PlexError): + pass + + +class InvalidToken(PlexError): + def __init__(self, token_number, message): + PlexError.__init__(self, "Token number %d: %s" % (token_number, message)) + + +class InvalidScanner(PlexError): + pass + + +class AmbiguousAction(PlexError): + message = "Two tokens with different actions can match the same string" + + def __init__(self): + pass + + +class UnrecognizedInput(PlexError): + scanner = None + position = None + state_name = None + + def __init__(self, scanner, state_name): + self.scanner = scanner + self.position = scanner.get_position() + self.state_name = state_name + + def __str__(self): + return ("'%s', line %d, char %d: Token not recognised in state %r" % ( + self.position + (self.state_name,))) diff --git a/contrib/tools/cython/Cython/Plex/Lexicons.py b/contrib/tools/cython/Cython/Plex/Lexicons.py new file mode 100644 index 0000000000..787f5854b8 --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Lexicons.py @@ -0,0 +1,200 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Lexical Analyser Specification +# +#======================================================================= + +from __future__ import absolute_import + +import types + +from . import Actions +from . import DFA +from . import Errors +from . import Machines +from . import Regexps + +# debug_flags for Lexicon constructor +DUMP_NFA = 1 +DUMP_DFA = 2 + + +class State(object): + """ + This class is used as part of a Plex.Lexicon specification to + introduce a user-defined state. + + Constructor: + + State(name, token_specifications) + """ + + name = None + tokens = None + + def __init__(self, name, tokens): + self.name = name + self.tokens = tokens + + +class Lexicon(object): + """ + Lexicon(specification) builds a lexical analyser from the given + |specification|. The specification consists of a list of + specification items. Each specification item may be either: + + 1) A token definition, which is a tuple: + + (pattern, action) + + The |pattern| is a regular axpression built using the + constructors defined in the Plex module. + + The |action| is the action to be performed when this pattern + is recognised (see below). + + 2) A state definition: + + State(name, tokens) + + where |name| is a character string naming the state, + and |tokens| is a list of token definitions as + above. The meaning and usage of states is described + below. 
+ + Actions + ------- + + The |action| in a token specication may be one of three things: + + 1) A function, which is called as follows: + + function(scanner, text) + + where |scanner| is the relevant Scanner instance, and |text| + is the matched text. If the function returns anything + other than None, that value is returned as the value of the + token. If it returns None, scanning continues as if the IGNORE + action were specified (see below). + + 2) One of the following special actions: + + IGNORE means that the recognised characters will be treated as + white space and ignored. Scanning will continue until + the next non-ignored token is recognised before returning. + + TEXT causes the scanned text itself to be returned as the + value of the token. + + 3) Any other value, which is returned as the value of the token. + + States + ------ + + At any given time, the scanner is in one of a number of states. + Associated with each state is a set of possible tokens. When scanning, + only tokens associated with the current state are recognised. + + There is a default state, whose name is the empty string. Token + definitions which are not inside any State definition belong to + the default state. + + The initial state of the scanner is the default state. The state can + be changed in one of two ways: + + 1) Using Begin(state_name) as the action of a token. + + 2) Calling the begin(state_name) method of the Scanner. + + To change back to the default state, use '' as the state name. + """ + + machine = None # Machine + tables = None # StateTableMachine + + def __init__(self, specifications, debug=None, debug_flags=7, timings=None): + if not isinstance(specifications, list): + raise Errors.InvalidScanner("Scanner definition is not a list") + if timings: + from .Timing import time + + total_time = 0.0 + time1 = time() + nfa = Machines.Machine() + default_initial_state = nfa.new_initial_state('') + token_number = 1 + for spec in specifications: + if isinstance(spec, State): + user_initial_state = nfa.new_initial_state(spec.name) + for token in spec.tokens: + self.add_token_to_machine( + nfa, user_initial_state, token, token_number) + token_number += 1 + elif isinstance(spec, tuple): + self.add_token_to_machine( + nfa, default_initial_state, spec, token_number) + token_number += 1 + else: + raise Errors.InvalidToken( + token_number, + "Expected a token definition (tuple) or State instance") + if timings: + time2 = time() + total_time = total_time + (time2 - time1) + time3 = time() + if debug and (debug_flags & 1): + debug.write("\n============= NFA ===========\n") + nfa.dump(debug) + dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug) + if timings: + time4 = time() + total_time = total_time + (time4 - time3) + if debug and (debug_flags & 2): + debug.write("\n============= DFA ===========\n") + dfa.dump(debug) + if timings: + timings.write("Constructing NFA : %5.2f\n" % (time2 - time1)) + timings.write("Converting to DFA: %5.2f\n" % (time4 - time3)) + timings.write("TOTAL : %5.2f\n" % total_time) + self.machine = dfa + + def add_token_to_machine(self, machine, initial_state, token_spec, token_number): + try: + (re, action_spec) = self.parse_token_definition(token_spec) + # Disabled this -- matching empty strings can be useful + #if re.nullable: + # raise Errors.InvalidToken( + # token_number, "Pattern can match 0 input symbols") + if isinstance(action_spec, Actions.Action): + action = action_spec + else: + try: + action_spec.__call__ + except AttributeError: + action = 
Actions.Return(action_spec) + else: + action = Actions.Call(action_spec) + final_state = machine.new_state() + re.build_machine(machine, initial_state, final_state, + match_bol=1, nocase=0) + final_state.set_action(action, priority=-token_number) + except Errors.PlexError as e: + raise e.__class__("Token number %d: %s" % (token_number, e)) + + def parse_token_definition(self, token_spec): + if not isinstance(token_spec, tuple): + raise Errors.InvalidToken("Token definition is not a tuple") + if len(token_spec) != 2: + raise Errors.InvalidToken("Wrong number of items in token definition") + pattern, action = token_spec + if not isinstance(pattern, Regexps.RE): + raise Errors.InvalidToken("Pattern is not an RE instance") + return (pattern, action) + + def get_initial_state(self, name): + return self.machine.get_initial_state(name) + + + diff --git a/contrib/tools/cython/Cython/Plex/Machines.py b/contrib/tools/cython/Cython/Plex/Machines.py new file mode 100644 index 0000000000..398850976b --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Machines.py @@ -0,0 +1,255 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Classes for building NFAs and DFAs +# +#======================================================================= + +from __future__ import absolute_import + +import sys + +from .Transitions import TransitionMap + +try: + from sys import maxsize as maxint +except ImportError: + from sys import maxint + +try: + unichr +except NameError: + unichr = chr + +LOWEST_PRIORITY = -maxint + + +class Machine(object): + """A collection of Nodes representing an NFA or DFA.""" + states = None # [Node] + next_state_number = 1 + initial_states = None # {(name, bol): Node} + + def __init__(self): + self.states = [] + self.initial_states = {} + + def __del__(self): + #print "Destroying", self ### + for state in self.states: + state.destroy() + + def new_state(self): + """Add a new state to the machine and return it.""" + s = Node() + n = self.next_state_number + self.next_state_number = n + 1 + s.number = n + self.states.append(s) + return s + + def new_initial_state(self, name): + state = self.new_state() + self.make_initial_state(name, state) + return state + + def make_initial_state(self, name, state): + self.initial_states[name] = state + + def get_initial_state(self, name): + return self.initial_states[name] + + def dump(self, file): + file.write("Plex.Machine:\n") + if self.initial_states is not None: + file.write(" Initial states:\n") + for (name, state) in sorted(self.initial_states.items()): + file.write(" '%s': %d\n" % (name, state.number)) + for s in self.states: + s.dump(file) + + +class Node(object): + """A state of an NFA or DFA.""" + transitions = None # TransitionMap + action = None # Action + action_priority = None # integer + number = 0 # for debug output + epsilon_closure = None # used by nfa_to_dfa() + + def __init__(self): + # Preinitialise the list of empty transitions, because + # the nfa-to-dfa algorithm needs it + #self.transitions = {'':[]} + self.transitions = TransitionMap() + self.action_priority = LOWEST_PRIORITY + + def destroy(self): + #print "Destroying", self ### + self.transitions = None + self.action = None + self.epsilon_closure = None + + def add_transition(self, event, new_state): + self.transitions.add(event, new_state) + + def link_to(self, state): + """Add an epsilon-move from this state to another state.""" + self.add_transition('', state) + + def set_action(self, action, priority): + """Make 
this an accepting state with the given action. If + there is already an action, choose the action with highest + priority.""" + if priority > self.action_priority: + self.action = action + self.action_priority = priority + + def get_action(self): + return self.action + + def get_action_priority(self): + return self.action_priority + + def is_accepting(self): + return self.action is not None + + def __str__(self): + return "State %d" % self.number + + def dump(self, file): + # Header + file.write(" State %d:\n" % self.number) + # Transitions + # self.dump_transitions(file) + self.transitions.dump(file) + # Action + action = self.action + priority = self.action_priority + if action is not None: + file.write(" %s [priority %d]\n" % (action, priority)) + + def __lt__(self, other): + return self.number < other.number + + +class FastMachine(object): + """ + FastMachine is a deterministic machine represented in a way that + allows fast scanning. + """ + initial_states = None # {state_name:state} + states = None # [state] where state = {event:state, 'else':state, 'action':Action} + next_number = 1 # for debugging + + new_state_template = { + '': None, 'bol': None, 'eol': None, 'eof': None, 'else': None + } + + def __init__(self): + self.initial_states = {} + self.states = [] + + def __del__(self): + for state in self.states: + state.clear() + + def new_state(self, action=None): + number = self.next_number + self.next_number = number + 1 + result = self.new_state_template.copy() + result['number'] = number + result['action'] = action + self.states.append(result) + return result + + def make_initial_state(self, name, state): + self.initial_states[name] = state + + def add_transitions(self, state, event, new_state, maxint=maxint): + if type(event) is tuple: + code0, code1 = event + if code0 == -maxint: + state['else'] = new_state + elif code1 != maxint: + while code0 < code1: + state[unichr(code0)] = new_state + code0 += 1 + else: + state[event] = new_state + + def get_initial_state(self, name): + return self.initial_states[name] + + def dump(self, file): + file.write("Plex.FastMachine:\n") + file.write(" Initial states:\n") + for name, state in sorted(self.initial_states.items()): + file.write(" %s: %s\n" % (repr(name), state['number'])) + for state in self.states: + self.dump_state(state, file) + + def dump_state(self, state, file): + # Header + file.write(" State %d:\n" % state['number']) + # Transitions + self.dump_transitions(state, file) + # Action + action = state['action'] + if action is not None: + file.write(" %s\n" % action) + + def dump_transitions(self, state, file): + chars_leading_to_state = {} + special_to_state = {} + for (c, s) in state.items(): + if len(c) == 1: + chars = chars_leading_to_state.get(id(s), None) + if chars is None: + chars = [] + chars_leading_to_state[id(s)] = chars + chars.append(c) + elif len(c) <= 4: + special_to_state[c] = s + ranges_to_state = {} + for state in self.states: + char_list = chars_leading_to_state.get(id(state), None) + if char_list: + ranges = self.chars_to_ranges(char_list) + ranges_to_state[ranges] = state + ranges_list = ranges_to_state.keys() + ranges_list.sort() + for ranges in ranges_list: + key = self.ranges_to_string(ranges) + state = ranges_to_state[ranges] + file.write(" %s --> State %d\n" % (key, state['number'])) + for key in ('bol', 'eol', 'eof', 'else'): + state = special_to_state.get(key, None) + if state: + file.write(" %s --> State %d\n" % (key, state['number'])) + + def chars_to_ranges(self, char_list): + char_list.sort() + i = 
0 + n = len(char_list) + result = [] + while i < n: + c1 = ord(char_list[i]) + c2 = c1 + i += 1 + while i < n and ord(char_list[i]) == c2 + 1: + i += 1 + c2 += 1 + result.append((chr(c1), chr(c2))) + return tuple(result) + + def ranges_to_string(self, range_list): + return ','.join(map(self.range_to_string, range_list)) + + def range_to_string(self, range_tuple): + (c1, c2) = range_tuple + if c1 == c2: + return repr(c1) + else: + return "%s..%s" % (repr(c1), repr(c2)) diff --git a/contrib/tools/cython/Cython/Plex/Regexps.py b/contrib/tools/cython/Cython/Plex/Regexps.py new file mode 100644 index 0000000000..41816c939a --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Regexps.py @@ -0,0 +1,576 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Regular Expressions +# +#======================================================================= + +from __future__ import absolute_import + +import types +try: + from sys import maxsize as maxint +except ImportError: + from sys import maxint + +from . import Errors + +# +# Constants +# + +BOL = 'bol' +EOL = 'eol' +EOF = 'eof' + +nl_code = ord('\n') + + +# +# Helper functions +# + +def chars_to_ranges(s): + """ + Return a list of character codes consisting of pairs + [code1a, code1b, code2a, code2b,...] which cover all + the characters in |s|. + """ + char_list = list(s) + char_list.sort() + i = 0 + n = len(char_list) + result = [] + while i < n: + code1 = ord(char_list[i]) + code2 = code1 + 1 + i += 1 + while i < n and code2 >= ord(char_list[i]): + code2 += 1 + i += 1 + result.append(code1) + result.append(code2) + return result + + +def uppercase_range(code1, code2): + """ + If the range of characters from code1 to code2-1 includes any + lower case letters, return the corresponding upper case range. + """ + code3 = max(code1, ord('a')) + code4 = min(code2, ord('z') + 1) + if code3 < code4: + d = ord('A') - ord('a') + return (code3 + d, code4 + d) + else: + return None + + +def lowercase_range(code1, code2): + """ + If the range of characters from code1 to code2-1 includes any + upper case letters, return the corresponding lower case range. + """ + code3 = max(code1, ord('A')) + code4 = min(code2, ord('Z') + 1) + if code3 < code4: + d = ord('a') - ord('A') + return (code3 + d, code4 + d) + else: + return None + + +def CodeRanges(code_list): + """ + Given a list of codes as returned by chars_to_ranges, return + an RE which will match a character in any of the ranges. + """ + re_list = [CodeRange(code_list[i], code_list[i + 1]) for i in range(0, len(code_list), 2)] + return Alt(*re_list) + + +def CodeRange(code1, code2): + """ + CodeRange(code1, code2) is an RE which matches any character + with a code |c| in the range |code1| <= |c| < |code2|. + """ + if code1 <= nl_code < code2: + return Alt(RawCodeRange(code1, nl_code), + RawNewline, + RawCodeRange(nl_code + 1, code2)) + else: + return RawCodeRange(code1, code2) + + +# +# Abstract classes +# + +class RE(object): + """RE is the base class for regular expression constructors. 
+ The following operators are defined on REs: + + re1 + re2 is an RE which matches |re1| followed by |re2| + re1 | re2 is an RE which matches either |re1| or |re2| + """ + + nullable = 1 # True if this RE can match 0 input symbols + match_nl = 1 # True if this RE can match a string ending with '\n' + str = None # Set to a string to override the class's __str__ result + + def build_machine(self, machine, initial_state, final_state, + match_bol, nocase): + """ + This method should add states to |machine| to implement this + RE, starting at |initial_state| and ending at |final_state|. + If |match_bol| is true, the RE must be able to match at the + beginning of a line. If nocase is true, upper and lower case + letters should be treated as equivalent. + """ + raise NotImplementedError("%s.build_machine not implemented" % + self.__class__.__name__) + + def build_opt(self, m, initial_state, c): + """ + Given a state |s| of machine |m|, return a new state + reachable from |s| on character |c| or epsilon. + """ + s = m.new_state() + initial_state.link_to(s) + initial_state.add_transition(c, s) + return s + + def __add__(self, other): + return Seq(self, other) + + def __or__(self, other): + return Alt(self, other) + + def __str__(self): + if self.str: + return self.str + else: + return self.calc_str() + + def check_re(self, num, value): + if not isinstance(value, RE): + self.wrong_type(num, value, "Plex.RE instance") + + def check_string(self, num, value): + if type(value) != type(''): + self.wrong_type(num, value, "string") + + def check_char(self, num, value): + self.check_string(num, value) + if len(value) != 1: + raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s." + "Expected a string of length 1, got: %s" % ( + num, self.__class__.__name__, repr(value))) + + def wrong_type(self, num, value, expected): + if type(value) == types.InstanceType: + got = "%s.%s instance" % ( + value.__class__.__module__, value.__class__.__name__) + else: + got = type(value).__name__ + raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s " + "(expected %s, got %s" % ( + num, self.__class__.__name__, expected, got)) + +# +# Primitive RE constructors +# ------------------------- +# +# These are the basic REs from which all others are built. +# + +## class Char(RE): +## """ +## Char(c) is an RE which matches the character |c|. +## """ + +## nullable = 0 + +## def __init__(self, char): +## self.char = char +## self.match_nl = char == '\n' + +## def build_machine(self, m, initial_state, final_state, match_bol, nocase): +## c = self.char +## if match_bol and c != BOL: +## s1 = self.build_opt(m, initial_state, BOL) +## else: +## s1 = initial_state +## if c == '\n' or c == EOF: +## s1 = self.build_opt(m, s1, EOL) +## if len(c) == 1: +## code = ord(self.char) +## s1.add_transition((code, code+1), final_state) +## if nocase and is_letter_code(code): +## code2 = other_case_code(code) +## s1.add_transition((code2, code2+1), final_state) +## else: +## s1.add_transition(c, final_state) + +## def calc_str(self): +## return "Char(%s)" % repr(self.char) + + +def Char(c): + """ + Char(c) is an RE which matches the character |c|. + """ + if len(c) == 1: + result = CodeRange(ord(c), ord(c) + 1) + else: + result = SpecialSymbol(c) + result.str = "Char(%s)" % repr(c) + return result + + +class RawCodeRange(RE): + """ + RawCodeRange(code1, code2) is a low-level RE which matches any character + with a code |c| in the range |code1| <= |c| < |code2|, where the range + does not include newline. 
For internal use only. + """ + nullable = 0 + match_nl = 0 + range = None # (code, code) + uppercase_range = None # (code, code) or None + lowercase_range = None # (code, code) or None + + def __init__(self, code1, code2): + self.range = (code1, code2) + self.uppercase_range = uppercase_range(code1, code2) + self.lowercase_range = lowercase_range(code1, code2) + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + if match_bol: + initial_state = self.build_opt(m, initial_state, BOL) + initial_state.add_transition(self.range, final_state) + if nocase: + if self.uppercase_range: + initial_state.add_transition(self.uppercase_range, final_state) + if self.lowercase_range: + initial_state.add_transition(self.lowercase_range, final_state) + + def calc_str(self): + return "CodeRange(%d,%d)" % (self.code1, self.code2) + + +class _RawNewline(RE): + """ + RawNewline is a low-level RE which matches a newline character. + For internal use only. + """ + nullable = 0 + match_nl = 1 + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + if match_bol: + initial_state = self.build_opt(m, initial_state, BOL) + s = self.build_opt(m, initial_state, EOL) + s.add_transition((nl_code, nl_code + 1), final_state) + + +RawNewline = _RawNewline() + + +class SpecialSymbol(RE): + """ + SpecialSymbol(sym) is an RE which matches the special input + symbol |sym|, which is one of BOL, EOL or EOF. + """ + nullable = 0 + match_nl = 0 + sym = None + + def __init__(self, sym): + self.sym = sym + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + # Sequences 'bol bol' and 'bol eof' are impossible, so only need + # to allow for bol if sym is eol + if match_bol and self.sym == EOL: + initial_state = self.build_opt(m, initial_state, BOL) + initial_state.add_transition(self.sym, final_state) + + +class Seq(RE): + """Seq(re1, re2, re3...) is an RE which matches |re1| followed by + |re2| followed by |re3|...""" + + def __init__(self, *re_list): + nullable = 1 + for i, re in enumerate(re_list): + self.check_re(i, re) + nullable = nullable and re.nullable + self.re_list = re_list + self.nullable = nullable + i = len(re_list) + match_nl = 0 + while i: + i -= 1 + re = re_list[i] + if re.match_nl: + match_nl = 1 + break + if not re.nullable: + break + self.match_nl = match_nl + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + re_list = self.re_list + if len(re_list) == 0: + initial_state.link_to(final_state) + else: + s1 = initial_state + n = len(re_list) + for i, re in enumerate(re_list): + if i < n - 1: + s2 = m.new_state() + else: + s2 = final_state + re.build_machine(m, s1, s2, match_bol, nocase) + s1 = s2 + match_bol = re.match_nl or (match_bol and re.nullable) + + def calc_str(self): + return "Seq(%s)" % ','.join(map(str, self.re_list)) + + +class Alt(RE): + """Alt(re1, re2, re3...) 
is an RE which matches either |re1| or + |re2| or |re3|...""" + + def __init__(self, *re_list): + self.re_list = re_list + nullable = 0 + match_nl = 0 + nullable_res = [] + non_nullable_res = [] + i = 1 + for re in re_list: + self.check_re(i, re) + if re.nullable: + nullable_res.append(re) + nullable = 1 + else: + non_nullable_res.append(re) + if re.match_nl: + match_nl = 1 + i += 1 + self.nullable_res = nullable_res + self.non_nullable_res = non_nullable_res + self.nullable = nullable + self.match_nl = match_nl + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + for re in self.nullable_res: + re.build_machine(m, initial_state, final_state, match_bol, nocase) + if self.non_nullable_res: + if match_bol: + initial_state = self.build_opt(m, initial_state, BOL) + for re in self.non_nullable_res: + re.build_machine(m, initial_state, final_state, 0, nocase) + + def calc_str(self): + return "Alt(%s)" % ','.join(map(str, self.re_list)) + + +class Rep1(RE): + """Rep1(re) is an RE which matches one or more repetitions of |re|.""" + + def __init__(self, re): + self.check_re(1, re) + self.re = re + self.nullable = re.nullable + self.match_nl = re.match_nl + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + s1 = m.new_state() + s2 = m.new_state() + initial_state.link_to(s1) + self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase) + s2.link_to(s1) + s2.link_to(final_state) + + def calc_str(self): + return "Rep1(%s)" % self.re + + +class SwitchCase(RE): + """ + SwitchCase(re, nocase) is an RE which matches the same strings as RE, + but treating upper and lower case letters according to |nocase|. If + |nocase| is true, case is ignored, otherwise it is not. + """ + re = None + nocase = None + + def __init__(self, re, nocase): + self.re = re + self.nocase = nocase + self.nullable = re.nullable + self.match_nl = re.match_nl + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + self.re.build_machine(m, initial_state, final_state, match_bol, + self.nocase) + + def calc_str(self): + if self.nocase: + name = "NoCase" + else: + name = "Case" + return "%s(%s)" % (name, self.re) + +# +# Composite RE constructors +# ------------------------- +# +# These REs are defined in terms of the primitive REs. +# + +Empty = Seq() +Empty.__doc__ = \ + """ + Empty is an RE which matches the empty string. + """ +Empty.str = "Empty" + + +def Str1(s): + """ + Str1(s) is an RE which matches the literal string |s|. + """ + result = Seq(*tuple(map(Char, s))) + result.str = "Str(%s)" % repr(s) + return result + + +def Str(*strs): + """ + Str(s) is an RE which matches the literal string |s|. + Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|... + """ + if len(strs) == 1: + return Str1(strs[0]) + else: + result = Alt(*tuple(map(Str1, strs))) + result.str = "Str(%s)" % ','.join(map(repr, strs)) + return result + + +def Any(s): + """ + Any(s) is an RE which matches any character in the string |s|. + """ + #result = apply(Alt, tuple(map(Char, s))) + result = CodeRanges(chars_to_ranges(s)) + result.str = "Any(%s)" % repr(s) + return result + + +def AnyBut(s): + """ + AnyBut(s) is an RE which matches any character (including + newline) which is not in the string |s|. 
+ """ + ranges = chars_to_ranges(s) + ranges.insert(0, -maxint) + ranges.append(maxint) + result = CodeRanges(ranges) + result.str = "AnyBut(%s)" % repr(s) + return result + + +AnyChar = AnyBut("") +AnyChar.__doc__ = \ + """ + AnyChar is an RE which matches any single character (including a newline). + """ +AnyChar.str = "AnyChar" + + +def Range(s1, s2=None): + """ + Range(c1, c2) is an RE which matches any single character in the range + |c1| to |c2| inclusive. + Range(s) where |s| is a string of even length is an RE which matches + any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,... + """ + if s2: + result = CodeRange(ord(s1), ord(s2) + 1) + result.str = "Range(%s,%s)" % (s1, s2) + else: + ranges = [] + for i in range(0, len(s1), 2): + ranges.append(CodeRange(ord(s1[i]), ord(s1[i + 1]) + 1)) + result = Alt(*ranges) + result.str = "Range(%s)" % repr(s1) + return result + + +def Opt(re): + """ + Opt(re) is an RE which matches either |re| or the empty string. + """ + result = Alt(re, Empty) + result.str = "Opt(%s)" % re + return result + + +def Rep(re): + """ + Rep(re) is an RE which matches zero or more repetitions of |re|. + """ + result = Opt(Rep1(re)) + result.str = "Rep(%s)" % re + return result + + +def NoCase(re): + """ + NoCase(re) is an RE which matches the same strings as RE, but treating + upper and lower case letters as equivalent. + """ + return SwitchCase(re, nocase=1) + + +def Case(re): + """ + Case(re) is an RE which matches the same strings as RE, but treating + upper and lower case letters as distinct, i.e. it cancels the effect + of any enclosing NoCase(). + """ + return SwitchCase(re, nocase=0) + +# +# RE Constants +# + +Bol = Char(BOL) +Bol.__doc__ = \ + """ + Bol is an RE which matches the beginning of a line. + """ +Bol.str = "Bol" + +Eol = Char(EOL) +Eol.__doc__ = \ + """ + Eol is an RE which matches the end of a line. + """ +Eol.str = "Eol" + +Eof = Char(EOF) +Eof.__doc__ = \ + """ + Eof is an RE which matches the end of the file. + """ +Eof.str = "Eof" + diff --git a/contrib/tools/cython/Cython/Plex/Scanners.pxd b/contrib/tools/cython/Cython/Plex/Scanners.pxd new file mode 100644 index 0000000000..6e75f55e61 --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Scanners.pxd @@ -0,0 +1,50 @@ +from __future__ import absolute_import + +import cython + +from Cython.Plex.Actions cimport Action + +cdef class Scanner: + + cdef public lexicon + cdef public stream + cdef public name + cdef public unicode buffer + cdef public Py_ssize_t buf_start_pos + cdef public Py_ssize_t next_pos + cdef public Py_ssize_t cur_pos + cdef public Py_ssize_t cur_line + cdef public Py_ssize_t cur_line_start + cdef public Py_ssize_t start_pos + cdef public Py_ssize_t start_line + cdef public Py_ssize_t start_col + cdef public text + cdef public initial_state # int? 
+ cdef public state_name + cdef public list queue + cdef public bint trace + cdef public cur_char + cdef public long input_state + + cdef public level + + @cython.final + @cython.locals(input_state=long) + cdef next_char(self) + @cython.locals(action=Action) + cpdef tuple read(self) + @cython.final + cdef tuple scan_a_token(self) + ##cdef tuple position(self) # used frequently by Parsing.py + + @cython.final + @cython.locals(cur_pos=Py_ssize_t, cur_line=Py_ssize_t, cur_line_start=Py_ssize_t, + input_state=long, next_pos=Py_ssize_t, state=dict, + buf_start_pos=Py_ssize_t, buf_len=Py_ssize_t, buf_index=Py_ssize_t, + trace=bint, discard=Py_ssize_t, data=unicode, buffer=unicode) + cdef run_machine_inlined(self) + + @cython.final + cdef begin(self, state) + @cython.final + cdef produce(self, value, text = *) diff --git a/contrib/tools/cython/Cython/Plex/Scanners.py b/contrib/tools/cython/Cython/Plex/Scanners.py new file mode 100644 index 0000000000..88f7e2da3b --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Scanners.py @@ -0,0 +1,338 @@ +# cython: auto_pickle=False +#======================================================================= +# +# Python Lexical Analyser +# +# +# Scanning an input stream +# +#======================================================================= + +from __future__ import absolute_import + +import cython + +cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object) + +from . import Errors +from .Regexps import BOL, EOL, EOF + +NOT_FOUND = object() + + +class Scanner(object): + """ + A Scanner is used to read tokens from a stream of characters + using the token set specified by a Plex.Lexicon. + + Constructor: + + Scanner(lexicon, stream, name = '') + + See the docstring of the __init__ method for details. + + Methods: + + See the docstrings of the individual methods for more + information. + + read() --> (value, text) + Reads the next lexical token from the stream. + + position() --> (name, line, col) + Returns the position of the last token read using the + read() method. + + begin(state_name) + Causes scanner to change state. + + produce(value [, text]) + Causes return of a token value to the caller of the + Scanner. + + """ + + # lexicon = None # Lexicon + # stream = None # file-like object + # name = '' + # buffer = '' + # buf_start_pos = 0 # position in input of start of buffer + # next_pos = 0 # position in input of next char to read + # cur_pos = 0 # position in input of current char + # cur_line = 1 # line number of current char + # cur_line_start = 0 # position in input of start of current line + # start_pos = 0 # position in input of start of token + # start_line = 0 # line number of start of token + # start_col = 0 # position in line of start of token + # text = None # text of last token read + # initial_state = None # Node + # state_name = '' # Name of initial state + # queue = None # list of tokens to be returned + # trace = 0 + + def __init__(self, lexicon, stream, name='', initial_pos=None): + """ + Scanner(lexicon, stream, name = '') + + |lexicon| is a Plex.Lexicon instance specifying the lexical tokens + to be recognised. + + |stream| can be a file object or anything which implements a + compatible read() method. + + |name| is optional, and may be the name of the file being + scanned or any other identifying string. 
+ """ + self.trace = 0 + + self.buffer = u'' + self.buf_start_pos = 0 + self.next_pos = 0 + self.cur_pos = 0 + self.cur_line = 1 + self.start_pos = 0 + self.start_line = 0 + self.start_col = 0 + self.text = None + self.state_name = None + + self.lexicon = lexicon + self.stream = stream + self.name = name + self.queue = [] + self.initial_state = None + self.begin('') + self.next_pos = 0 + self.cur_pos = 0 + self.cur_line_start = 0 + self.cur_char = BOL + self.input_state = 1 + if initial_pos is not None: + self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2] + + def read(self): + """ + Read the next lexical token from the stream and return a + tuple (value, text), where |value| is the value associated with + the token as specified by the Lexicon, and |text| is the actual + string read from the stream. Returns (None, '') on end of file. + """ + queue = self.queue + while not queue: + self.text, action = self.scan_a_token() + if action is None: + self.produce(None) + self.eof() + else: + value = action.perform(self, self.text) + if value is not None: + self.produce(value) + result = queue[0] + del queue[0] + return result + + def scan_a_token(self): + """ + Read the next input sequence recognised by the machine + and return (text, action). Returns ('', None) on end of + file. + """ + self.start_pos = self.cur_pos + self.start_line = self.cur_line + self.start_col = self.cur_pos - self.cur_line_start + action = self.run_machine_inlined() + if action is not None: + if self.trace: + print("Scanner: read: Performing %s %d:%d" % ( + action, self.start_pos, self.cur_pos)) + text = self.buffer[ + self.start_pos - self.buf_start_pos: + self.cur_pos - self.buf_start_pos] + return (text, action) + else: + if self.cur_pos == self.start_pos: + if self.cur_char is EOL: + self.next_char() + if self.cur_char is None or self.cur_char is EOF: + return (u'', None) + raise Errors.UnrecognizedInput(self, self.state_name) + + def run_machine_inlined(self): + """ + Inlined version of run_machine for speed. 
+ """ + state = self.initial_state + cur_pos = self.cur_pos + cur_line = self.cur_line + cur_line_start = self.cur_line_start + cur_char = self.cur_char + input_state = self.input_state + next_pos = self.next_pos + buffer = self.buffer + buf_start_pos = self.buf_start_pos + buf_len = len(buffer) + b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \ + None, 0, 0, 0, u'', 0, 0 + trace = self.trace + while 1: + if trace: #TRACE# + print("State %d, %d/%d:%s -->" % ( #TRACE# + state['number'], input_state, cur_pos, repr(cur_char))) #TRACE# + # Begin inlined self.save_for_backup() + #action = state.action #@slow + action = state['action'] #@fast + if action is not None: + b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \ + action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos + # End inlined self.save_for_backup() + c = cur_char + #new_state = state.new_state(c) #@slow + new_state = state.get(c, NOT_FOUND) #@fast + if new_state is NOT_FOUND: #@fast + new_state = c and state.get('else') #@fast + if new_state: + if trace: #TRACE# + print("State %d" % new_state['number']) #TRACE# + state = new_state + # Begin inlined: self.next_char() + if input_state == 1: + cur_pos = next_pos + # Begin inlined: c = self.read_char() + buf_index = next_pos - buf_start_pos + if buf_index < buf_len: + c = buffer[buf_index] + next_pos += 1 + else: + discard = self.start_pos - buf_start_pos + data = self.stream.read(0x1000) + buffer = self.buffer[discard:] + data + self.buffer = buffer + buf_start_pos += discard + self.buf_start_pos = buf_start_pos + buf_len = len(buffer) + buf_index -= discard + if data: + c = buffer[buf_index] + next_pos += 1 + else: + c = u'' + # End inlined: c = self.read_char() + if c == u'\n': + cur_char = EOL + input_state = 2 + elif not c: + cur_char = EOL + input_state = 4 + else: + cur_char = c + elif input_state == 2: + cur_char = u'\n' + input_state = 3 + elif input_state == 3: + cur_line += 1 + cur_line_start = cur_pos = next_pos + cur_char = BOL + input_state = 1 + elif input_state == 4: + cur_char = EOF + input_state = 5 + else: # input_state = 5 + cur_char = u'' + # End inlined self.next_char() + else: # not new_state + if trace: #TRACE# + print("blocked") #TRACE# + # Begin inlined: action = self.back_up() + if b_action is not None: + (action, cur_pos, cur_line, cur_line_start, + cur_char, input_state, next_pos) = \ + (b_action, b_cur_pos, b_cur_line, b_cur_line_start, + b_cur_char, b_input_state, b_next_pos) + else: + action = None + break # while 1 + # End inlined: action = self.back_up() + self.cur_pos = cur_pos + self.cur_line = cur_line + self.cur_line_start = cur_line_start + self.cur_char = cur_char + self.input_state = input_state + self.next_pos = next_pos + if trace: #TRACE# + if action is not None: #TRACE# + print("Doing %s" % action) #TRACE# + return action + + def next_char(self): + input_state = self.input_state + if self.trace: + print("Scanner: next: %s [%d] %d" % (" " * 20, input_state, self.cur_pos)) + if input_state == 1: + self.cur_pos = self.next_pos + c = self.read_char() + if c == u'\n': + self.cur_char = EOL + self.input_state = 2 + elif not c: + self.cur_char = EOL + self.input_state = 4 + else: + self.cur_char = c + elif input_state == 2: + self.cur_char = u'\n' + self.input_state = 3 + elif input_state == 3: + self.cur_line += 1 + self.cur_line_start = self.cur_pos = self.next_pos + self.cur_char = BOL + self.input_state = 1 + elif input_state == 4: + 
self.cur_char = EOF + self.input_state = 5 + else: # input_state = 5 + self.cur_char = u'' + if self.trace: + print("--> [%d] %d %r" % (input_state, self.cur_pos, self.cur_char)) + + def position(self): + """ + Return a tuple (name, line, col) representing the location of + the last token read using the read() method. |name| is the + name that was provided to the Scanner constructor; |line| + is the line number in the stream (1-based); |col| is the + position within the line of the first character of the token + (0-based). + """ + return (self.name, self.start_line, self.start_col) + + def get_position(self): + """Python accessible wrapper around position(), only for error reporting. + """ + return self.position() + + def begin(self, state_name): + """Set the current state of the scanner to the named state.""" + self.initial_state = ( + self.lexicon.get_initial_state(state_name)) + self.state_name = state_name + + def produce(self, value, text=None): + """ + Called from an action procedure, causes |value| to be returned + as the token value from read(). If |text| is supplied, it is + returned in place of the scanned text. + + produce() can be called more than once during a single call to an action + procedure, in which case the tokens are queued up and returned one + at a time by subsequent calls to read(), until the queue is empty, + whereupon scanning resumes. + """ + if text is None: + text = self.text + self.queue.append((value, text)) + + def eof(self): + """ + Override this method if you want something to be done at + end of file. + """ diff --git a/contrib/tools/cython/Cython/Plex/Timing.py b/contrib/tools/cython/Cython/Plex/Timing.py new file mode 100644 index 0000000000..5c3692693b --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Timing.py @@ -0,0 +1,23 @@ +# +# Get time in platform-dependent way +# + +from __future__ import absolute_import + +import os +from sys import platform, exit, stderr + +if platform == 'mac': + import MacOS + def time(): + return MacOS.GetTicks() / 60.0 + timekind = "real" +elif hasattr(os, 'times'): + def time(): + t = os.times() + return t[0] + t[1] + timekind = "cpu" +else: + stderr.write( + "Don't know how to get time on platform %s\n" % repr(platform)) + exit(1) diff --git a/contrib/tools/cython/Cython/Plex/Traditional.py b/contrib/tools/cython/Cython/Plex/Traditional.py new file mode 100644 index 0000000000..ec7252daed --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Traditional.py @@ -0,0 +1,158 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Traditional Regular Expression Syntax +# +#======================================================================= + +from __future__ import absolute_import + +from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char +from .Errors import PlexError + + +class RegexpSyntaxError(PlexError): + pass + + +def re(s): + """ + Convert traditional string representation of regular expression |s| + into Plex representation. 
+ """ + return REParser(s).parse_re() + + +class REParser(object): + def __init__(self, s): + self.s = s + self.i = -1 + self.end = 0 + self.next() + + def parse_re(self): + re = self.parse_alt() + if not self.end: + self.error("Unexpected %s" % repr(self.c)) + return re + + def parse_alt(self): + """Parse a set of alternative regexps.""" + re = self.parse_seq() + if self.c == '|': + re_list = [re] + while self.c == '|': + self.next() + re_list.append(self.parse_seq()) + re = Alt(*re_list) + return re + + def parse_seq(self): + """Parse a sequence of regexps.""" + re_list = [] + while not self.end and not self.c in "|)": + re_list.append(self.parse_mod()) + return Seq(*re_list) + + def parse_mod(self): + """Parse a primitive regexp followed by *, +, ? modifiers.""" + re = self.parse_prim() + while not self.end and self.c in "*+?": + if self.c == '*': + re = Rep(re) + elif self.c == '+': + re = Rep1(re) + else: # self.c == '?' + re = Opt(re) + self.next() + return re + + def parse_prim(self): + """Parse a primitive regexp.""" + c = self.get() + if c == '.': + re = AnyBut("\n") + elif c == '^': + re = Bol + elif c == '$': + re = Eol + elif c == '(': + re = self.parse_alt() + self.expect(')') + elif c == '[': + re = self.parse_charset() + self.expect(']') + else: + if c == '\\': + c = self.get() + re = Char(c) + return re + + def parse_charset(self): + """Parse a charset. Does not include the surrounding [].""" + char_list = [] + invert = 0 + if self.c == '^': + invert = 1 + self.next() + if self.c == ']': + char_list.append(']') + self.next() + while not self.end and self.c != ']': + c1 = self.get() + if self.c == '-' and self.lookahead(1) != ']': + self.next() + c2 = self.get() + for a in range(ord(c1), ord(c2) + 1): + char_list.append(chr(a)) + else: + char_list.append(c1) + chars = ''.join(char_list) + if invert: + return AnyBut(chars) + else: + return Any(chars) + + def next(self): + """Advance to the next char.""" + s = self.s + i = self.i = self.i + 1 + if i < len(s): + self.c = s[i] + else: + self.c = '' + self.end = 1 + + def get(self): + if self.end: + self.error("Premature end of string") + c = self.c + self.next() + return c + + def lookahead(self, n): + """Look ahead n chars.""" + j = self.i + n + if j < len(self.s): + return self.s[j] + else: + return '' + + def expect(self, c): + """ + Expect to find character |c| at current position. + Raises an exception otherwise. + """ + if self.c == c: + self.next() + else: + self.error("Missing %s" % repr(c)) + + def error(self, mess): + """Raise exception to signal syntax error in regexp.""" + raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % ( + repr(self.s), self.i, mess)) + + + diff --git a/contrib/tools/cython/Cython/Plex/Transitions.py b/contrib/tools/cython/Cython/Plex/Transitions.py new file mode 100644 index 0000000000..3833817946 --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/Transitions.py @@ -0,0 +1,251 @@ +# +# Plex - Transition Maps +# +# This version represents state sets directly as dicts for speed. +# + +from __future__ import absolute_import + +try: + from sys import maxsize as maxint +except ImportError: + from sys import maxint + + +class TransitionMap(object): + """ + A TransitionMap maps an input event to a set of states. + An input event is one of: a range of character codes, + the empty string (representing an epsilon move), or one + of the special symbols BOL, EOL, EOF. 
+ + For characters, this implementation compactly represents + the map by means of a list: + + [code_0, states_0, code_1, states_1, code_2, states_2, + ..., code_n-1, states_n-1, code_n] + + where |code_i| is a character code, and |states_i| is a + set of states corresponding to characters with codes |c| + in the range |code_i| <= |c| <= |code_i+1|. + + The following invariants hold: + n >= 1 + code_0 == -maxint + code_n == maxint + code_i < code_i+1 for i in 0..n-1 + states_0 == states_n-1 + + Mappings for the special events '', BOL, EOL, EOF are + kept separately in a dictionary. + """ + + map = None # The list of codes and states + special = None # Mapping for special events + + def __init__(self, map=None, special=None): + if not map: + map = [-maxint, {}, maxint] + if not special: + special = {} + self.map = map + self.special = special + #self.check() ### + + def add(self, event, new_state, + TupleType=tuple): + """ + Add transition to |new_state| on |event|. + """ + if type(event) is TupleType: + code0, code1 = event + i = self.split(code0) + j = self.split(code1) + map = self.map + while i < j: + map[i + 1][new_state] = 1 + i += 2 + else: + self.get_special(event)[new_state] = 1 + + def add_set(self, event, new_set, + TupleType=tuple): + """ + Add transitions to the states in |new_set| on |event|. + """ + if type(event) is TupleType: + code0, code1 = event + i = self.split(code0) + j = self.split(code1) + map = self.map + while i < j: + map[i + 1].update(new_set) + i += 2 + else: + self.get_special(event).update(new_set) + + def get_epsilon(self, + none=None): + """ + Return the mapping for epsilon, or None. + """ + return self.special.get('', none) + + def iteritems(self, + len=len): + """ + Return the mapping as an iterable of ((code1, code2), state_set) and + (special_event, state_set) pairs. + """ + result = [] + map = self.map + else_set = map[1] + i = 0 + n = len(map) - 1 + code0 = map[0] + while i < n: + set = map[i + 1] + code1 = map[i + 2] + if set or else_set: + result.append(((code0, code1), set)) + code0 = code1 + i += 2 + for event, set in self.special.items(): + if set: + result.append((event, set)) + return iter(result) + + items = iteritems + + # ------------------- Private methods -------------------- + + def split(self, code, + len=len, maxint=maxint): + """ + Search the list for the position of the split point for |code|, + inserting a new split point if necessary. Returns index |i| such + that |code| == |map[i]|. + """ + # We use a funky variation on binary search. + map = self.map + hi = len(map) - 1 + # Special case: code == map[-1] + if code == maxint: + return hi + # General case + lo = 0 + # loop invariant: map[lo] <= code < map[hi] and hi - lo >= 2 + while hi - lo >= 4: + # Find midpoint truncated to even index + mid = ((lo + hi) // 2) & ~1 + if code < map[mid]: + hi = mid + else: + lo = mid + # map[lo] <= code < map[hi] and hi - lo == 2 + if map[lo] == code: + return lo + else: + map[hi:hi] = [code, map[hi - 1].copy()] + #self.check() ### + return hi + + def get_special(self, event): + """ + Get state set for special event, adding a new entry if necessary. 
+ """ + special = self.special + set = special.get(event, None) + if not set: + set = {} + special[event] = set + return set + + # --------------------- Conversion methods ----------------------- + + def __str__(self): + map_strs = [] + map = self.map + n = len(map) + i = 0 + while i < n: + code = map[i] + if code == -maxint: + code_str = "-inf" + elif code == maxint: + code_str = "inf" + else: + code_str = str(code) + map_strs.append(code_str) + i += 1 + if i < n: + map_strs.append(state_set_str(map[i])) + i += 1 + special_strs = {} + for event, set in self.special.items(): + special_strs[event] = state_set_str(set) + return "[%s]+%s" % ( + ','.join(map_strs), + special_strs + ) + + # --------------------- Debugging methods ----------------------- + + def check(self): + """Check data structure integrity.""" + if not self.map[-3] < self.map[-1]: + print(self) + assert 0 + + def dump(self, file): + map = self.map + i = 0 + n = len(map) - 1 + while i < n: + self.dump_range(map[i], map[i + 2], map[i + 1], file) + i += 2 + for event, set in self.special.items(): + if set: + if not event: + event = 'empty' + self.dump_trans(event, set, file) + + def dump_range(self, code0, code1, set, file): + if set: + if code0 == -maxint: + if code1 == maxint: + k = "any" + else: + k = "< %s" % self.dump_char(code1) + elif code1 == maxint: + k = "> %s" % self.dump_char(code0 - 1) + elif code0 == code1 - 1: + k = self.dump_char(code0) + else: + k = "%s..%s" % (self.dump_char(code0), + self.dump_char(code1 - 1)) + self.dump_trans(k, set, file) + + def dump_char(self, code): + if 0 <= code <= 255: + return repr(chr(code)) + else: + return "chr(%d)" % code + + def dump_trans(self, key, set, file): + file.write(" %s --> %s\n" % (key, self.dump_set(set))) + + def dump_set(self, set): + return state_set_str(set) + + +# +# State set manipulation functions +# + +#def merge_state_sets(set1, set2): +# for state in set2.keys(): +# set1[state] = 1 + +def state_set_str(set): + return "[%s]" % ','.join(["S%d" % state.number for state in set]) diff --git a/contrib/tools/cython/Cython/Plex/__init__.py b/contrib/tools/cython/Cython/Plex/__init__.py new file mode 100644 index 0000000000..81a066f782 --- /dev/null +++ b/contrib/tools/cython/Cython/Plex/__init__.py @@ -0,0 +1,39 @@ +#======================================================================= +# +# Python Lexical Analyser +# +#======================================================================= + +""" +The Plex module provides lexical analysers with similar capabilities +to GNU Flex. The following classes and functions are exported; +see the attached docstrings for more information. + + Scanner For scanning a character stream under the + direction of a Lexicon. + + Lexicon For constructing a lexical definition + to be used by a Scanner. + + Str, Any, AnyBut, AnyChar, Seq, Alt, Opt, Rep, Rep1, + Bol, Eol, Eof, Empty + + Regular expression constructors, for building pattern + definitions for a Lexicon. + + State For defining scanner states when creating a + Lexicon. + + TEXT, IGNORE, Begin + + Actions for associating with patterns when + creating a Lexicon. +""" + +from __future__ import absolute_import + +from .Actions import TEXT, IGNORE, Begin +from .Lexicons import Lexicon, State +from .Regexps import RE, Seq, Alt, Rep1, Empty, Str, Any, AnyBut, AnyChar, Range +from .Regexps import Opt, Rep, Bol, Eol, Eof, Case, NoCase +from .Scanners import Scanner |