diff options
author | orivej <orivej@yandex-team.ru> | 2022-02-10 16:44:49 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:49 +0300 |
commit | 718c552901d703c502ccbefdfc3c9028d608b947 (patch) | |
tree | 46534a98bbefcd7b1f3faa5b52c138ab27db75b7 /contrib/tools/cython/Cython/Plex/Lexicons.py | |
parent | e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (diff) | |
download | ydb-718c552901d703c502ccbefdfc3c9028d608b947.tar.gz |
Restoring authorship annotation for <orivej@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/tools/cython/Cython/Plex/Lexicons.py')
-rw-r--r-- | contrib/tools/cython/Cython/Plex/Lexicons.py | 328 |
1 file changed, 164 insertions, 164 deletions
class State(object):
    """
    This class is used as part of a Plex.Lexicon specification to
    introduce a user-defined state.

    Constructor:

       State(name, token_specifications)
    """

    # name: character string naming the state ('' is the default state)
    name = None
    # tokens: list of (pattern, action) token definitions active in this state
    tokens = None

    def __init__(self, name, tokens):
        self.name = name
        self.tokens = tokens


class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

    1) A token definition, which is a tuple:

       (pattern, action)

       The |pattern| is a regular expression built using the
       constructors defined in the Plex module.

       The |action| is the action to be performed when this pattern
       is recognised (see below).

    2) A state definition:

       State(name, tokens)

       where |name| is a character string naming the state,
       and |tokens| is a list of token definitions as
       above. The meaning and usage of states is described
       below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

    1) A function, which is called as follows:

       function(scanner, text)

       where |scanner| is the relevant Scanner instance, and |text|
       is the matched text. If the function returns anything
       other than None, that value is returned as the value of the
       token. If it returns None, scanning continues as if the IGNORE
       action were specified (see below).

    2) One of the following special actions:

       IGNORE means that the recognised characters will be treated as
              white space and ignored. Scanning will continue until
              the next non-ignored token is recognised before returning.

       TEXT   causes the scanned text itself to be returned as the
              value of the token.

    3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

    1) Using Begin(state_name) as the action of a token.

    2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
    """

    machine = None    # Machine
    tables = None     # StateTableMachine

    def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
        """Build the DFA for |specifications|.

        :param specifications: list of token tuples and/or State instances.
        :param debug: optional writable stream for NFA/DFA dumps.
        :param debug_flags: bitmask selecting which dumps to emit
            (1 = NFA, 2 = DFA; both bits set also traces the conversion).
        :param timings: optional writable stream for timing statistics.
        :raises Errors.InvalidScanner: if |specifications| is not a list.
        :raises Errors.InvalidToken: if an item is neither a tuple nor a State.
        """
        if not isinstance(specifications, list):
            raise Errors.InvalidScanner("Scanner definition is not a list")
        if timings:
            # Imported lazily so the Timing helper is only required
            # when timing output is actually requested.
            from .Timing import time

            total_time = 0.0
            time1 = time()
        nfa = Machines.Machine()
        default_initial_state = nfa.new_initial_state('')
        # Token numbers start at 1 and increase across all states; they are
        # later negated to give earlier tokens higher matching priority.
        token_number = 1
        for spec in specifications:
            if isinstance(spec, State):
                user_initial_state = nfa.new_initial_state(spec.name)
                for token in spec.tokens:
                    self.add_token_to_machine(
                        nfa, user_initial_state, token, token_number)
                    token_number += 1
            elif isinstance(spec, tuple):
                self.add_token_to_machine(
                    nfa, default_initial_state, spec, token_number)
                token_number += 1
            else:
                raise Errors.InvalidToken(
                    token_number,
                    "Expected a token definition (tuple) or State instance")
        if timings:
            time2 = time()
            total_time = total_time + (time2 - time1)
            time3 = time()
        if debug and (debug_flags & 1):
            debug.write("\n============= NFA ===========\n")
            nfa.dump(debug)
        dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug)
        if timings:
            time4 = time()
            total_time = total_time + (time4 - time3)
        if debug and (debug_flags & 2):
            debug.write("\n============= DFA ===========\n")
            dfa.dump(debug)
        if timings:
            timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
            timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
            timings.write("TOTAL            : %5.2f\n" % total_time)
        self.machine = dfa

    def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
        """Compile one token definition into |machine|.

        Builds the pattern's sub-machine from |initial_state| to a fresh
        final state and attaches the token's action to that final state.
        |token_number| is used (negated) as the action priority and in
        error messages.
        """
        try:
            (re, action_spec) = self.parse_token_definition(token_spec)
            # Disabled this -- matching empty strings can be useful
            #if re.nullable:
            #    raise Errors.InvalidToken(
            #        token_number, "Pattern can match 0 input symbols")
            if isinstance(action_spec, Actions.Action):
                action = action_spec
            else:
                # Duck-type check: a callable action becomes a Call action,
                # anything else is returned literally as the token value.
                try:
                    action_spec.__call__
                except AttributeError:
                    action = Actions.Return(action_spec)
                else:
                    action = Actions.Call(action_spec)
            final_state = machine.new_state()
            re.build_machine(machine, initial_state, final_state,
                             match_bol=1, nocase=0)
            # Negative priority: earlier-defined tokens win over later ones.
            final_state.set_action(action, priority=-token_number)
        except Errors.PlexError as e:
            # Re-raise with the token number prepended for context.
            raise e.__class__("Token number %d: %s" % (token_number, e))

    def parse_token_definition(self, token_spec):
        """Validate a token definition and return its (pattern, action) pair.

        :raises Errors.InvalidToken: if |token_spec| is not a 2-tuple whose
            first element is a Regexps.RE instance.
        """
        if not isinstance(token_spec, tuple):
            raise Errors.InvalidToken("Token definition is not a tuple")
        if len(token_spec) != 2:
            raise Errors.InvalidToken("Wrong number of items in token definition")
        pattern, action = token_spec
        if not isinstance(pattern, Regexps.RE):
            raise Errors.InvalidToken("Pattern is not an RE instance")
        return (pattern, action)

    def get_initial_state(self, name):
        """Return the DFA's initial state for the scanner state |name|."""
        return self.machine.get_initial_state(name)