1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// rbbiscan.h
//
// Copyright (C) 2002-2016, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for class RBBIRuleScanner
//
#ifndef RBBISCAN_H
#define RBBISCAN_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"
#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
// looks up references to $variables within a set.
#include "rbbinode.h"
#include "rbbirpt.h"
U_NAMESPACE_BEGIN
// Forward declarations. This header refers to these classes only through
// pointers (the RBBIRuleScanner constructor argument and fRB for
// RBBIRuleBuilder, fSymbolTable for RBBISymbolTable), so their full
// definitions are not required here.
class RBBIRuleBuilder;
class RBBISymbolTable;
//--------------------------------------------------------------------------------
//
// class RBBIRuleScanner does the lowest level, character-at-a-time
// scanning of break iterator rules.
//
// The output of the scanner is parse trees for
// the rule expressions and a list of all Unicode Sets
// encountered.
//
// This header only declares the class: apart from the inline
// RBBIRuleChar constructor, no member function bodies appear here.
// Instances cannot be copied (the copy constructor and copy assignment
// operator are deleted at the end of the class).
//
//--------------------------------------------------------------------------------
class RBBIRuleScanner : public UMemory {
public:
enum {
kStackSize = 100 // The size of the state stack for
}; // rules parsing. Corresponds roughly
// to the depth of parentheses nesting
// that is allowed in the rules.
// Also used as the capacity of both fStack
// and fNodeStack below.
// One character as seen by the rule scanner: the code point plus a flag
// recording whether it was escaped. Default-constructs to fChar == 0,
// fEscaped == false.
// NOTE(review): what exactly counts as "escaped" is decided in the
// implementation file, not in this header -- confirm there.
struct RBBIRuleChar {
UChar32 fChar;
UBool fEscaped;
RBBIRuleChar() : fChar(0), fEscaped(false) {}
};
// Construct a scanner for the given rule builder.
// NOTE(review): rb is presumably stored in fRB -- confirm in the implementation.
RBBIRuleScanner(RBBIRuleBuilder *rb);
virtual ~RBBIRuleScanner();
void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
// The result is delivered through c; the
// function itself returns nothing.
// NOTE(review): this comment used to say
// "Return false if at end", which does not
// match the void return type. How end of
// input is reported through c must be
// confirmed in the implementation.
UBool push(const RBBIRuleChar &c); // Push (unget) one character.
// Only a single character may be pushed.
// NOTE(review): the meaning of the UBool
// result is not visible here -- confirm.
void parse(); // Parse the rules, generating two parse
// trees, one each for the forward and
// reverse rules,
// and a list of UnicodeSets encountered.
int32_t numRules(); // Return the number of rules that have been seen.
/**
* Return a rules string without unnecessary
* characters.
*/
static UnicodeString stripRules(const UnicodeString &rules);
private:
// NOTE(review): presumably performs the parse action identified by a
// (action codes would come from the state table in rbbirpt.h); the
// meaning of the UBool result is not visible here -- confirm.
UBool doParseActions(int32_t a);
void error(UErrorCode e); // error reporting convenience function.
// NOTE(review): presumably resolves pending operators on the node stack
// down to precedence p -- confirm in the implementation.
void fixOpStack(RBBINode::OpPrecedence p);
// Associate a set with a parse tree node. s is a string and node a parse
// tree node; setToAdopt may be omitted (it defaults to nullptr).
// NOTE(review): s is presumably the set's source text, i.e. the key used
// in fSetTable, and the name setToAdopt suggests the scanner takes
// ownership of a non-null set -- confirm both in the implementation.
void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = nullptr);
// NOTE(review): presumably the low-level character fetch underlying
// nextChar() -- confirm.
UChar32 nextCharLL();
// Debug helper; only declared when RBBI_DEBUG is defined.
#ifdef RBBI_DEBUG
void printNodeStack(const char *title);
#endif
// NOTE(review): presumably creates a node of type t and pushes it onto
// fNodeStack -- confirm.
RBBINode *pushNewNode(RBBINode::NodeType t);
// NOTE(review): presumably scans a UnicodeSet expression from the rule
// input -- confirm.
void scanSet();
RBBIRuleBuilder *fRB; // The rule builder that we are part of.
int32_t fScanIndex; // Index of current character being processed
// in the rule input string.
int32_t fNextIndex; // Index of the next character, which
// is the first character not yet scanned.
UBool fQuoteMode; // Scan is in a 'quoted region'
int32_t fLineNum; // Line number in input file.
int32_t fCharNum; // Char position within the line.
UChar32 fLastChar; // Previous char, needed to count CR-LF
// as a single line, not two.
RBBIRuleChar fC; // Current char for parse state machine
// processing.
UnicodeString fVarName; // $variableName, valid when we've just
// scanned one.
RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
// parsing. index by p[state][char-class]
uint16_t fStack[kStackSize]; // State stack, holds state pushes
int32_t fStackPtr; // and pops as specified in the state
// transition rules.
RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
// during the parse of a rule
int32_t fNodeStackPtr; // NOTE(review): presumably the index of the
// top entry of fNodeStack -- confirm.
UBool fReverseRule; // True if the rule currently being scanned
// is a reverse direction rule (if it
// starts with a '!')
UBool fLookAheadRule; // True if the rule includes a '/'
// somewhere within it.
UBool fNoChainInRule; // True if the current rule starts with a '^'.
RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
// $variable symbols.
UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to
// the sets created while parsing rules.
// The key is the string used for creating
// the set.
UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
// the scanning of RBBI rules. The
// indices for these are assigned by the
// perl script that builds the state tables.
// See rbbirpt.h.
int32_t fRuleNum; // Counts each rule as it is scanned.
int32_t fOptionStart; // Input index of start of a !!option
// keyword, while being scanned.
// Despite the g prefix, these are ordinary per-instance members.
// NOTE(review): presumably each points at the character-class set of the
// same name used by the scanner (possibly an entry of fRuleSets) --
// confirm in the implementation.
UnicodeSet *gRuleSet_rule_char;
UnicodeSet *gRuleSet_white_space;
UnicodeSet *gRuleSet_name_char;
UnicodeSet *gRuleSet_name_start_char;
RBBIRuleScanner(const RBBIRuleScanner &other) = delete; // forbid copying of this class
RBBIRuleScanner &operator=(const RBBIRuleScanner &other) = delete; // forbid copying of this class
};
U_NAMESPACE_END
#endif
|