1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2008-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 05/11/2008 Andy Heninger Port from Java
**********************************************************************
*/
#include <utility>
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
#include "unicode/brkiter.h"
#include "unicode/localpointer.h"
#include "unicode/uchar.h"
#include "unicode/unifilt.h"
#include "unicode/uniset.h"
#include "brktrans.h"
#include "cmemory.h"
#include "mutex.h"
#include "uprops.h"
#include "uinvchar.h"
#include "util.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
// Default insertion text: the single code unit U+0020 (SPACE). The
// constructor uses it to initialize fInsertion.
static const UChar SPACE = 32; // ' '
/**
 * Constructs a transliterator with the ID "Any-BreakInternal".
 * The insertion string defaults to a single space (U+0020); the cached
 * break iterator and cached boundary vector start out null.
 *
 * @param adoptedFilter  filter handed on to the Transliterator base class.
 */
BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
      cachedBI(nullptr),
      cachedBoundaries(nullptr),
      fInsertion(SPACE)
{
}
/**
 * Destructor.  There is no explicit cleanup code here; members are
 * destroyed by their own destructors.
 */
BreakTransliterator::~BreakTransliterator() = default;
/**
 * Copy constructor.
 * Copies the base-class state and the insertion string of the source
 * object.  The cached break iterator and boundary vector are NOT copied:
 * the new object starts with both caches null.
 */
BreakTransliterator::BreakTransliterator(const BreakTransliterator& o)
    : Transliterator(o),
      cachedBI(nullptr),
      cachedBoundaries(nullptr),
      fInsertion(o.fInsertion)
{
}
/**
 * Transliterator API.
 * Returns a heap-allocated copy of this object made with the copy
 * constructor.
 */
BreakTransliterator* BreakTransliterator::clone() const {
    BreakTransliterator* duplicate = new BreakTransliterator(*this);
    return duplicate;
}
/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Walks the word-break boundaries of the text (English word break
 * iterator), starting from the boundary that precedes offsets.start and
 * stopping before offsets.limit.  At every boundary that has a letter or
 * mark (general categories L* / M*) on both sides, fInsertion is inserted
 * into the text.  Afterwards offsets.contextLimit, offsets.limit and
 * offsets.start are adjusted for the inserted text.
 *
 * The break iterator and the boundary vector are borrowed from the
 * per-object cache (under a Mutex) for the duration of the call and handed
 * back at the end if the cache slot is still empty.
 *
 * @param text           the text to modify in place.
 * @param offsets        the transliteration position; updated on return.
 * @param isIncremental  if true, offsets.start is set just past the last
 *                       inserted boundary; otherwise it is set to offsets.limit.
 */
void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                              UBool isIncremental ) const {
    UErrorCode errorCode = U_ZERO_ERROR;
    LocalPointer<BreakIterator> wordIter;
    LocalPointer<UVector32> breakStack;
    BreakTransliterator *self = const_cast<BreakTransliterator *>(this);

    // Take the cached helper objects, leaving the cache slots empty.
    {
        Mutex lock;
        breakStack = std::move(self->cachedBoundaries);
        wordIter = std::move(self->cachedBI);
    }

    // Create whatever the cache could not supply.
    if (wordIter.isNull()) {
        wordIter.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), errorCode));
    }
    if (breakStack.isNull()) {
        breakStack.adoptInstead(new UVector32(errorCode));
    }
    if (wordIter.isNull() || breakStack.isNull() || U_FAILURE(errorCode)) {
        return;
    }

    breakStack->removeAllElements();
    UnicodeString plainText = replaceableAsString(text);
    wordIter->setText(plainText);
    wordIter->preceding(offsets.start);

    // True when the code point is a letter or a mark.
    auto isLetterOrMark = [](UChar32 c) -> bool {
        int category = u_charType(c);
        return (U_MASK(category) & (U_GC_L_MASK | U_GC_M_MASK)) != 0;
    };

    // Collect the qualifying boundaries first and insert afterwards; the
    // filter normally keeps the number of collected boundaries small.
    for (int32_t pos = wordIter->next();
         pos != UBRK_DONE && pos < offsets.limit;
         pos = wordIter->next()) {
        // HACK: only break between two letter/mark characters.  Position 0
        // has no preceding character and is never a candidate.
        if (pos != 0 &&
            isLetterOrMark(plainText.char32At(pos - 1)) &&
            isLetterOrMark(plainText.char32At(pos))) {
            breakStack->addElement(pos, errorCode);
        }
    }

    int delta = 0;
    int lastBoundary = 0;
    if (breakStack->size() != 0) {
        delta = breakStack->size() * fInsertion.length();
        lastBoundary = breakStack->lastElementi();

        // Insert from the highest position downwards so that the positions
        // still on the stack remain valid.
        while (breakStack->size() > 0) {
            const int32_t insertAt = breakStack->popi();
            text.handleReplaceBetween(insertAt, insertAt, fInsertion);
        }
    }

    // Now fix up the return values.
    offsets.contextLimit += delta;
    offsets.limit += delta;
    if (isIncremental) {
        offsets.start = lastBoundary + delta;
    } else {
        offsets.start = offsets.limit;
    }

    // Return break iterator & boundaries vector to the cache, unless a
    // slot has been refilled in the meantime.
    {
        Mutex lock;
        if (self->cachedBI.isNull()) {
            self->cachedBI = std::move(wordIter);
        }
        if (self->cachedBoundaries.isNull()) {
            self->cachedBoundaries = std::move(breakStack);
        }
    }

    // TODO: do something with U_FAILURE(errorCode);
    //       (need to look at transliterators overall, not just here.)
}
//
//  getInsertion()    Returns a reference to the string that is inserted
//                    at each selected break position.
//
const UnicodeString &BreakTransliterator::getInsertion() const {
    return this->fInsertion;
}
//
// setInsertion()
//
void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
this->fInsertion = insertion;
}
//
//  replaceableAsString   Hack to let break iterators work
//                        on the replaceable text from transliterators.
//                        When the Replaceable is actually a UnicodeString
//                        it is copied directly; otherwise its full contents
//                        are extracted into a new string.
//                        In practice, the only real Replaceable type that we
//                        will be seeing is UnicodeString, so this function
//                        will normally be efficient.
//
UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
    // Common case: the text already is a UnicodeString.
    if (const UnicodeString *asString = dynamic_cast<const UnicodeString *>(&r)) {
        return *asString;
    }

    // Any other Replaceable: pull out everything from 0 to length().
    UnicodeString extracted;
    r.extractBetween(0, r.length(), extracted);
    return extracted;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
|