summaryrefslogtreecommitdiffstats
path: root/contrib/libs/icu/common/uniset.cpp
diff options
context:
space:
mode:
authorromankoshelev <[email protected]>2023-08-09 20:07:20 +0300
committerromankoshelev <[email protected]>2023-08-09 20:59:13 +0300
commitfd82fb12fb45e71a02c628e45b12c50c0dd0d308 (patch)
treef582b79f9002ab1d083e9acda600dfb3551c47b6 /contrib/libs/icu/common/uniset.cpp
parentbf862ddf5c6178e1bb5e4fb3f7c61015deebe284 (diff)
Update ICU to 70.1
Diffstat (limited to 'contrib/libs/icu/common/uniset.cpp')
-rw-r--r--contrib/libs/icu/common/uniset.cpp245
1 files changed, 122 insertions, 123 deletions
diff --git a/contrib/libs/icu/common/uniset.cpp b/contrib/libs/icu/common/uniset.cpp
index b73d612f246..92a81a1a02d 100644
--- a/contrib/libs/icu/common/uniset.cpp
+++ b/contrib/libs/icu/common/uniset.cpp
@@ -30,24 +30,6 @@
#include "bmpset.h"
#include "unisetspan.h"
-// Define UChar constants using hex for EBCDIC compatibility
-// Used #define to reduce private static exports and memory access time.
-#define SET_OPEN ((UChar)0x005B) /*[*/
-#define SET_CLOSE ((UChar)0x005D) /*]*/
-#define HYPHEN ((UChar)0x002D) /*-*/
-#define COMPLEMENT ((UChar)0x005E) /*^*/
-#define COLON ((UChar)0x003A) /*:*/
-#define BACKSLASH ((UChar)0x005C) /*\*/
-#define INTERSECTION ((UChar)0x0026) /*&*/
-#define UPPER_U ((UChar)0x0055) /*U*/
-#define LOWER_U ((UChar)0x0075) /*u*/
-#define OPEN_BRACE ((UChar)123) /*{*/
-#define CLOSE_BRACE ((UChar)125) /*}*/
-#define UPPER_P ((UChar)0x0050) /*P*/
-#define LOWER_P ((UChar)0x0070) /*p*/
-#define UPPER_N ((UChar)78) /*N*/
-#define EQUALS ((UChar)0x003D) /*=*/
-
// HIGH_VALUE > all valid values. 110000 for codepoints
#define UNICODESET_HIGH 0x0110000
@@ -129,7 +111,7 @@ static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
}
-static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
+static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
const UnicodeString &a = *(const UnicodeString*)t1.pointer;
const UnicodeString &b = *(const UnicodeString*)t2.pointer;
return a.compare(b);
@@ -296,14 +278,14 @@ UnicodeSet *UnicodeSet::cloneAsThawed() const {
* @param o set to be compared for equality with this set.
* @return <tt>true</tt> if the specified set is equal to this set.
*/
-UBool UnicodeSet::operator==(const UnicodeSet& o) const {
- if (len != o.len) return FALSE;
+bool UnicodeSet::operator==(const UnicodeSet& o) const {
+ if (len != o.len) return false;
for (int32_t i = 0; i < len; ++i) {
- if (list[i] != o.list[i]) return FALSE;
+ if (list[i] != o.list[i]) return false;
}
- if (hasStrings() != o.hasStrings()) { return FALSE; }
- if (hasStrings() && *strings != *o.strings) return FALSE;
- return TRUE;
+ if (hasStrings() != o.hasStrings()) { return false; }
+ if (hasStrings() && *strings != *o.strings) return false;
+ return true;
}
/**
@@ -444,7 +426,6 @@ UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
* @return <tt>true</tt> if this set contains the specified string
*/
UBool UnicodeSet::contains(const UnicodeString& s) const {
- if (s.length() == 0) return FALSE;
int32_t cp = getSingleCP(s);
if (cp < 0) {
return stringsContains(s);
@@ -559,11 +540,9 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
if (hasStrings()) {
for (i=0; i<strings->size(); ++i) {
const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
- //if (s.length() == 0) {
- // // Empty strings match everything
- // return TRUE;
- //}
- // assert(s.length() != 0); // We enforce this elsewhere
+ if (s.isEmpty()) {
+ continue; // skip the empty string
+ }
UChar32 c = s.char32At(0);
if ((c & 0xFF) == v) {
return TRUE;
@@ -582,9 +561,6 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
int32_t limit,
UBool incremental) {
if (offset == limit) {
- // Strings, if any, have length != 0, so we don't worry
- // about them here. If we ever allow zero-length strings
- // we much check for them here.
if (contains(U_ETHER)) {
return incremental ? U_PARTIAL_MATCH : U_MATCH;
} else {
@@ -614,11 +590,9 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
for (i=0; i<strings->size(); ++i) {
const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
-
- //if (trial.length() == 0) {
- // return U_MATCH; // null-string always matches
- //}
- // assert(trial.length() != 0); // We ensure this elsewhere
+ if (trial.isEmpty()) {
+ continue; // skip the empty string
+ }
UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
@@ -971,12 +945,12 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
* present. If this set already contains the multicharacter,
* the call leaves this set unchanged.
* Thus "ch" => {"ch"}
- * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+ *
* @param s the source string
* @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
- if (s.length() == 0 || isFrozen() || isBogus()) return *this;
+ if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (!stringsContains(s)) {
@@ -991,8 +965,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
/**
* Adds the given string, in order, to 'strings'. The given string
- * must have been checked by the caller to not be empty and to not
- * already be in 'strings'.
+ * must have been checked by the caller to not already be in 'strings'.
*/
void UnicodeSet::_add(const UnicodeString& s) {
if (isFrozen() || isBogus()) {
@@ -1011,7 +984,6 @@ void UnicodeSet::_add(const UnicodeString& s) {
strings->sortedInsert(t, compareUnicodeString, ec);
if (U_FAILURE(ec)) {
setToBogus();
- delete t;
}
}
@@ -1021,16 +993,13 @@ void UnicodeSet::_add(const UnicodeString& s) {
* @param string to test
*/
int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
- //if (s.length() < 1) {
- // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
- //}
- if (s.length() > 2) return -1;
- if (s.length() == 1) return s.charAt(0);
-
- // at this point, len = 2
- UChar32 cp = s.char32At(0);
- if (cp > 0xFFFF) { // is surrogate pair
- return cp;
+ int32_t sLength = s.length();
+ if (sLength == 1) return s.charAt(0);
+ if (sLength == 2) {
+ UChar32 cp = s.char32At(0);
+ if (cp > 0xFFFF) { // is surrogate pair
+ return cp;
+ }
}
return -1;
}
@@ -1150,6 +1119,26 @@ UnicodeSet& UnicodeSet::retain(UChar32 c) {
return retain(c, c);
}
+UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
+ if (isFrozen() || isBogus()) { return *this; }
+ UChar32 cp = getSingleCP(s);
+ if (cp < 0) {
+ bool isIn = stringsContains(s);
+ // Check for getRangeCount() first to avoid somewhat-expensive size()
+ // when there are single code points.
+ if (isIn && getRangeCount() == 0 && size() == 1) {
+ return *this;
+ }
+ clear();
+ if (isIn) {
+ _add(s);
+ }
+ } else {
+ retain(cp, cp);
+ }
+ return *this;
+}
+
/**
* Removes the specified range from this set if it is present.
* The set will not contain the specified range once the call
@@ -1186,7 +1175,7 @@ UnicodeSet& UnicodeSet::remove(UChar32 c) {
* @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
- if (s.length() == 0 || isFrozen() || isBogus()) return *this;
+ if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (strings != nullptr && strings->removeElement((void*) &s)) {
@@ -1252,12 +1241,12 @@ UnicodeSet& UnicodeSet::complement(void) {
* Complement the specified string in this set.
* The set will not contain the specified string once the call
* returns.
- * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+ *
* @param s the string to complement
* @return this object, for chaining
*/
UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
- if (s.length() == 0 || isFrozen() || isBogus()) return *this;
+ if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (stringsContains(s)) {
@@ -1978,8 +1967,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
* Append the <code>toPattern()</code> representation of a
* string to the given <code>StringBuffer</code>.
*/
-void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
-escapeUnprintable) {
+void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
UChar32 cp;
for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
@@ -1990,39 +1978,50 @@ escapeUnprintable) {
* Append the <code>toPattern()</code> representation of a
* character to the given <code>StringBuffer</code>.
*/
-void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
-escapeUnprintable) {
- if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
+ if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
// Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
// unprintable
- if (ICU_Utility::escapeUnprintable(buf, c)) {
- return;
- }
+ ICU_Utility::escape(buf, c);
+ return;
}
// Okay to let ':' pass through
switch (c) {
- case SET_OPEN:
- case SET_CLOSE:
- case HYPHEN:
- case COMPLEMENT:
- case INTERSECTION:
- case BACKSLASH:
- case OPEN_BRACE:
- case CLOSE_BRACE:
- case COLON:
+ case u'[':
+ case u']':
+ case u'-':
+ case u'^':
+ case u'&':
+ case u'\\':
+ case u'{':
+ case u'}':
+ case u':':
case SymbolTable::SYMBOL_REF:
- buf.append(BACKSLASH);
+ buf.append(u'\\');
break;
default:
// Escape whitespace
if (PatternProps::isWhiteSpace(c)) {
- buf.append(BACKSLASH);
+ buf.append(u'\\');
}
break;
}
buf.append(c);
}
+void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
+ UBool escapeUnprintable) {
+ _appendToPat(result, start, escapeUnprintable);
+ if (start != end) {
+ if ((start+1) != end ||
+ // Avoid writing what looks like a lead+trail surrogate pair.
+ start == 0xdbff) {
+ result.append(u'-');
+ }
+ _appendToPat(result, end, escapeUnprintable);
+ }
+}
+
/**
* Append a string representation of this set to result. This will be
* a cleaned version of the string passed to applyPattern(), if there
@@ -2037,7 +2036,8 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
for (i=0; i<patLen; ) {
UChar32 c;
U16_NEXT(pat, i, patLen, c);
- if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+ if (escapeUnprintable ?
+ ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
// If the unprintable character is preceded by an odd
// number of backslashes, then it has been escaped.
// Before unescaping it, we delete the final
@@ -2045,11 +2045,11 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
if ((backslashCount % 2) == 1) {
result.truncate(result.length() - 1);
}
- ICU_Utility::escapeUnprintable(result, c);
+ ICU_Utility::escape(result, c);
backslashCount = 0;
} else {
result.append(c);
- if (c == BACKSLASH) {
+ if (c == u'\\') {
++backslashCount;
} else {
backslashCount = 0;
@@ -2082,68 +2082,67 @@ UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
UBool escapeUnprintable) const
{
- result.append(SET_OPEN);
-
-// // Check against the predefined categories. We implicitly build
-// // up ALL category sets the first time toPattern() is called.
-// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
-// if (*this == getCategorySet(cat)) {
-// result.append(COLON);
-// result.append(CATEGORY_NAMES, cat*2, 2);
-// return result.append(CATEGORY_CLOSE);
-// }
-// }
+ result.append(u'[');
- int32_t count = getRangeCount();
+ int32_t i = 0;
+ int32_t limit = len & ~1; // = 2 * getRangeCount()
// If the set contains at least 2 intervals and includes both
// MIN_VALUE and MAX_VALUE, then the inverse representation will
// be more economical.
- if (count > 1 &&
- getRangeStart(0) == MIN_VALUE &&
- getRangeEnd(count-1) == MAX_VALUE) {
-
+ // if (getRangeCount() >= 2 &&
+ // getRangeStart(0) == MIN_VALUE &&
+ // getRangeEnd(last) == MAX_VALUE)
+ // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
+ // If limit == len then len is even and the last range ends with MAX_VALUE.
+ //
+ // *But* do not write the inverse (complement) if there are strings.
+ // Since ICU 70, the '^' performs a code point complement which removes all strings.
+ if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
// Emit the inverse
- result.append(COMPLEMENT);
-
- for (int32_t i = 1; i < count; ++i) {
- UChar32 start = getRangeEnd(i-1)+1;
- UChar32 end = getRangeStart(i)-1;
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append(HYPHEN);
- }
- _appendToPat(result, end, escapeUnprintable);
+ result.append(u'^');
+ // Offsetting the inversion list index by one lets us
+ // iterate over the ranges of the set complement.
+ i = 1;
+ --limit;
+ }
+
+ // Emit the ranges as pairs.
+ while (i < limit) {
+ UChar32 start = list[i]; // getRangeStart()
+ UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
+ if (!(0xd800 <= end && end <= 0xdbff)) {
+ _appendToPat(result, start, end, escapeUnprintable);
+ i += 2;
+ } else {
+ // The range ends with a lead surrogate.
+ // Avoid writing what looks like a lead+trail surrogate pair.
+ // 1. Postpone ranges that start with a lead surrogate code point.
+ int32_t firstLead = i;
+ while ((i += 2) < limit && list[i] <= 0xdbff) {}
+ int32_t firstAfterLead = i;
+ // 2. Write following ranges that start with a trail surrogate code point.
+ while (i < limit && (start = list[i]) <= 0xdfff) {
+ _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
+ i += 2;
}
- }
- }
-
- // Default; emit the ranges as pairs
- else {
- for (int32_t i = 0; i < count; ++i) {
- UChar32 start = getRangeStart(i);
- UChar32 end = getRangeEnd(i);
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append(HYPHEN);
- }
- _appendToPat(result, end, escapeUnprintable);
+ // 3. Now write the postponed ranges.
+ for (int j = firstLead; j < firstAfterLead; j += 2) {
+ _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
}
}
}
if (strings != nullptr) {
for (int32_t i = 0; i<strings->size(); ++i) {
- result.append(OPEN_BRACE);
+ result.append(u'{');
_appendToPat(result,
*(const UnicodeString*) strings->elementAt(i),
escapeUnprintable);
- result.append(CLOSE_BRACE);
+ result.append(u'}');
}
}
- return result.append(SET_CLOSE);
+ return result.append(u']');
}
/**