diff options
author | romankoshelev <[email protected]> | 2023-08-09 20:07:20 +0300 |
---|---|---|
committer | romankoshelev <[email protected]> | 2023-08-09 20:59:13 +0300 |
commit | fd82fb12fb45e71a02c628e45b12c50c0dd0d308 (patch) | |
tree | f582b79f9002ab1d083e9acda600dfb3551c47b6 /contrib/libs/icu/common/uniset.cpp | |
parent | bf862ddf5c6178e1bb5e4fb3f7c61015deebe284 (diff) |
Update ICU to 70.1
Diffstat (limited to 'contrib/libs/icu/common/uniset.cpp')
-rw-r--r-- | contrib/libs/icu/common/uniset.cpp | 245 |
1 files changed, 122 insertions, 123 deletions
diff --git a/contrib/libs/icu/common/uniset.cpp b/contrib/libs/icu/common/uniset.cpp index b73d612f246..92a81a1a02d 100644 --- a/contrib/libs/icu/common/uniset.cpp +++ b/contrib/libs/icu/common/uniset.cpp @@ -30,24 +30,6 @@ #include "bmpset.h" #include "unisetspan.h" -// Define UChar constants using hex for EBCDIC compatibility -// Used #define to reduce private static exports and memory access time. -#define SET_OPEN ((UChar)0x005B) /*[*/ -#define SET_CLOSE ((UChar)0x005D) /*]*/ -#define HYPHEN ((UChar)0x002D) /*-*/ -#define COMPLEMENT ((UChar)0x005E) /*^*/ -#define COLON ((UChar)0x003A) /*:*/ -#define BACKSLASH ((UChar)0x005C) /*\*/ -#define INTERSECTION ((UChar)0x0026) /*&*/ -#define UPPER_U ((UChar)0x0055) /*U*/ -#define LOWER_U ((UChar)0x0075) /*u*/ -#define OPEN_BRACE ((UChar)123) /*{*/ -#define CLOSE_BRACE ((UChar)125) /*}*/ -#define UPPER_P ((UChar)0x0050) /*P*/ -#define LOWER_P ((UChar)0x0070) /*p*/ -#define UPPER_N ((UChar)78) /*N*/ -#define EQUALS ((UChar)0x003D) /*=*/ - // HIGH_VALUE > all valid values. 110000 for codepoints #define UNICODESET_HIGH 0x0110000 @@ -129,7 +111,7 @@ static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); } -static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { +static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { const UnicodeString &a = *(const UnicodeString*)t1.pointer; const UnicodeString &b = *(const UnicodeString*)t2.pointer; return a.compare(b); @@ -296,14 +278,14 @@ UnicodeSet *UnicodeSet::cloneAsThawed() const { * @param o set to be compared for equality with this set. * @return <tt>true</tt> if the specified set is equal to this set. */ -UBool UnicodeSet::operator==(const UnicodeSet& o) const { - if (len != o.len) return FALSE; +bool UnicodeSet::operator==(const UnicodeSet& o) const { + if (len != o.len) return false; for (int32_t i = 0; i < len; ++i) { - if (list[i] != o.list[i]) return FALSE; + if (list[i] != o.list[i]) return false; } - if (hasStrings() != o.hasStrings()) { return FALSE; } - if (hasStrings() && *strings != *o.strings) return FALSE; - return TRUE; + if (hasStrings() != o.hasStrings()) { return false; } + if (hasStrings() && *strings != *o.strings) return false; + return true; } /** @@ -444,7 +426,6 @@ UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { * @return <tt>true</tt> if this set contains the specified string */ UBool UnicodeSet::contains(const UnicodeString& s) const { - if (s.length() == 0) return FALSE; int32_t cp = getSingleCP(s); if (cp < 0) { return stringsContains(s); @@ -559,11 +540,9 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const { if (hasStrings()) { for (i=0; i<strings->size(); ++i) { const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); - //if (s.length() == 0) { - // // Empty strings match everything - // return TRUE; - //} - // assert(s.length() != 0); // We enforce this elsewhere + if (s.isEmpty()) { + continue; // skip the empty string + } UChar32 c = s.char32At(0); if ((c & 0xFF) == v) { return TRUE; @@ -582,9 +561,6 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, int32_t limit, UBool incremental) { if (offset == limit) { - // Strings, if any, have length != 0, so we don't worry - // about them here. If we ever allow zero-length strings - // we much check for them here. if (contains(U_ETHER)) { return incremental ? U_PARTIAL_MATCH : U_MATCH; } else { @@ -614,11 +590,9 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, for (i=0; i<strings->size(); ++i) { const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i); - - //if (trial.length() == 0) { - // return U_MATCH; // null-string always matches - //} - // assert(trial.length() != 0); // We ensure this elsewhere + if (trial.isEmpty()) { + continue; // skip the empty string + } UChar c = trial.charAt(forward ? 0 : trial.length() - 1); @@ -971,12 +945,12 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { * present. If this set already contains the multicharacter, * the call leaves this set unchanged. * Thus "ch" => {"ch"} - * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> + * * @param s the source string * @return the modified set, for chaining */ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { - if (s.length() == 0 || isFrozen() || isBogus()) return *this; + if (isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { if (!stringsContains(s)) { @@ -991,8 +965,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { /** * Adds the given string, in order, to 'strings'. The given string - * must have been checked by the caller to not be empty and to not - * already be in 'strings'. + * must have been checked by the caller to not already be in 'strings'. */ void UnicodeSet::_add(const UnicodeString& s) { if (isFrozen() || isBogus()) { @@ -1011,7 +984,6 @@ void UnicodeSet::_add(const UnicodeString& s) { strings->sortedInsert(t, compareUnicodeString, ec); if (U_FAILURE(ec)) { setToBogus(); - delete t; } } @@ -1021,16 +993,13 @@ void UnicodeSet::_add(const UnicodeString& s) { * @param string to test */ int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { - //if (s.length() < 1) { - // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); - //} - if (s.length() > 2) return -1; - if (s.length() == 1) return s.charAt(0); - - // at this point, len = 2 - UChar32 cp = s.char32At(0); - if (cp > 0xFFFF) { // is surrogate pair - return cp; + int32_t sLength = s.length(); + if (sLength == 1) return s.charAt(0); + if (sLength == 2) { + UChar32 cp = s.char32At(0); + if (cp > 0xFFFF) { // is surrogate pair + return cp; + } } return -1; } @@ -1150,6 +1119,26 @@ UnicodeSet& UnicodeSet::retain(UChar32 c) { return retain(c, c); } +UnicodeSet& UnicodeSet::retain(const UnicodeString &s) { + if (isFrozen() || isBogus()) { return *this; } + UChar32 cp = getSingleCP(s); + if (cp < 0) { + bool isIn = stringsContains(s); + // Check for getRangeCount() first to avoid somewhat-expensive size() + // when there are single code points. + if (isIn && getRangeCount() == 0 && size() == 1) { + return *this; + } + clear(); + if (isIn) { + _add(s); + } + } else { + retain(cp, cp); + } + return *this; +} + /** * Removes the specified range from this set if it is present. * The set will not contain the specified range once the call @@ -1186,7 +1175,7 @@ UnicodeSet& UnicodeSet::remove(UChar32 c) { * @return the modified set, for chaining */ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { - if (s.length() == 0 || isFrozen() || isBogus()) return *this; + if (isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { if (strings != nullptr && strings->removeElement((void*) &s)) { @@ -1252,12 +1241,12 @@ UnicodeSet& UnicodeSet::complement(void) { * Complement the specified string in this set. * The set will not contain the specified string once the call * returns. - * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> + * * @param s the string to complement * @return this object, for chaining */ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { - if (s.length() == 0 || isFrozen() || isBogus()) return *this; + if (isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { if (stringsContains(s)) { @@ -1978,8 +1967,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) * Append the <code>toPattern()</code> representation of a * string to the given <code>StringBuffer</code>. */ -void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool -escapeUnprintable) { +void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) { UChar32 cp; for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); @@ -1990,39 +1978,50 @@ escapeUnprintable) { * Append the <code>toPattern()</code> representation of a * character to the given <code>StringBuffer</code>. */ -void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool -escapeUnprintable) { - if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { +void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) { + if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) { // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything // unprintable - if (ICU_Utility::escapeUnprintable(buf, c)) { - return; - } + ICU_Utility::escape(buf, c); + return; } // Okay to let ':' pass through switch (c) { - case SET_OPEN: - case SET_CLOSE: - case HYPHEN: - case COMPLEMENT: - case INTERSECTION: - case BACKSLASH: - case OPEN_BRACE: - case CLOSE_BRACE: - case COLON: + case u'[': + case u']': + case u'-': + case u'^': + case u'&': + case u'\\': + case u'{': + case u'}': + case u':': case SymbolTable::SYMBOL_REF: - buf.append(BACKSLASH); + buf.append(u'\\'); break; default: // Escape whitespace if (PatternProps::isWhiteSpace(c)) { - buf.append(BACKSLASH); + buf.append(u'\\'); } break; } buf.append(c); } +void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end, + UBool escapeUnprintable) { + _appendToPat(result, start, escapeUnprintable); + if (start != end) { + if ((start+1) != end || + // Avoid writing what looks like a lead+trail surrogate pair. + start == 0xdbff) { + result.append(u'-'); + } + _appendToPat(result, end, escapeUnprintable); + } +} + /** * Append a string representation of this set to result. This will be * a cleaned version of the string passed to applyPattern(), if there @@ -2037,7 +2036,8 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, for (i=0; i<patLen; ) { UChar32 c; U16_NEXT(pat, i, patLen, c); - if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { + if (escapeUnprintable ? + ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) { // If the unprintable character is preceded by an odd // number of backslashes, then it has been escaped. // Before unescaping it, we delete the final @@ -2045,11 +2045,11 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, if ((backslashCount % 2) == 1) { result.truncate(result.length() - 1); } - ICU_Utility::escapeUnprintable(result, c); + ICU_Utility::escape(result, c); backslashCount = 0; } else { result.append(c); - if (c == BACKSLASH) { + if (c == u'\\') { ++backslashCount; } else { backslashCount = 0; @@ -2082,68 +2082,67 @@ UnicodeString& UnicodeSet::toPattern(UnicodeString& result, UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, UBool escapeUnprintable) const { - result.append(SET_OPEN); - -// // Check against the predefined categories. We implicitly build -// // up ALL category sets the first time toPattern() is called. -// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) { -// if (*this == getCategorySet(cat)) { -// result.append(COLON); -// result.append(CATEGORY_NAMES, cat*2, 2); -// return result.append(CATEGORY_CLOSE); -// } -// } + result.append(u'['); - int32_t count = getRangeCount(); + int32_t i = 0; + int32_t limit = len & ~1; // = 2 * getRangeCount() // If the set contains at least 2 intervals and includes both // MIN_VALUE and MAX_VALUE, then the inverse representation will // be more economical. - if (count > 1 && - getRangeStart(0) == MIN_VALUE && - getRangeEnd(count-1) == MAX_VALUE) { - + // if (getRangeCount() >= 2 && + // getRangeStart(0) == MIN_VALUE && + // getRangeEnd(last) == MAX_VALUE) + // Invariant: list[len-1] == HIGH == MAX_VALUE + 1 + // If limit == len then len is even and the last range ends with MAX_VALUE. + // + // *But* do not write the inverse (complement) if there are strings. + // Since ICU 70, the '^' performs a code point complement which removes all strings. + if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) { // Emit the inverse - result.append(COMPLEMENT); - - for (int32_t i = 1; i < count; ++i) { - UChar32 start = getRangeEnd(i-1)+1; - UChar32 end = getRangeStart(i)-1; - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append(HYPHEN); - } - _appendToPat(result, end, escapeUnprintable); + result.append(u'^'); + // Offsetting the inversion list index by one lets us + // iterate over the ranges of the set complement. + i = 1; + --limit; + } + + // Emit the ranges as pairs. + while (i < limit) { + UChar32 start = list[i]; // getRangeStart() + UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one + if (!(0xd800 <= end && end <= 0xdbff)) { + _appendToPat(result, start, end, escapeUnprintable); + i += 2; + } else { + // The range ends with a lead surrogate. + // Avoid writing what looks like a lead+trail surrogate pair. + // 1. Postpone ranges that start with a lead surrogate code point. + int32_t firstLead = i; + while ((i += 2) < limit && list[i] <= 0xdbff) {} + int32_t firstAfterLead = i; + // 2. Write following ranges that start with a trail surrogate code point. + while (i < limit && (start = list[i]) <= 0xdfff) { + _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable); + i += 2; } - } - } - - // Default; emit the ranges as pairs - else { - for (int32_t i = 0; i < count; ++i) { - UChar32 start = getRangeStart(i); - UChar32 end = getRangeEnd(i); - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append(HYPHEN); - } - _appendToPat(result, end, escapeUnprintable); + // 3. Now write the postponed ranges. + for (int j = firstLead; j < firstAfterLead; j += 2) { + _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable); } } } if (strings != nullptr) { for (int32_t i = 0; i<strings->size(); ++i) { - result.append(OPEN_BRACE); + result.append(u'{'); _appendToPat(result, *(const UnicodeString*) strings->elementAt(i), escapeUnprintable); - result.append(CLOSE_BRACE); + result.append(u'}'); } } - return result.append(SET_CLOSE); + return result.append(u']'); } /** |