diff options
author | romankoshelev <romankoshelev@yandex-team.com> | 2023-08-09 20:07:20 +0300 |
---|---|---|
committer | romankoshelev <romankoshelev@yandex-team.com> | 2023-08-09 20:59:13 +0300 |
commit | fd82fb12fb45e71a02c628e45b12c50c0dd0d308 (patch) | |
tree | f582b79f9002ab1d083e9acda600dfb3551c47b6 /contrib/libs/icu/i18n/regexcmp.cpp | |
parent | bf862ddf5c6178e1bb5e4fb3f7c61015deebe284 (diff) | |
download | ydb-fd82fb12fb45e71a02c628e45b12c50c0dd0d308.tar.gz |
Update ICU to 70.1
Diffstat (limited to 'contrib/libs/icu/i18n/regexcmp.cpp')
-rw-r--r-- | contrib/libs/icu/i18n/regexcmp.cpp | 137 |
1 files changed, 79 insertions, 58 deletions
diff --git a/contrib/libs/icu/i18n/regexcmp.cpp b/contrib/libs/icu/i18n/regexcmp.cpp index dd777b7538..89cb658425 100644 --- a/contrib/libs/icu/i18n/regexcmp.cpp +++ b/contrib/libs/icu/i18n/regexcmp.cpp @@ -53,7 +53,7 @@ U_NAMESPACE_BEGIN // //------------------------------------------------------------------------------ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : - fParenStack(status), fSetStack(status), fSetOpStack(status) + fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status) { // Lazy init of all shared global sets (needed for init()'s empty text) RegexStaticSets::initGlobals(&status); @@ -278,11 +278,6 @@ void RegexCompile::compile( if (U_FAILURE(*fStatus)) { // Bail out if the pattern had errors. - // Set stack cleanup: a successful compile would have left it empty, - // but errors can leave temporary sets hanging around. - while (!fSetStack.empty()) { - delete (UnicodeSet *)fSetStack.pop(); - } return; } @@ -473,7 +468,7 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_START_CAPTURE, varsLoc); appendOp(URX_NOP, 0); - // On the Parentheses stack, start a new frame and add the postions + // On the Parentheses stack, start a new frame and add the positions // of the two NOPs. Depending on what follows in the pattern, the // NOPs may be changed to SAVE_STATE or JMP ops, with a target // address of the end of the parenthesized group. @@ -515,7 +510,7 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); - // On the Parentheses stack, start a new frame and add the postions + // On the Parentheses stack, start a new frame and add the positions // of the two NOPs. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(plain, *fStatus); // Begin a new frame. @@ -540,7 +535,7 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_STO_SP, varLoc); appendOp(URX_NOP, 0); - // On the Parentheses stack, start a new frame and add the postions + // On the Parentheses stack, start a new frame and add the positions // of the two NOPs. Depending on what follows in the pattern, the // NOPs may be changed to SAVE_STATE or JMP ops, with a target // address of the end of the parenthesized group. @@ -557,7 +552,7 @@ UBool RegexCompile::doParseActions(int32_t action) // // Note: Addition of transparent input regions, with the need to // restore the original regions when failing out of a lookahead - // block, complicated this sequence. Some conbined opcodes + // block, complicated this sequence. Some combined opcodes // might make sense - or might not, lookahead aren't that common. // // Caution: min match length optimization knows about this @@ -594,7 +589,7 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); - // On the Parentheses stack, start a new frame and add the postions + // On the Parentheses stack, start a new frame and add the positions // of the NOPs. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(lookAhead, *fStatus); // Frame type. @@ -627,7 +622,7 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_STATE_SAVE, 0); // dest address will be patched later. appendOp(URX_NOP, 0); - // On the Parentheses stack, start a new frame and add the postions + // On the Parentheses stack, start a new frame and add the positions // of the StateSave and NOP. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(negLookAhead, *fStatus); // Frame type @@ -679,7 +674,7 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); - // On the Parentheses stack, start a new frame and add the postions + // On the Parentheses stack, start a new frame and add the positions // of the URX_LB_CONT and the NOP. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(lookBehind, *fStatus); // Frame type @@ -734,7 +729,7 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); - // On the Parentheses stack, start a new frame and add the postions + // On the Parentheses stack, start a new frame and add the positions // of the URX_LB_CONT and the NOP. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(lookBehindN, *fStatus); // Frame type @@ -748,7 +743,7 @@ UBool RegexCompile::doParseActions(int32_t action) case doConditionalExpr: // Conditionals such as (?(1)a:b) case doPerlInline: - // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. + // Perl inline-conditionals. (?{perl code}a|b) We're not perl, no way to do them. error(U_REGEX_UNIMPLEMENTED); break; @@ -1009,7 +1004,7 @@ UBool RegexCompile::doParseActions(int32_t action) case doIntervalInit: // The '{' opening an interval quantifier was just scanned. - // Init the counter varaiables that will accumulate the values as the digits + // Init the counter variables that will accumulate the values as the digits // are scanned. fIntervalLow = 0; fIntervalUpper = -1; @@ -1485,8 +1480,8 @@ UBool RegexCompile::doParseActions(int32_t action) case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; case 0x2d: /* '-' */ fSetModeFlag = FALSE; break; default: - UPRV_UNREACHABLE; // Should never happen. Other chars are filtered out - // by the scanner. + UPRV_UNREACHABLE_EXIT; // Should never happen. Other chars are filtered out + // by the scanner. } if (fSetModeFlag) { fNewModeFlags |= bit; @@ -1522,9 +1517,9 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); - // On the Parentheses stack, start a new frame and add the postions + // On the Parentheses stack, start a new frame and add the positions // of the two NOPs (a normal non-capturing () frame, except for the - // saving of the orignal mode flags.) + // saving of the original mode flags.) fParenStack.push(fModeFlags, *fStatus); fParenStack.push(flags, *fStatus); // Frame Marker fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP @@ -1656,13 +1651,16 @@ UBool RegexCompile::doParseActions(int32_t action) } case doSetBegin: - fixLiterals(FALSE); - fSetStack.push(new UnicodeSet(), *fStatus); - fSetOpStack.push(setStart, *fStatus); - if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { - fSetOpStack.push(setCaseClose, *fStatus); + { + fixLiterals(FALSE); + LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus); + fSetStack.push(lpSet.orphan(), *fStatus); + fSetOpStack.push(setStart, *fStatus); + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { + fSetOpStack.push(setCaseClose, *fStatus); + } + break; } - break; case doSetBeginDifference1: // We have scanned something like [[abc]-[ @@ -1860,7 +1858,7 @@ UBool RegexCompile::doParseActions(int32_t action) } default: - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } if (U_FAILURE(*fStatus)) { @@ -1967,17 +1965,17 @@ int32_t RegexCompile::buildOp(int32_t type, int32_t val) { return 0; } if (type < 0 || type > 255) { - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } if (val > 0x00ffffff) { - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } if (val < 0) { if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } if (URX_TYPE(val) != 0xff) { - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } type = URX_RESERVED_OP_N; } @@ -2373,7 +2371,7 @@ void RegexCompile::handleCloseParen() { default: - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } // remember the next location in the compiled pattern. @@ -2396,8 +2394,8 @@ void RegexCompile::compileSet(UnicodeSet *theSet) return; } // Remove any strings from the set. - // There shoudn't be any, but just in case. - // (Case Closure can add them; if we had a simple case closure avaialble that + // There shouldn't be any, but just in case. + // (Case Closure can add them; if we had a simple case closure available that // ignored strings, that would be better.) theSet->removeAllStrings(); int32_t setSize = theSet->size(); @@ -2428,7 +2426,11 @@ void RegexCompile::compileSet(UnicodeSet *theSet) theSet->freeze(); int32_t setNumber = fRXPat->fSets->size(); fRXPat->fSets->addElement(theSet, *fStatus); - appendOp(URX_SETREF, setNumber); + if (U_SUCCESS(*fStatus)) { + appendOp(URX_SETREF, setNumber); + } else { + delete theSet; + } } } } @@ -2485,7 +2487,7 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); - // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. + // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op. // Goes at end of the block being looped over, so just append to the code so far. appendOp(LoopOp, topOfBlock); @@ -2579,7 +2581,7 @@ UBool RegexCompile::compileInlineInterval() { // The pattern could match a string beginning with a German sharp-s // // To the ordinary case closure for a character c, we add all other -// characters cx where the case closure of cx incudes a string form that begins +// characters cx where the case closure of cx includes a string form that begins // with the original character c. // // This function could be made smarter. The full pattern string is available @@ -2593,7 +2595,8 @@ void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterCh // Machine Generated below. // It may need updating with new versions of Unicode. // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed. -// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing +// The update tool is here: +// https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing // Machine Generated Data. Do not hand edit. static const UChar32 RECaseFixCodePoints[] = { @@ -2634,7 +2637,7 @@ void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterCh if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) { // This function should never be called with an invalid input character. - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT); starterChars->set(caseFoldedC, caseFoldedC); @@ -2919,7 +2922,7 @@ void RegexCompile::matchStartType() { break; - case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. + case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded. case URX_DOTANY_ALL: // . matches one or two. case URX_DOTANY: case URX_DOTANY_UNIX: @@ -3127,10 +3130,10 @@ void RegexCompile::matchStartType() { case URX_LB_END: case URX_LBN_CONT: case URX_LBN_END: - UPRV_UNREACHABLE; // Shouldn't get here. These ops should be - // consumed by the scan in URX_LA_START and LB_START + UPRV_UNREACHABLE_EXIT; // Shouldn't get here. These ops should be + // consumed by the scan in URX_LA_START and LB_START default: - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } } @@ -3286,7 +3289,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { case URX_BACKSLASH_R: case URX_BACKSLASH_V: case URX_ONECHAR_I: - case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. + case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded. case URX_DOTANY_ALL: // . matches one or two. case URX_DOTANY: case URX_DOTANY_UNIX: @@ -3406,7 +3409,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { loc++; op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); if (URX_TYPE(op) == URX_LA_START) { - // The boilerplate for look-ahead includes two LA_END insturctions, + // The boilerplate for look-ahead includes two LA_END instructions, // Depth will be decremented by each one when it is seen. depth += 2; } @@ -3450,7 +3453,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { break; default: - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } } @@ -3475,6 +3478,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { // value may be longer than the actual maximum; it must // never be shorter. // +// start, end: the range of the pattern to check. +// end is inclusive. +// //------------------------------------------------------------------------------ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { if (U_FAILURE(*fStatus)) { @@ -3543,7 +3549,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { // Call the max length unbounded, and stop further checking. case URX_BACKREF: // BackRef. Must assume that it might be a zero length match case URX_BACKREF_I: - case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. + case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded. currentLen = INT32_MAX; break; @@ -3693,7 +3699,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { case URX_CTR_LOOP_NG: // These opcodes will be skipped over by code for URX_CTR_INIT. // We shouldn't encounter them here. - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; case URX_LOOP_SR_I: case URX_LOOP_DOT_I: @@ -3713,26 +3719,26 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { // End of look-ahead ops should always be consumed by the processing at // the URX_LA_START op. - // UPRV_UNREACHABLE; + // UPRV_UNREACHABLE_EXIT; case URX_LB_START: { // Look-behind. Scan forward until the matching look-around end, // without processing the look-behind block. int32_t dataLoc = URX_VAL(op); - for (loc = loc + 1; loc < end; ++loc) { + for (loc = loc + 1; loc <= end; ++loc) { op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); int32_t opType = URX_TYPE(op); if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) { break; } } - U_ASSERT(loc < end); + U_ASSERT(loc <= end); } break; default: - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } @@ -3887,7 +3893,7 @@ void RegexCompile::stripNOPs() { default: // Some op is unaccounted for. - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } } @@ -3924,7 +3930,7 @@ void RegexCompile::error(UErrorCode e) { UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context // Fill in the context. - // Note: extractBetween() pins supplied indicies to the string bounds. + // Note: extractBetween() pins supplied indices to the string bounds. uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); @@ -4054,7 +4060,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { // // We are in free-spacing and comments mode. // Scan through any white space and comments, until we - // reach a significant character or the end of inut. + // reach a significant character or the end of input. for (;;) { if (c.fChar == (UChar32)-1) { break; // End of Input @@ -4382,7 +4388,7 @@ static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { // // Create a Unicode Set from a Unicode Property expression. -// This is common code underlying both \p{...} ane [:...:] expressions. +// This is common code underlying both \p{...} and [:...:] expressions. // Includes trying the Java "properties" that aren't supported as // normal ICU UnicodeSet properties // @@ -4575,6 +4581,13 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure. if (U_SUCCESS(status)) { + // ICU 70 adds emoji properties of strings, but as long as Java does not say how to + // deal with properties of strings and character classes with strings, we ignore them. + // Just in case something downstream might stumble over the strings, + // we remove them from the set. + // Note that when we support strings, the complement of a property (as with \P) + // should be implemented as .complement().removeAllStrings() (code point complement). + set->removeAllStrings(); U_ASSERT(set.isValid()); if (negated) { set->complement(); @@ -4608,6 +4621,13 @@ void RegexCompile::setEval(int32_t nextOp) { fSetOpStack.popi(); U_ASSERT(fSetStack.empty() == FALSE); rightOperand = (UnicodeSet *)fSetStack.peek(); + // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings + // (see comments there). + // We also do not yet support string literals in character classes, + // so there should not be any strings. + // Note that when we support strings, the complement of a set (as with ^ or \P) + // should be implemented as .complement().removeAllStrings() (code point complement). + U_ASSERT(!rightOperand->hasStrings()); switch (pendingSetOperation) { case setNegation: rightOperand->complement(); @@ -4638,7 +4658,7 @@ void RegexCompile::setEval(int32_t nextOp) { delete rightOperand; break; default: - UPRV_UNREACHABLE; + UPRV_UNREACHABLE_EXIT; } } } @@ -4646,7 +4666,8 @@ void RegexCompile::setEval(int32_t nextOp) { void RegexCompile::setPushOp(int32_t op) { setEval(op); fSetOpStack.push(op, *fStatus); - fSetStack.push(new UnicodeSet(), *fStatus); + LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus); + fSetStack.push(lpSet.orphan(), *fStatus); } U_NAMESPACE_END |