diff options
author | romankoshelev <[email protected]> | 2023-08-09 20:07:20 +0300 |
---|---|---|
committer | romankoshelev <[email protected]> | 2023-08-09 20:59:13 +0300 |
commit | fd82fb12fb45e71a02c628e45b12c50c0dd0d308 (patch) | |
tree | f582b79f9002ab1d083e9acda600dfb3551c47b6 /contrib/libs/icu/common/rbbi.cpp | |
parent | bf862ddf5c6178e1bb5e4fb3f7c61015deebe284 (diff) |
Update ICU to 70.1
Diffstat (limited to 'contrib/libs/icu/common/rbbi.cpp')
-rw-r--r-- | contrib/libs/icu/common/rbbi.cpp | 232 |
1 files changed, 131 insertions, 101 deletions
diff --git a/contrib/libs/icu/common/rbbi.cpp b/contrib/libs/icu/common/rbbi.cpp index 43ba58ba9e6..f65177f2323 100644 --- a/contrib/libs/icu/common/rbbi.cpp +++ b/contrib/libs/icu/common/rbbi.cpp @@ -68,10 +68,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode init(status); fData = new RBBIDataWrapper(data, status); // status checked in constructor if (U_FAILURE(status)) {return;} - if(fData == 0) { + if(fData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } + if (fData->fForwardTable->fLookAheadResultsSize > 0) { + fLookAheadMatches = static_cast<int32_t *>( + uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); + if (fLookAheadMatches == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + } } // @@ -98,10 +106,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, } fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); if (U_FAILURE(status)) {return;} - if(fData == 0) { + if(fData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } + if (fData->fForwardTable->fLookAheadResultsSize > 0) { + fLookAheadMatches = static_cast<int32_t *>( + uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); + if (fLookAheadMatches == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + } } @@ -117,10 +133,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta init(status); fData = new RBBIDataWrapper(udm, status); // status checked in constructor if (U_FAILURE(status)) {return;} - if(fData == 0) { + if(fData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } + if (fData->fForwardTable->fLookAheadResultsSize > 0) { + fLookAheadMatches = static_cast<int32_t *>( + uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); + if (fLookAheadMatches == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + } } @@ -188,30 +212,34 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() { // fCharIter was adopted from the outside. delete fCharIter; } - fCharIter = NULL; + fCharIter = nullptr; utext_close(&fText); - if (fData != NULL) { + if (fData != nullptr) { fData->removeReference(); - fData = NULL; + fData = nullptr; } delete fBreakCache; - fBreakCache = NULL; + fBreakCache = nullptr; delete fDictionaryCache; - fDictionaryCache = NULL; + fDictionaryCache = nullptr; delete fLanguageBreakEngines; - fLanguageBreakEngines = NULL; + fLanguageBreakEngines = nullptr; delete fUnhandledBreakEngine; - fUnhandledBreakEngine = NULL; + fUnhandledBreakEngine = nullptr; + + uprv_free(fLookAheadMatches); + fLookAheadMatches = nullptr; } /** * Assignment operator. Sets this iterator to have the same behavior, * and iterate over the same text, as the one passed in. + * TODO: needs better handling of memory allocation errors. */ RuleBasedBreakIterator& RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { @@ -234,7 +262,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { fCharIter = &fSCharIter; if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) { - // This is a little bit tricky - it will intially appear that + // This is a little bit tricky - it will initially appear that // this->fCharIter is adopted, even if that->fCharIter was // not adopted. That's ok. fCharIter = that.fCharIter->clone(); @@ -252,6 +280,14 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { fData = that.fData->addReference(); } + uprv_free(fLookAheadMatches); + fLookAheadMatches = nullptr; + if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) { + fLookAheadMatches = static_cast<int32_t *>( + uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); + } + + fPosition = that.fPosition; fRuleStatusIndex = that.fRuleStatusIndex; fDone = that.fDone; @@ -275,16 +311,17 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { // //----------------------------------------------------------------------------- void RuleBasedBreakIterator::init(UErrorCode &status) { - fCharIter = NULL; - fData = NULL; + fCharIter = nullptr; + fData = nullptr; fPosition = 0; fRuleStatusIndex = 0; fDone = false; fDictionaryCharCount = 0; - fLanguageBreakEngines = NULL; - fUnhandledBreakEngine = NULL; - fBreakCache = NULL; - fDictionaryCache = NULL; + fLanguageBreakEngines = nullptr; + fUnhandledBreakEngine = nullptr; + fBreakCache = nullptr; + fDictionaryCache = nullptr; + fLookAheadMatches = nullptr; // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER. // fText = UTEXT_INITIALIZER; @@ -329,16 +366,16 @@ RuleBasedBreakIterator::clone() const { } /** - * Equality operator. Returns TRUE if both BreakIterators are of the + * Equality operator. Returns true if both BreakIterators are of the * same class, have the same behavior, and iterate over the same text. */ -UBool +bool RuleBasedBreakIterator::operator==(const BreakIterator& that) const { if (typeid(*this) != typeid(that)) { - return FALSE; + return false; } if (this == &that) { - return TRUE; + return true; } // The base class BreakIterator carries no state that participates in equality, @@ -351,21 +388,21 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { // The two break iterators are operating on different text, // or have a different iteration position. // Note that fText's position is always the same as the break iterator's position. - return FALSE; + return false; } if (!(fPosition == that2.fPosition && fRuleStatusIndex == that2.fRuleStatusIndex && fDone == that2.fDone)) { - return FALSE; + return false; } if (that2.fData == fData || (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { // The two break iterators are using the same rules. - return TRUE; + return true; } - return FALSE; + return false; } /** @@ -634,7 +671,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { } /** - * Returns true if the specfied position is a boundary position. As a side + * Returns true if the specified position is a boundary position. As a side * effect, leaves the iterator pointing to the first boundary position at * or after "offset". * @@ -700,50 +737,53 @@ enum RBBIRunMode { }; -// Map from look-ahead break states (corresponds to rules) to boundary positions. -// Allows multiple lookahead break rules to be in flight at the same time. -// -// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers -// in the state table be sequential, then we can just index an array. And the -// table could also tell us in advance how big that array needs to be. +// Wrapper functions to select the appropriate handleNext() or handleSafePrevious() +// instantiation, based on whether an 8 or 16 bit table is required. // -// Before ICU 57 there was just a single simple variable for a look-ahead match that -// was in progress. Two rules at once did not work. - -static const int32_t kMaxLookaheads = 8; -struct LookAheadResults { - int32_t fUsedSlotLimit; - int32_t fPositions[8]; - int16_t fKeys[8]; +// These Trie access functions will be inlined within the handleNext()/Previous() instantions. +static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) { + return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c); +} - LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {} +static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) { + return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c); +} - int32_t getPosition(int16_t key) { - for (int32_t i=0; i<fUsedSlotLimit; ++i) { - if (fKeys[i] == key) { - return fPositions[i]; - } +int32_t RuleBasedBreakIterator::handleNext() { + const RBBIStateTable *statetable = fData->fForwardTable; + bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8; + if (statetable->fFlags & RBBI_8BITS_ROWS) { + if (use8BitsTrie) { + return handleNext<RBBIStateTableRow8, TrieFunc8>(); + } else { + return handleNext<RBBIStateTableRow8, TrieFunc16>(); + } + } else { + if (use8BitsTrie) { + return handleNext<RBBIStateTableRow16, TrieFunc8>(); + } else { + return handleNext<RBBIStateTableRow16, TrieFunc16>(); } - UPRV_UNREACHABLE; } +} - void setPosition(int16_t key, int32_t position) { - int32_t i; - for (i=0; i<fUsedSlotLimit; ++i) { - if (fKeys[i] == key) { - fPositions[i] = position; - return; - } +int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { + const RBBIStateTable *statetable = fData->fReverseTable; + bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8; + if (statetable->fFlags & RBBI_8BITS_ROWS) { + if (use8BitsTrie) { + return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition); + } else { + return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition); } - if (i >= kMaxLookaheads) { - UPRV_UNREACHABLE; + } else { + if (use8BitsTrie) { + return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition); + } else { + return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition); } - fKeys[i] = key; - fPositions[i] = position; - U_ASSERT(fUsedSlotLimit == i); - fUsedSlotLimit = i + 1; } -}; +} //----------------------------------------------------------------------------------- @@ -752,26 +792,27 @@ struct LookAheadResults { // Run the state machine to find a boundary // //----------------------------------------------------------------------------------- +template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc> int32_t RuleBasedBreakIterator::handleNext() { int32_t state; uint16_t category = 0; RBBIRunMode mode; - RBBIStateTableRow *row; + RowType *row; UChar32 c; - LookAheadResults lookAheadMatches; int32_t result = 0; int32_t initialPosition = 0; const RBBIStateTable *statetable = fData->fForwardTable; const char *tableData = statetable->fTableData; uint32_t tableRowLen = statetable->fRowLen; + uint32_t dictStart = statetable->fDictCategoriesStart; #ifdef RBBI_DEBUG if (gTrace) { RBBIDebugPuts("Handle Next pos char state category"); } #endif - // handleNext alway sets the break tag value. + // handleNext always sets the break tag value. // Set the default for it. fRuleStatusIndex = 0; @@ -789,7 +830,7 @@ int32_t RuleBasedBreakIterator::handleNext() { // Set the initial state for the state machine state = START_STATE; - row = (RBBIStateTableRow *) + row = (RowType *) //(statetable->fTableData + (statetable->fRowLen * state)); (tableData + tableRowLen * state); @@ -825,21 +866,8 @@ int32_t RuleBasedBreakIterator::handleNext() { if (mode == RBBI_RUN) { // look up the current character's character category, which tells us // which column in the state table to look at. - // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, - // not the size of the character going in, which is a UChar32. - // - category = UTRIE2_GET16(fData->fTrie, c); - - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iteration. - // Chars that need to be handled by a dictionary have a flag bit set - // in their category values. - // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - // And off the dictionary flag bit. - category &= ~0x4000; - } + category = trieFunc(fData->fTrie, c); + fDictionaryCharCount += (category >= dictStart); } #ifdef RBBI_DEBUG @@ -860,25 +888,24 @@ int32_t RuleBasedBreakIterator::handleNext() { // fNextState is a variable-length array. U_ASSERT(category<fData->fHeader->fCatCount); state = row->fNextState[category]; /*Not accessing beyond memory*/ - row = (RBBIStateTableRow *) + row = (RowType *) // (statetable->fTableData + (statetable->fRowLen * state)); (tableData + tableRowLen * state); - if (row->fAccepting == -1) { + uint16_t accepting = row->fAccepting; + if (accepting == ACCEPTING_UNCONDITIONAL) { // Match found, common case. if (mode != RBBI_START) { result = (int32_t)UTEXT_GETNATIVEINDEX(&fText); } - fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. - } - - int16_t completedRule = row->fAccepting; - if (completedRule > 0) { + fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values. + } else if (accepting > ACCEPTING_UNCONDITIONAL) { // Lookahead match is completed. - int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize); + int32_t lookaheadResult = fLookAheadMatches[accepting]; if (lookaheadResult >= 0) { - fRuleStatusIndex = row->fTagIdx; + fRuleStatusIndex = row->fTagsIdx; fPosition = lookaheadResult; return lookaheadResult; } @@ -890,10 +917,12 @@ int32_t RuleBasedBreakIterator::handleNext() { // This would enable hard-break rules with no following context. // But there are line break test failures when trying this. Investigate. // Issue ICU-20837 - int16_t rule = row->fLookAhead; - if (rule != 0) { + uint16_t rule = row->fLookAhead; + U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL); + U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize); + if (rule > ACCEPTING_UNCONDITIONAL) { int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText); - lookAheadMatches.setPosition(rule, pos); + fLookAheadMatches[rule] = pos; } if (state == STOP_STATE) { @@ -948,10 +977,12 @@ int32_t RuleBasedBreakIterator::handleNext() { // because the safe table does not require as many options. // //----------------------------------------------------------------------------------- +template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc> int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { + int32_t state; uint16_t category = 0; - RBBIStateTableRow *row; + RowType *row; UChar32 c; int32_t result = 0; @@ -971,7 +1002,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { // Set the initial state for the state machine c = UTEXT_PREVIOUS32(&fText); state = START_STATE; - row = (RBBIStateTableRow *) + row = (RowType *) (stateTable->fTableData + (stateTable->fRowLen * state)); // loop until we reach the start of the text or transition to state 0 @@ -980,12 +1011,9 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { // look up the current character's character category, which tells us // which column in the state table to look at. - // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, - // not the size of the character going in, which is a UChar32. // - // And off the dictionary flag bit. For reverse iteration it is not used. - category = UTRIE2_GET16(fData->fTrie, c); - category &= ~0x4000; + // Off the dictionary flag bit. For reverse iteration it is not used. + category = trieFunc(fData->fTrie, c); #ifdef RBBI_DEBUG if (gTrace) { @@ -1004,12 +1032,12 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { // fNextState is a variable-length array. U_ASSERT(category<fData->fHeader->fCatCount); state = row->fNextState[category]; /*Not accessing beyond memory*/ - row = (RBBIStateTableRow *) + row = (RowType *) (stateTable->fTableData + (stateTable->fRowLen * state)); if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. - // Transistion to state zero means we have found a safe point. + // Transition to state zero means we have found a safe point. break; } } @@ -1024,6 +1052,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { return result; } + //------------------------------------------------------------------------------- // // getRuleStatus() Return the break rule tag associated with the current @@ -1231,6 +1260,7 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { // first. fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); // If we can't insert it, or creation failed, get rid of it + U_ASSERT(!fLanguageBreakEngines->hasDeleter()); if (U_FAILURE(status)) { delete fUnhandledBreakEngine; fUnhandledBreakEngine = 0; |