diff options
author | romankoshelev <romankoshelev@yandex-team.com> | 2023-08-14 19:51:50 +0300 |
---|---|---|
committer | romankoshelev <romankoshelev@yandex-team.com> | 2023-08-15 01:24:11 +0300 |
commit | cfcd865e05c0d0525ea27d1e153a043b32a85138 (patch) | |
tree | 68d3b3b25271e8a4998505897a269ff7ce119b76 /contrib/libs/icu/common/rbbi.cpp | |
parent | ccb790c507bd5e8ffe2ef9886ce5ee0a7ce22a15 (diff) | |
download | ydb-cfcd865e05c0d0525ea27d1e153a043b32a85138.tar.gz |
Update ICU to 73.2
Diffstat (limited to 'contrib/libs/icu/common/rbbi.cpp')
-rw-r--r-- | contrib/libs/icu/common/rbbi.cpp | 232 |
1 files changed, 108 insertions, 124 deletions
diff --git a/contrib/libs/icu/common/rbbi.cpp b/contrib/libs/icu/common/rbbi.cpp index f65177f232..73716ab406 100644 --- a/contrib/libs/icu/common/rbbi.cpp +++ b/contrib/libs/icu/common/rbbi.cpp @@ -39,7 +39,7 @@ #include "uvectr32.h" #ifdef RBBI_DEBUG -static UBool gTrace = FALSE; +static UBool gTrace = false; #endif U_NAMESPACE_BEGIN @@ -63,9 +63,8 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) * tables object that is passed in as a parameter. */ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) - : fSCharIter(UnicodeString()) + : RuleBasedBreakIterator(&status) { - init(status); fData = new RBBIDataWrapper(data, status); // status checked in constructor if (U_FAILURE(status)) {return;} if(fData == nullptr) { @@ -82,6 +81,19 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode } } +//------------------------------------------------------------------------------- +// +// Constructor from a UDataMemory handle to precompiled break rules +// stored in an ICU data file. This construcotr is private API, +// only for internal use. +// +//------------------------------------------------------------------------------- +RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking, + UErrorCode &status) : RuleBasedBreakIterator(udm, status) +{ + fIsPhraseBreaking = isPhraseBreaking; +} + // // Construct from precompiled binary rules (tables). This constructor is public API, // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). @@ -89,13 +101,12 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, uint32_t ruleLength, UErrorCode &status) - : fSCharIter(UnicodeString()) + : RuleBasedBreakIterator(&status) { - init(status); if (U_FAILURE(status)) { return; } - if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { + if (compiledRules == nullptr || ruleLength < sizeof(RBBIDataHeader)) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } @@ -128,9 +139,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, // //------------------------------------------------------------------------------- RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) - : fSCharIter(UnicodeString()) + : RuleBasedBreakIterator(&status) { - init(status); fData = new RBBIDataWrapper(udm, status); // status checked in constructor if (U_FAILURE(status)) {return;} if(fData == nullptr) { @@ -157,9 +167,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, UParseError &parseError, UErrorCode &status) - : fSCharIter(UnicodeString()) + : RuleBasedBreakIterator(&status) { - init(status); if (U_FAILURE(status)) {return;} RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); @@ -181,10 +190,39 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, // of rules. //------------------------------------------------------------------------------- RuleBasedBreakIterator::RuleBasedBreakIterator() - : fSCharIter(UnicodeString()) + : RuleBasedBreakIterator(nullptr) { - UErrorCode status = U_ZERO_ERROR; - init(status); +} + +/** + * Simple Constructor with an error code. + * Handles common initialization for all other constructors. + */ +RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) { + UErrorCode ec = U_ZERO_ERROR; + if (status == nullptr) { + status = &ec; + } + utext_openUChars(&fText, nullptr, 0, status); + LocalPointer<DictionaryCache> lpDictionaryCache(new DictionaryCache(this, *status), *status); + LocalPointer<BreakCache> lpBreakCache(new BreakCache(this, *status), *status); + if (U_FAILURE(*status)) { + fErrorCode = *status; + return; + } + fDictionaryCache = lpDictionaryCache.orphan(); + fBreakCache = lpBreakCache.orphan(); + +#ifdef RBBI_DEBUG + static UBool debugInitDone = false; + if (debugInitDone == false) { + char *debugEnv = getenv("U_RBBIDEBUG"); + if (debugEnv && uprv_strstr(debugEnv, "trace")) { + gTrace = true; + } + debugInitDone = true; + } +#endif } @@ -195,11 +233,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() // //------------------------------------------------------------------------------- RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) -: BreakIterator(other), - fSCharIter(UnicodeString()) +: RuleBasedBreakIterator() { - UErrorCode status = U_ZERO_ERROR; - this->init(status); *this = other; } @@ -248,35 +283,35 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { } BreakIterator::operator=(that); - if (fLanguageBreakEngines != NULL) { + if (fLanguageBreakEngines != nullptr) { delete fLanguageBreakEngines; - fLanguageBreakEngines = NULL; // Just rebuild for now + fLanguageBreakEngines = nullptr; // Just rebuild for now } // TODO: clone fLanguageBreakEngines from "that" UErrorCode status = U_ZERO_ERROR; - utext_clone(&fText, &that.fText, FALSE, TRUE, &status); + utext_clone(&fText, &that.fText, false, true, &status); if (fCharIter != &fSCharIter) { delete fCharIter; } fCharIter = &fSCharIter; - if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) { + if (that.fCharIter != nullptr && that.fCharIter != &that.fSCharIter) { // This is a little bit tricky - it will initially appear that // this->fCharIter is adopted, even if that->fCharIter was // not adopted. That's ok. fCharIter = that.fCharIter->clone(); } fSCharIter = that.fSCharIter; - if (fCharIter == NULL) { + if (fCharIter == nullptr) { fCharIter = &fSCharIter; } - if (fData != NULL) { + if (fData != nullptr) { fData->removeReference(); - fData = NULL; + fData = nullptr; } - if (that.fData != NULL) { + if (that.fData != nullptr) { fData = that.fData->addReference(); } @@ -302,57 +337,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { return *this; } - - -//----------------------------------------------------------------------------- -// -// init() Shared initialization routine. Used by all the constructors. -// Initializes all fields, leaving the object in a consistent state. -// -//----------------------------------------------------------------------------- -void RuleBasedBreakIterator::init(UErrorCode &status) { - fCharIter = nullptr; - fData = nullptr; - fPosition = 0; - fRuleStatusIndex = 0; - fDone = false; - fDictionaryCharCount = 0; - fLanguageBreakEngines = nullptr; - fUnhandledBreakEngine = nullptr; - fBreakCache = nullptr; - fDictionaryCache = nullptr; - fLookAheadMatches = nullptr; - - // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER. - // fText = UTEXT_INITIALIZER; - static const UText initializedUText = UTEXT_INITIALIZER; - uprv_memcpy(&fText, &initializedUText, sizeof(UText)); - - if (U_FAILURE(status)) { - return; - } - - utext_openUChars(&fText, NULL, 0, &status); - fDictionaryCache = new DictionaryCache(this, status); - fBreakCache = new BreakCache(this, status); - if (U_SUCCESS(status) && (fDictionaryCache == NULL || fBreakCache == NULL)) { - status = U_MEMORY_ALLOCATION_ERROR; - } - -#ifdef RBBI_DEBUG - static UBool debugInitDone = FALSE; - if (debugInitDone == FALSE) { - char *debugEnv = getenv("U_RBBIDEBUG"); - if (debugEnv && uprv_strstr(debugEnv, "trace")) { - gTrace = TRUE; - } - debugInitDone = TRUE; - } -#endif -} - - - //----------------------------------------------------------------------------- // // clone - Returns a newly-constructed RuleBasedBreakIterator with the same @@ -382,7 +366,7 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { // and does not implement an equality function that would otherwise be // checked at this point. - const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; + const RuleBasedBreakIterator& that2 = static_cast<const RuleBasedBreakIterator&>(that); if (!utext_equals(&fText, &that2.fText)) { // The two break iterators are operating on different text, @@ -398,7 +382,7 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { } if (that2.fData == fData || - (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { + (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) { // The two break iterators are using the same rules. return true; } @@ -410,9 +394,9 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { * @return A hash code */ int32_t -RuleBasedBreakIterator::hashCode(void) const { +RuleBasedBreakIterator::hashCode() const { int32_t hash = 0; - if (fData != NULL) { + if (fData != nullptr) { hash = fData->hashCode(); } return hash; @@ -425,7 +409,7 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { } fBreakCache->reset(); fDictionaryCache->reset(); - utext_clone(&fText, ut, FALSE, TRUE, &status); + utext_clone(&fText, ut, false, true, &status); // Set up a dummy CharacterIterator to be returned if anyone // calls getText(). With input from UText, there is no reasonable @@ -433,7 +417,7 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { // Return one over an empty string instead - this is the closest // we can come to signaling a failure. // (GetText() is obsolete, this failure is sort of OK) - fSCharIter.setText(UnicodeString()); + fSCharIter.setText(u"", 0); if (fCharIter != &fSCharIter) { // existing fCharIter was adopted from the outside. Delete it now. @@ -446,7 +430,7 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { - UText *result = utext_clone(fillIn, &fText, FALSE, TRUE, &status); + UText *result = utext_clone(fillIn, &fText, false, true, &status); return result; } @@ -480,10 +464,10 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { UErrorCode status = U_ZERO_ERROR; fBreakCache->reset(); fDictionaryCache->reset(); - if (newText==NULL || newText->startIndex() != 0) { + if (newText==nullptr || newText->startIndex() != 0) { // startIndex !=0 wants to be an error, but there's no way to report it. // Make the iterator text be an empty string. - utext_openUChars(&fText, NULL, 0, &status); + utext_openUChars(&fText, nullptr, 0, &status); } else { utext_openCharacterIterator(&fText, newText, &status); } @@ -506,7 +490,7 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) { // Needed in case someone calls getText(). // Can not, unfortunately, do this lazily on the (probably never) // call to getText(), because getText is const. - fSCharIter.setText(newText); + fSCharIter.setText(newText.getBuffer(), newText.length()); if (fCharIter != &fSCharIter) { // old fCharIter was adopted from the outside. Delete it. @@ -528,13 +512,13 @@ RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, U if (U_FAILURE(status)) { return *this; } - if (input == NULL) { + if (input == nullptr) { status = U_ILLEGAL_ARGUMENT_ERROR; return *this; } int64_t pos = utext_getNativeIndex(&fText); // Shallow read-only clone of the new UText into the existing input UText - utext_clone(&fText, input, FALSE, TRUE, &status); + utext_clone(&fText, input, false, true, &status); if (U_FAILURE(status)) { return *this; } @@ -554,7 +538,7 @@ RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, U * Sets the current iteration position to the beginning of the text, position zero. * @return The new iterator position, which is zero. */ -int32_t RuleBasedBreakIterator::first(void) { +int32_t RuleBasedBreakIterator::first() { UErrorCode status = U_ZERO_ERROR; if (!fBreakCache->seek(0)) { fBreakCache->populateNear(0, status); @@ -568,7 +552,7 @@ int32_t RuleBasedBreakIterator::first(void) { * Sets the current iteration position to the end of the text. * @return The text's past-the-end offset. */ -int32_t RuleBasedBreakIterator::last(void) { +int32_t RuleBasedBreakIterator::last() { int32_t endPos = (int32_t)utext_nativeLength(&fText); UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. (void)endShouldBeBoundary; @@ -606,7 +590,7 @@ int32_t RuleBasedBreakIterator::next(int32_t n) { * Advances the iterator to the next boundary position. * @return The position of the first boundary after this one. */ -int32_t RuleBasedBreakIterator::next(void) { +int32_t RuleBasedBreakIterator::next() { fBreakCache->next(); return fDone ? UBRK_DONE : fPosition; } @@ -619,7 +603,7 @@ int32_t RuleBasedBreakIterator::next(void) { * * @return The position of the boundary position immediately preceding the starting position. */ -int32_t RuleBasedBreakIterator::previous(void) { +int32_t RuleBasedBreakIterator::previous() { UErrorCode status = U_ZERO_ERROR; fBreakCache->previous(status); return fDone ? UBRK_DONE : fPosition; @@ -682,7 +666,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { // out-of-range indexes are never boundary positions if (offset < 0) { first(); // For side effects on current position, tag values. - return FALSE; + return false; } // Adjust offset to be on a code point boundary and not beyond the end of the text. @@ -699,9 +683,9 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { } if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) { - // Original offset is beyond the end of the text. Return FALSE, it's not a boundary, + // Original offset is beyond the end of the text. Return false, it's not a boundary, // but the iteration position remains set to the end of the text, which is a boundary. - return FALSE; + return false; } if (!result) { // Not on a boundary. isBoundary() must leave iterator on the following boundary. @@ -716,7 +700,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { * Returns the current iteration position. * @return The current iteration position. */ -int32_t RuleBasedBreakIterator::current(void) const { +int32_t RuleBasedBreakIterator::current() const { return fPosition; } @@ -824,7 +808,7 @@ int32_t RuleBasedBreakIterator::handleNext() { result = initialPosition; c = UTEXT_NEXT32(&fText); if (c==U_SENTINEL) { - fDone = TRUE; + fDone = true; return UBRK_DONE; } @@ -995,7 +979,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { #endif // if we're already at the start of the text, return DONE. - if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) { + if (fData == nullptr || UTEXT_GETNATIVEINDEX(&fText)==0) { return BreakIterator::DONE; } @@ -1103,10 +1087,10 @@ int32_t RuleBasedBreakIterator::getRuleStatusVec( // //------------------------------------------------------------------------------- const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { - const uint8_t *retPtr = NULL; + const uint8_t *retPtr = nullptr; length = 0; - if (fData != NULL) { + if (fData != nullptr) { retPtr = (const uint8_t *)fData->fHeader; length = fData->fHeader->fLength; } @@ -1117,16 +1101,16 @@ const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { RuleBasedBreakIterator *RuleBasedBreakIterator::createBufferClone( void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) { if (U_FAILURE(status)){ - return NULL; + return nullptr; } if (bufferSize == 0) { bufferSize = 1; // preflighting for deprecated functionality - return NULL; + return nullptr; } BreakIterator *clonedBI = clone(); - if (clonedBI == NULL) { + if (clonedBI == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { status = U_SAFECLONE_ALLOCATED_WARNING; @@ -1139,21 +1123,21 @@ U_NAMESPACE_END static icu::UStack *gLanguageBreakFactories = nullptr; static const icu::UnicodeString *gEmptyString = nullptr; -static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; -static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER; +static icu::UInitOnce gLanguageBreakFactoriesInitOnce {}; +static icu::UInitOnce gRBBIInitOnce {}; /** * Release all static memory held by breakiterator. */ U_CDECL_BEGIN -UBool U_CALLCONV rbbi_cleanup(void) { +UBool U_CALLCONV rbbi_cleanup() { delete gLanguageBreakFactories; gLanguageBreakFactories = nullptr; delete gEmptyString; gEmptyString = nullptr; gLanguageBreakFactoriesInitOnce.reset(); gRBBIInitOnce.reset(); - return TRUE; + return true; } U_CDECL_END @@ -1171,14 +1155,14 @@ static void U_CALLCONV rbbiInit() { static void U_CALLCONV initLanguageFactories() { UErrorCode status = U_ZERO_ERROR; - U_ASSERT(gLanguageBreakFactories == NULL); - gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); - if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { + U_ASSERT(gLanguageBreakFactories == nullptr); + gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status); + if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) { ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); gLanguageBreakFactories->push(builtIn, status); #ifdef U_LOCAL_SERVICE_HOOK LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); - if (extra != NULL) { + if (extra != nullptr) { gLanguageBreakFactories->push(extra, status); } #endif @@ -1191,16 +1175,16 @@ static const LanguageBreakEngine* getLanguageBreakEngineFromFactory(UChar32 c) { umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); - if (gLanguageBreakFactories == NULL) { - return NULL; + if (gLanguageBreakFactories == nullptr) { + return nullptr; } int32_t i = gLanguageBreakFactories->size(); - const LanguageBreakEngine *lbe = NULL; + const LanguageBreakEngine *lbe = nullptr; while (--i >= 0) { LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); lbe = factory->getEngineFor(c); - if (lbe != NULL) { + if (lbe != nullptr) { break; } } @@ -1216,15 +1200,15 @@ getLanguageBreakEngineFromFactory(UChar32 c) //------------------------------------------------------------------------------- const LanguageBreakEngine * RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { - const LanguageBreakEngine *lbe = NULL; + const LanguageBreakEngine *lbe = nullptr; UErrorCode status = U_ZERO_ERROR; - if (fLanguageBreakEngines == NULL) { + if (fLanguageBreakEngines == nullptr) { fLanguageBreakEngines = new UStack(status); - if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { + if (fLanguageBreakEngines == nullptr || U_FAILURE(status)) { delete fLanguageBreakEngines; fLanguageBreakEngines = 0; - return NULL; + return nullptr; } } @@ -1241,7 +1225,7 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { lbe = getLanguageBreakEngineFromFactory(c); // If we got one, use it and push it on our stack. - if (lbe != NULL) { + if (lbe != nullptr) { fLanguageBreakEngines->push((void *)lbe, status); // Even if we can't remember it, we can keep looking it up, so // return it even if the push fails. @@ -1250,9 +1234,9 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { // No engine is forthcoming for this character. Add it to the // reject set. Create the reject break engine if needed. - if (fUnhandledBreakEngine == NULL) { + if (fUnhandledBreakEngine == nullptr) { fUnhandledBreakEngine = new UnhandledEngine(status); - if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { + if (U_SUCCESS(status) && fUnhandledBreakEngine == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } @@ -1264,7 +1248,7 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { if (U_FAILURE(status)) { delete fUnhandledBreakEngine; fUnhandledBreakEngine = 0; - return NULL; + return nullptr; } } @@ -1289,7 +1273,7 @@ void RuleBasedBreakIterator::dumpTables() { const UnicodeString& RuleBasedBreakIterator::getRules() const { - if (fData != NULL) { + if (fData != nullptr) { return fData->getRuleSourceString(); } else { umtx_initOnce(gRBBIInitOnce, &rbbiInit); |