diff options
author | neksard <neksard@yandex-team.ru> | 2022-02-10 16:45:23 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:23 +0300 |
commit | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (patch) | |
tree | 83bf5c8c8047c42d8475e6095df90ccdc3d1b57f /contrib/libs/icu/i18n/coleitr.cpp | |
parent | d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (diff) | |
download | ydb-8f7cf138264e0caa318144bf8a2c950e0b0a8593.tar.gz |
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/i18n/coleitr.cpp')
-rw-r--r-- | contrib/libs/icu/i18n/coleitr.cpp | 938 |
1 files changed, 469 insertions, 469 deletions
diff --git a/contrib/libs/icu/i18n/coleitr.cpp b/contrib/libs/icu/i18n/coleitr.cpp index 64d3ab4d2b..912ae63afc 100644 --- a/contrib/libs/icu/i18n/coleitr.cpp +++ b/contrib/libs/icu/i18n/coleitr.cpp @@ -1,473 +1,473 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* Copyright (C) 1996-2014, International Business Machines Corporation and -* others. All Rights Reserved. -******************************************************************************* -*/ - -/* -* File coleitr.cpp -* -* Created by: Helena Shih -* -* Modification History: -* -* Date Name Description -* -* 6/23/97 helena Adding comments to make code more readable. -* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java -* 12/10/99 aliu Ported Thai collation support from Java. -* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) -* 02/19/01 swquek Removed CollationElementIterator() since it is -* private constructor and no calls are made to it -* 2012-2014 markus Rewritten in C++ again. -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_COLLATION - +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 1996-2014, International Business Machines Corporation and +* others. All Rights Reserved. +******************************************************************************* +*/ + +/* +* File coleitr.cpp +* +* Created by: Helena Shih +* +* Modification History: +* +* Date Name Description +* +* 6/23/97 helena Adding comments to make code more readable. +* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java +* 12/10/99 aliu Ported Thai collation support from Java. +* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) +* 02/19/01 swquek Removed CollationElementIterator() since it is +* private constructor and no calls are made to it +* 2012-2014 markus Rewritten in C++ again. +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + #include "unicode/chariter.h" -#include "unicode/coleitr.h" -#include "unicode/tblcoll.h" -#include "unicode/ustring.h" -#include "cmemory.h" -#include "collation.h" -#include "collationdata.h" -#include "collationiterator.h" -#include "collationsets.h" -#include "collationtailoring.h" -#include "uassert.h" -#include "uhash.h" -#include "utf16collationiterator.h" -#include "uvectr32.h" - -/* Constants --------------------------------------------------------------- */ - -U_NAMESPACE_BEGIN - -UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) - -/* CollationElementIterator public constructor/destructor ------------------ */ - -CollationElementIterator::CollationElementIterator( - const CollationElementIterator& other) - : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { - *this = other; -} - -CollationElementIterator::~CollationElementIterator() -{ - delete iter_; - delete offsets_; -} - -/* CollationElementIterator public methods --------------------------------- */ - -namespace { - -uint32_t getFirstHalf(uint32_t p, uint32_t lower32) { - return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff); -} -uint32_t getSecondHalf(uint32_t p, uint32_t lower32) { - return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); -} -UBool ceNeedsTwoParts(int64_t ce) { - return (ce & INT64_C(0xffff00ff003f)) != 0; -} - -} // namespace - -int32_t CollationElementIterator::getOffset() const -{ - if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { - // CollationIterator::previousCE() decrements the CEs length - // while it pops CEs from its internal buffer. - int32_t i = iter_->getCEsLength(); - if (otherHalf_ != 0) { - // Return the trailing CE offset while we are in the middle of a 64-bit CE. - ++i; - } - U_ASSERT(i < offsets_->size()); - return offsets_->elementAti(i); - } - return iter_->getOffset(); -} - -/** -* Get the ordering priority of the next character in the string. -* @return the next character's ordering. Returns NULLORDER if an error has -* occured or if the end of string has been reached -*/ -int32_t CollationElementIterator::next(UErrorCode& status) -{ - if (U_FAILURE(status)) { return NULLORDER; } - if (dir_ > 1) { - // Continue forward iteration. Test this first. - if (otherHalf_ != 0) { - uint32_t oh = otherHalf_; - otherHalf_ = 0; - return oh; - } - } else if (dir_ == 1) { - // next() after setOffset() - dir_ = 2; - } else if (dir_ == 0) { - // The iter_ is already reset to the start of the text. - dir_ = 2; - } else /* dir_ < 0 */ { - // illegal change of direction - status = U_INVALID_STATE_ERROR; - return NULLORDER; - } - // No need to keep all CEs in the buffer when we iterate. - iter_->clearCEsIfNoneRemaining(); - int64_t ce = iter_->nextCE(status); - if (ce == Collation::NO_CE) { return NULLORDER; } - // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. - uint32_t p = (uint32_t)(ce >> 32); - uint32_t lower32 = (uint32_t)ce; - uint32_t firstHalf = getFirstHalf(p, lower32); - uint32_t secondHalf = getSecondHalf(p, lower32); - if (secondHalf != 0) { - otherHalf_ = secondHalf | 0xc0; // continuation CE - } - return firstHalf; -} - -UBool CollationElementIterator::operator!=( - const CollationElementIterator& other) const -{ - return !(*this == other); -} - -UBool CollationElementIterator::operator==( - const CollationElementIterator& that) const -{ - if (this == &that) { - return TRUE; - } - - return - (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && - otherHalf_ == that.otherHalf_ && - normalizeDir() == that.normalizeDir() && - string_ == that.string_ && - *iter_ == *that.iter_; -} - -/** -* Get the ordering priority of the previous collation element in the string. -* @param status the error code status. -* @return the previous element's ordering. Returns NULLORDER if an error has -* occured or if the start of string has been reached. -*/ -int32_t CollationElementIterator::previous(UErrorCode& status) -{ - if (U_FAILURE(status)) { return NULLORDER; } - if (dir_ < 0) { - // Continue backwards iteration. Test this first. - if (otherHalf_ != 0) { - uint32_t oh = otherHalf_; - otherHalf_ = 0; - return oh; - } - } else if (dir_ == 0) { - iter_->resetToOffset(string_.length()); - dir_ = -1; - } else if (dir_ == 1) { - // previous() after setOffset() - dir_ = -1; - } else /* dir_ > 1 */ { - // illegal change of direction - status = U_INVALID_STATE_ERROR; - return NULLORDER; - } - if (offsets_ == NULL) { - offsets_ = new UVector32(status); - if (offsets_ == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return NULLORDER; - } - } - // If we already have expansion CEs, then we also have offsets. - // Otherwise remember the trailing offset in case we need to - // write offsets for an artificial expansion. - int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; - int64_t ce = iter_->previousCE(*offsets_, status); - if (ce == Collation::NO_CE) { return NULLORDER; } - // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. - uint32_t p = (uint32_t)(ce >> 32); - uint32_t lower32 = (uint32_t)ce; - uint32_t firstHalf = getFirstHalf(p, lower32); - uint32_t secondHalf = getSecondHalf(p, lower32); - if (secondHalf != 0) { - if (offsets_->isEmpty()) { - // When we convert a single 64-bit CE into two 32-bit CEs, - // we need to make this artificial expansion behave like a normal expansion. - // See CollationIterator::previousCE(). - offsets_->addElement(iter_->getOffset(), status); - offsets_->addElement(limitOffset, status); - } - otherHalf_ = firstHalf; - return secondHalf | 0xc0; // continuation CE - } - return firstHalf; -} - -/** -* Resets the cursor to the beginning of the string. -*/ -void CollationElementIterator::reset() -{ - iter_ ->resetToOffset(0); - otherHalf_ = 0; - dir_ = 0; -} - -void CollationElementIterator::setOffset(int32_t newOffset, - UErrorCode& status) -{ - if (U_FAILURE(status)) { return; } - if (0 < newOffset && newOffset < string_.length()) { - int32_t offset = newOffset; - do { - UChar c = string_.charAt(offset); - if (!rbc_->isUnsafe(c) || - (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { - break; - } - // Back up to before this unsafe character. - --offset; - } while (offset > 0); - if (offset < newOffset) { - // We might have backed up more than necessary. - // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, - // but for text "chu" setOffset(2) should remain at 2 - // although we initially back up to offset 0. - // Find the last safe offset no greater than newOffset by iterating forward. - int32_t lastSafeOffset = offset; - do { - iter_->resetToOffset(lastSafeOffset); - do { - iter_->nextCE(status); - if (U_FAILURE(status)) { return; } - } while ((offset = iter_->getOffset()) == lastSafeOffset); - if (offset <= newOffset) { - lastSafeOffset = offset; - } - } while (offset < newOffset); - newOffset = lastSafeOffset; - } - } - iter_->resetToOffset(newOffset); - otherHalf_ = 0; - dir_ = 1; -} - -/** -* Sets the source to the new source string. -*/ -void CollationElementIterator::setText(const UnicodeString& source, - UErrorCode& status) -{ - if (U_FAILURE(status)) { - return; - } - - string_ = source; - const UChar *s = string_.getBuffer(); - CollationIterator *newIter; - UBool numeric = rbc_->settings->isNumeric(); - if (rbc_->settings->dontCheckFCD()) { - newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); - } else { - newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); - } - if (newIter == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - delete iter_; - iter_ = newIter; - otherHalf_ = 0; - dir_ = 0; -} - -// Sets the source to the new character iterator. -void CollationElementIterator::setText(CharacterIterator& source, - UErrorCode& status) -{ - if (U_FAILURE(status)) - return; - - source.getText(string_); - setText(string_, status); -} - -int32_t CollationElementIterator::strengthOrder(int32_t order) const -{ - UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); - // Mask off the unwanted differences. - if (s == UCOL_PRIMARY) { - order &= 0xffff0000; - } - else if (s == UCOL_SECONDARY) { - order &= 0xffffff00; - } - - return order; -} - -/* CollationElementIterator private constructors/destructors --------------- */ - +#include "unicode/coleitr.h" +#include "unicode/tblcoll.h" +#include "unicode/ustring.h" +#include "cmemory.h" +#include "collation.h" +#include "collationdata.h" +#include "collationiterator.h" +#include "collationsets.h" +#include "collationtailoring.h" +#include "uassert.h" +#include "uhash.h" +#include "utf16collationiterator.h" +#include "uvectr32.h" + +/* Constants --------------------------------------------------------------- */ + +U_NAMESPACE_BEGIN + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) + +/* CollationElementIterator public constructor/destructor ------------------ */ + +CollationElementIterator::CollationElementIterator( + const CollationElementIterator& other) + : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { + *this = other; +} + +CollationElementIterator::~CollationElementIterator() +{ + delete iter_; + delete offsets_; +} + +/* CollationElementIterator public methods --------------------------------- */ + +namespace { + +uint32_t getFirstHalf(uint32_t p, uint32_t lower32) { + return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff); +} +uint32_t getSecondHalf(uint32_t p, uint32_t lower32) { + return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); +} +UBool ceNeedsTwoParts(int64_t ce) { + return (ce & INT64_C(0xffff00ff003f)) != 0; +} + +} // namespace + +int32_t CollationElementIterator::getOffset() const +{ + if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { + // CollationIterator::previousCE() decrements the CEs length + // while it pops CEs from its internal buffer. + int32_t i = iter_->getCEsLength(); + if (otherHalf_ != 0) { + // Return the trailing CE offset while we are in the middle of a 64-bit CE. + ++i; + } + U_ASSERT(i < offsets_->size()); + return offsets_->elementAti(i); + } + return iter_->getOffset(); +} + +/** +* Get the ordering priority of the next character in the string. +* @return the next character's ordering. Returns NULLORDER if an error has +* occured or if the end of string has been reached +*/ +int32_t CollationElementIterator::next(UErrorCode& status) +{ + if (U_FAILURE(status)) { return NULLORDER; } + if (dir_ > 1) { + // Continue forward iteration. Test this first. + if (otherHalf_ != 0) { + uint32_t oh = otherHalf_; + otherHalf_ = 0; + return oh; + } + } else if (dir_ == 1) { + // next() after setOffset() + dir_ = 2; + } else if (dir_ == 0) { + // The iter_ is already reset to the start of the text. + dir_ = 2; + } else /* dir_ < 0 */ { + // illegal change of direction + status = U_INVALID_STATE_ERROR; + return NULLORDER; + } + // No need to keep all CEs in the buffer when we iterate. + iter_->clearCEsIfNoneRemaining(); + int64_t ce = iter_->nextCE(status); + if (ce == Collation::NO_CE) { return NULLORDER; } + // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. + uint32_t p = (uint32_t)(ce >> 32); + uint32_t lower32 = (uint32_t)ce; + uint32_t firstHalf = getFirstHalf(p, lower32); + uint32_t secondHalf = getSecondHalf(p, lower32); + if (secondHalf != 0) { + otherHalf_ = secondHalf | 0xc0; // continuation CE + } + return firstHalf; +} + +UBool CollationElementIterator::operator!=( + const CollationElementIterator& other) const +{ + return !(*this == other); +} + +UBool CollationElementIterator::operator==( + const CollationElementIterator& that) const +{ + if (this == &that) { + return TRUE; + } + + return + (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && + otherHalf_ == that.otherHalf_ && + normalizeDir() == that.normalizeDir() && + string_ == that.string_ && + *iter_ == *that.iter_; +} + /** -* This is the "real" constructor for this class; it constructs an iterator -* over the source text using the specified collator -*/ -CollationElementIterator::CollationElementIterator( - const UnicodeString &source, - const RuleBasedCollator *coll, - UErrorCode &status) - : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { - setText(source, status); -} - +* Get the ordering priority of the previous collation element in the string. +* @param status the error code status. +* @return the previous element's ordering. Returns NULLORDER if an error has +* occured or if the start of string has been reached. +*/ +int32_t CollationElementIterator::previous(UErrorCode& status) +{ + if (U_FAILURE(status)) { return NULLORDER; } + if (dir_ < 0) { + // Continue backwards iteration. Test this first. + if (otherHalf_ != 0) { + uint32_t oh = otherHalf_; + otherHalf_ = 0; + return oh; + } + } else if (dir_ == 0) { + iter_->resetToOffset(string_.length()); + dir_ = -1; + } else if (dir_ == 1) { + // previous() after setOffset() + dir_ = -1; + } else /* dir_ > 1 */ { + // illegal change of direction + status = U_INVALID_STATE_ERROR; + return NULLORDER; + } + if (offsets_ == NULL) { + offsets_ = new UVector32(status); + if (offsets_ == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return NULLORDER; + } + } + // If we already have expansion CEs, then we also have offsets. + // Otherwise remember the trailing offset in case we need to + // write offsets for an artificial expansion. + int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; + int64_t ce = iter_->previousCE(*offsets_, status); + if (ce == Collation::NO_CE) { return NULLORDER; } + // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. + uint32_t p = (uint32_t)(ce >> 32); + uint32_t lower32 = (uint32_t)ce; + uint32_t firstHalf = getFirstHalf(p, lower32); + uint32_t secondHalf = getSecondHalf(p, lower32); + if (secondHalf != 0) { + if (offsets_->isEmpty()) { + // When we convert a single 64-bit CE into two 32-bit CEs, + // we need to make this artificial expansion behave like a normal expansion. + // See CollationIterator::previousCE(). + offsets_->addElement(iter_->getOffset(), status); + offsets_->addElement(limitOffset, status); + } + otherHalf_ = firstHalf; + return secondHalf | 0xc0; // continuation CE + } + return firstHalf; +} + /** -* This is the "real" constructor for this class; it constructs an iterator over -* the source text using the specified collator -*/ -CollationElementIterator::CollationElementIterator( - const CharacterIterator &source, - const RuleBasedCollator *coll, - UErrorCode &status) - : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { - // We only call source.getText() which should be const anyway. - setText(const_cast<CharacterIterator &>(source), status); -} - -/* CollationElementIterator private methods -------------------------------- */ - -const CollationElementIterator& CollationElementIterator::operator=( - const CollationElementIterator& other) -{ - if (this == &other) { - return *this; - } - - CollationIterator *newIter; - const FCDUTF16CollationIterator *otherFCDIter = - dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); - if(otherFCDIter != NULL) { - newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); - } else { - const UTF16CollationIterator *otherIter = - dynamic_cast<const UTF16CollationIterator *>(other.iter_); - if(otherIter != NULL) { - newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); - } else { - newIter = NULL; - } - } - if(newIter != NULL) { - delete iter_; - iter_ = newIter; - rbc_ = other.rbc_; - otherHalf_ = other.otherHalf_; - dir_ = other.dir_; - - string_ = other.string_; - } - if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { - UErrorCode errorCode = U_ZERO_ERROR; - if(offsets_ == NULL) { - offsets_ = new UVector32(other.offsets_->size(), errorCode); - } - if(offsets_ != NULL) { - offsets_->assign(*other.offsets_, errorCode); - } - } - return *this; -} - -namespace { - -class MaxExpSink : public ContractionsAndExpansions::CESink { -public: - MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} - virtual ~MaxExpSink(); - virtual void handleCE(int64_t /*ce*/) {} - virtual void handleExpansion(const int64_t ces[], int32_t length) { - if (length <= 1) { - // We do not need to add single CEs into the map. - return; - } - int32_t count = 0; // number of CE "halves" - for (int32_t i = 0; i < length; ++i) { - count += ceNeedsTwoParts(ces[i]) ? 2 : 1; - } - // last "half" of the last CE - int64_t ce = ces[length - 1]; - uint32_t p = (uint32_t)(ce >> 32); - uint32_t lower32 = (uint32_t)ce; - uint32_t lastHalf = getSecondHalf(p, lower32); - if (lastHalf == 0) { - lastHalf = getFirstHalf(p, lower32); - U_ASSERT(lastHalf != 0); - } else { - lastHalf |= 0xc0; // old-style continuation CE - } - if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { - uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); - } - } - -private: - UHashtable *maxExpansions; - UErrorCode &errorCode; -}; - -MaxExpSink::~MaxExpSink() {} - -} // namespace - -UHashtable * -CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { return NULL; } - UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, - uhash_compareLong, &errorCode); - if (U_FAILURE(errorCode)) { return NULL; } - MaxExpSink sink(maxExpansions, errorCode); - ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); - if (U_FAILURE(errorCode)) { - uhash_close(maxExpansions); - return NULL; - } - return maxExpansions; -} - -int32_t -CollationElementIterator::getMaxExpansion(int32_t order) const { - return getMaxExpansion(rbc_->tailoring->maxExpansions, order); -} - -int32_t -CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { - if (order == 0) { return 1; } - int32_t max; - if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { - return max; - } - if ((order & 0xc0) == 0xc0) { - // old-style continuation CE - return 2; - } else { - return 1; - } -} - -U_NAMESPACE_END - -#endif /* #if !UCONFIG_NO_COLLATION */ +* Resets the cursor to the beginning of the string. +*/ +void CollationElementIterator::reset() +{ + iter_ ->resetToOffset(0); + otherHalf_ = 0; + dir_ = 0; +} + +void CollationElementIterator::setOffset(int32_t newOffset, + UErrorCode& status) +{ + if (U_FAILURE(status)) { return; } + if (0 < newOffset && newOffset < string_.length()) { + int32_t offset = newOffset; + do { + UChar c = string_.charAt(offset); + if (!rbc_->isUnsafe(c) || + (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { + break; + } + // Back up to before this unsafe character. + --offset; + } while (offset > 0); + if (offset < newOffset) { + // We might have backed up more than necessary. + // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, + // but for text "chu" setOffset(2) should remain at 2 + // although we initially back up to offset 0. + // Find the last safe offset no greater than newOffset by iterating forward. + int32_t lastSafeOffset = offset; + do { + iter_->resetToOffset(lastSafeOffset); + do { + iter_->nextCE(status); + if (U_FAILURE(status)) { return; } + } while ((offset = iter_->getOffset()) == lastSafeOffset); + if (offset <= newOffset) { + lastSafeOffset = offset; + } + } while (offset < newOffset); + newOffset = lastSafeOffset; + } + } + iter_->resetToOffset(newOffset); + otherHalf_ = 0; + dir_ = 1; +} + +/** +* Sets the source to the new source string. +*/ +void CollationElementIterator::setText(const UnicodeString& source, + UErrorCode& status) +{ + if (U_FAILURE(status)) { + return; + } + + string_ = source; + const UChar *s = string_.getBuffer(); + CollationIterator *newIter; + UBool numeric = rbc_->settings->isNumeric(); + if (rbc_->settings->dontCheckFCD()) { + newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); + } else { + newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); + } + if (newIter == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + delete iter_; + iter_ = newIter; + otherHalf_ = 0; + dir_ = 0; +} + +// Sets the source to the new character iterator. +void CollationElementIterator::setText(CharacterIterator& source, + UErrorCode& status) +{ + if (U_FAILURE(status)) + return; + + source.getText(string_); + setText(string_, status); +} + +int32_t CollationElementIterator::strengthOrder(int32_t order) const +{ + UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); + // Mask off the unwanted differences. + if (s == UCOL_PRIMARY) { + order &= 0xffff0000; + } + else if (s == UCOL_SECONDARY) { + order &= 0xffffff00; + } + + return order; +} + +/* CollationElementIterator private constructors/destructors --------------- */ + +/** +* This is the "real" constructor for this class; it constructs an iterator +* over the source text using the specified collator +*/ +CollationElementIterator::CollationElementIterator( + const UnicodeString &source, + const RuleBasedCollator *coll, + UErrorCode &status) + : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { + setText(source, status); +} + +/** +* This is the "real" constructor for this class; it constructs an iterator over +* the source text using the specified collator +*/ +CollationElementIterator::CollationElementIterator( + const CharacterIterator &source, + const RuleBasedCollator *coll, + UErrorCode &status) + : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { + // We only call source.getText() which should be const anyway. + setText(const_cast<CharacterIterator &>(source), status); +} + +/* CollationElementIterator private methods -------------------------------- */ + +const CollationElementIterator& CollationElementIterator::operator=( + const CollationElementIterator& other) +{ + if (this == &other) { + return *this; + } + + CollationIterator *newIter; + const FCDUTF16CollationIterator *otherFCDIter = + dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); + if(otherFCDIter != NULL) { + newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); + } else { + const UTF16CollationIterator *otherIter = + dynamic_cast<const UTF16CollationIterator *>(other.iter_); + if(otherIter != NULL) { + newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); + } else { + newIter = NULL; + } + } + if(newIter != NULL) { + delete iter_; + iter_ = newIter; + rbc_ = other.rbc_; + otherHalf_ = other.otherHalf_; + dir_ = other.dir_; + + string_ = other.string_; + } + if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { + UErrorCode errorCode = U_ZERO_ERROR; + if(offsets_ == NULL) { + offsets_ = new UVector32(other.offsets_->size(), errorCode); + } + if(offsets_ != NULL) { + offsets_->assign(*other.offsets_, errorCode); + } + } + return *this; +} + +namespace { + +class MaxExpSink : public ContractionsAndExpansions::CESink { +public: + MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} + virtual ~MaxExpSink(); + virtual void handleCE(int64_t /*ce*/) {} + virtual void handleExpansion(const int64_t ces[], int32_t length) { + if (length <= 1) { + // We do not need to add single CEs into the map. + return; + } + int32_t count = 0; // number of CE "halves" + for (int32_t i = 0; i < length; ++i) { + count += ceNeedsTwoParts(ces[i]) ? 2 : 1; + } + // last "half" of the last CE + int64_t ce = ces[length - 1]; + uint32_t p = (uint32_t)(ce >> 32); + uint32_t lower32 = (uint32_t)ce; + uint32_t lastHalf = getSecondHalf(p, lower32); + if (lastHalf == 0) { + lastHalf = getFirstHalf(p, lower32); + U_ASSERT(lastHalf != 0); + } else { + lastHalf |= 0xc0; // old-style continuation CE + } + if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { + uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); + } + } + +private: + UHashtable *maxExpansions; + UErrorCode &errorCode; +}; + +MaxExpSink::~MaxExpSink() {} + +} // namespace + +UHashtable * +CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return NULL; } + UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, + uhash_compareLong, &errorCode); + if (U_FAILURE(errorCode)) { return NULL; } + MaxExpSink sink(maxExpansions, errorCode); + ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); + if (U_FAILURE(errorCode)) { + uhash_close(maxExpansions); + return NULL; + } + return maxExpansions; +} + +int32_t +CollationElementIterator::getMaxExpansion(int32_t order) const { + return getMaxExpansion(rbc_->tailoring->maxExpansions, order); +} + +int32_t +CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { + if (order == 0) { return 1; } + int32_t max; + if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { + return max; + } + if ((order & 0xc0) == 0xc0) { + // old-style continuation CE + return 2; + } else { + return 1; + } +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_COLLATION */ |