diff options
author | mcheshkov <mcheshkov@yandex-team.ru> | 2022-02-10 16:46:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:15 +0300 |
commit | e9d19cec64684c9c1e6b0c98297e5b895cf904fe (patch) | |
tree | 2768b1223e96a8a0610a93d18425d9647c1123c8 /contrib/libs/icu/common/locdistance.cpp | |
parent | 60040c91ffe701a84689b2c6310ff845e65cff42 (diff) | |
download | ydb-e9d19cec64684c9c1e6b0c98297e5b895cf904fe.tar.gz |
Restoring authorship annotation for <mcheshkov@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/common/locdistance.cpp')
-rw-r--r-- | contrib/libs/icu/common/locdistance.cpp | 830 |
1 files changed, 415 insertions, 415 deletions
diff --git a/contrib/libs/icu/common/locdistance.cpp b/contrib/libs/icu/common/locdistance.cpp index 18e4d91bce..eb218b3efe 100644 --- a/contrib/libs/icu/common/locdistance.cpp +++ b/contrib/libs/icu/common/locdistance.cpp @@ -1,415 +1,415 @@ -// © 2019 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html#License - -// locdistance.cpp -// created: 2019may08 Markus W. Scherer - -#include "unicode/utypes.h" -#include "unicode/bytestrie.h" -#include "unicode/localematcher.h" -#include "unicode/locid.h" -#include "unicode/uobject.h" -#include "unicode/ures.h" -#include "cstring.h" -#include "locdistance.h" -#include "loclikelysubtags.h" -#include "uassert.h" -#include "ucln_cmn.h" -#include "uinvchar.h" -#include "umutex.h" - -U_NAMESPACE_BEGIN - -namespace { - -/** - * Bit flag used on the last character of a subtag in the trie. - * Must be set consistently by the builder and the lookup code. - */ -constexpr int32_t END_OF_SUBTAG = 0x80; -/** Distance value bit flag, set by the builder. */ -constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; -/** Distance value bit flag, set by trieNext(). */ -constexpr int32_t DISTANCE_IS_FINAL = 0x100; -constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; - -constexpr int32_t ABOVE_THRESHOLD = 100; - -// Indexes into array of distances. -enum { - IX_DEF_LANG_DISTANCE, - IX_DEF_SCRIPT_DISTANCE, - IX_DEF_REGION_DISTANCE, - IX_MIN_REGION_DISTANCE, - IX_LIMIT -}; - -LocaleDistance *gLocaleDistance = nullptr; -UInitOnce gInitOnce = U_INITONCE_INITIALIZER; - -UBool U_CALLCONV cleanup() { - delete gLocaleDistance; - gLocaleDistance = nullptr; - gInitOnce.reset(); - return TRUE; -} - -} // namespace - -void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { - // This function is invoked only via umtx_initOnce(). - U_ASSERT(gLocaleDistance == nullptr); - const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode); - if (U_FAILURE(errorCode)) { return; } - const LocaleDistanceData &data = likely.getDistanceData(); - if (data.distanceTrieBytes == nullptr || - data.regionToPartitions == nullptr || data.partitions == nullptr || - // ok if no paradigms - data.distances == nullptr) { - errorCode = U_MISSING_RESOURCE_ERROR; - return; - } - gLocaleDistance = new LocaleDistance(data, likely); - if (gLocaleDistance == nullptr) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup); -} - -const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { return nullptr; } - umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode); - return gLocaleDistance; -} - -LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) : - likelySubtags(likely), - trie(data.distanceTrieBytes), - regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), - paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), - defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), - defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), - defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), - minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { - // For the default demotion value, use the - // default region distance between unrelated Englishes. - // Thus, unless demotion is turned off, - // a mere region difference for one desired locale - // is as good as a perfect match for the next following desired locale. - // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. - LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR); - LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR); - const LSR *p_enGB = &enGB; - int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1, - shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY); - defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance); -} - -int32_t LocaleDistance::getBestIndexAndDistance( - const LSR &desired, - const LSR **supportedLSRs, int32_t supportedLSRsLength, - int32_t shiftedThreshold, - ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const { - BytesTrie iter(trie); - // Look up the desired language only once for all supported LSRs. - // Its "distance" is either a match point value of 0, or a non-match negative value. - // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. - int32_t desLangDistance = trieNext(iter, desired.language, false); - uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; - // Index of the supported LSR with the lowest distance. - int32_t bestIndex = -1; - // Cached lookup info from XLikelySubtags.compareLikely(). - int32_t bestLikelyInfo = -1; - for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { - const LSR &supported = *supportedLSRs[slIndex]; - bool star = false; - int32_t distance = desLangDistance; - if (distance >= 0) { - U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); - if (slIndex != 0) { - iter.resetToState64(desLangState); - } - distance = trieNext(iter, supported.language, true); - } - // Note: The data builder verifies that there are no rules with "any" (*) language and - // real (non *) script or region subtags. - // This means that if the lookup for either language fails we can use - // the default distances without further lookups. - int32_t flags; - if (distance >= 0) { - flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; - distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; - } else { // <*, *> - if (uprv_strcmp(desired.language, supported.language) == 0) { - distance = 0; - } else { - distance = defaultLanguageDistance; - } - flags = 0; - star = true; - } - U_ASSERT(0 <= distance && distance <= 100); - // Round up the shifted threshold (if fraction bits are not 0) - // for comparison with un-shifted distances until we need fraction bits. - // (If we simply shifted non-zero fraction bits away, then we might ignore a language - // when it's really still a micro distance below the threshold.) - int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT; - // We implement "favor subtag" by reducing the language subtag distance - // (unscientifically reducing it to a quarter of the normal value), - // so that the script distance is relatively more important. - // For example, given a default language distance of 80, we reduce it to 20, - // which is below the default threshold of 50, which is the default script distance. - if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { - distance >>= 2; - } - // Let distance == roundedThreshold pass until the tie-breaker logic - // at the end of the loop. - if (distance > roundedThreshold) { - continue; - } - - int32_t scriptDistance; - if (star || flags != 0) { - if (uprv_strcmp(desired.script, supported.script) == 0) { - scriptDistance = 0; - } else { - scriptDistance = defaultScriptDistance; - } - } else { - scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), - desired.script, supported.script); - flags = scriptDistance & DISTANCE_IS_FINAL; - scriptDistance &= ~DISTANCE_IS_FINAL; - } - distance += scriptDistance; - if (distance > roundedThreshold) { - continue; - } - - if (uprv_strcmp(desired.region, supported.region) == 0) { - // regionDistance = 0 - } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { - distance += defaultRegionDistance; - } else { - int32_t remainingThreshold = roundedThreshold - distance; - if (minRegionDistance > remainingThreshold) { - continue; - } - - // From here on we know the regions are not equal. - // Map each region to zero or more partitions. (zero = one non-matching string) - // (Each array of single-character partition strings is encoded as one string.) - // If either side has more than one, then we find the maximum distance. - // This could be optimized by adding some more structure, but probably not worth it. - distance += getRegionPartitionsDistance( - iter, iter.getState64(), - partitionsForRegion(desired), - partitionsForRegion(supported), - remainingThreshold); - } - int32_t shiftedDistance = shiftDistance(distance); - if (shiftedDistance == 0) { - // Distinguish between equivalent but originally unequal locales via an - // additional micro distance. - shiftedDistance |= (desired.flags ^ supported.flags); - if (shiftedDistance < shiftedThreshold) { - if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || - // Is there also a match when we swap desired/supported? - isMatch(supported, desired, shiftedThreshold, favorSubtag)) { - if (shiftedDistance == 0) { - return slIndex << INDEX_SHIFT; - } - bestIndex = slIndex; - shiftedThreshold = shiftedDistance; - bestLikelyInfo = -1; - } - } - } else { - if (shiftedDistance < shiftedThreshold) { - if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || - // Is there also a match when we swap desired/supported? - isMatch(supported, desired, shiftedThreshold, favorSubtag)) { - bestIndex = slIndex; - shiftedThreshold = shiftedDistance; - bestLikelyInfo = -1; - } - } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) { - if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || - // Is there also a match when we swap desired/supported? - isMatch(supported, desired, shiftedThreshold, favorSubtag)) { - bestLikelyInfo = likelySubtags.compareLikely( - supported, *supportedLSRs[bestIndex], bestLikelyInfo); - if ((bestLikelyInfo & 1) != 0) { - // This supported locale matches as well as the previous best match, - // and neither matches perfectly, - // but this one is "more likely" (has more-default subtags). - bestIndex = slIndex; - } - } - } - } - } - return bestIndex >= 0 ? - (bestIndex << INDEX_SHIFT) | shiftedThreshold : - INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD); -} - -int32_t LocaleDistance::getDesSuppScriptDistance( - BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { - // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. - int32_t distance = trieNext(iter, desired, false); - if (distance >= 0) { - distance = trieNext(iter, supported, true); - } - if (distance < 0) { - UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> - U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); - if (uprv_strcmp(desired, supported) == 0) { - distance = 0; // same script - } else { - distance = iter.getValue(); - U_ASSERT(distance >= 0); - } - if (result == USTRINGTRIE_FINAL_VALUE) { - distance |= DISTANCE_IS_FINAL; - } - } - return distance; -} - -int32_t LocaleDistance::getRegionPartitionsDistance( - BytesTrie &iter, uint64_t startState, - const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { - char desired = *desiredPartitions++; - char supported = *supportedPartitions++; - U_ASSERT(desired != 0 && supported != 0); - // See if we have single desired/supported partitions, from NUL-terminated - // partition strings without explicit length. - bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character - // equivalent to: if (desLength == 1 && suppLength == 1) - if (*desiredPartitions == 0 && !suppLengthGt1) { - // Fastpath for single desired/supported partitions. - UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); - if (USTRINGTRIE_HAS_NEXT(result)) { - result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); - if (USTRINGTRIE_HAS_VALUE(result)) { - return iter.getValue(); - } - } - return getFallbackRegionDistance(iter, startState); - } - - const char *supportedStart = supportedPartitions - 1; // for restart of inner loop - int32_t regionDistance = 0; - // Fall back to * only once, not for each pair of partition strings. - bool star = false; - for (;;) { - // Look up each desired-partition string only once, - // not for each (desired, supported) pair. - UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); - if (USTRINGTRIE_HAS_NEXT(result)) { - uint64_t desState = suppLengthGt1 ? iter.getState64() : 0; - for (;;) { - result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); - int32_t d; - if (USTRINGTRIE_HAS_VALUE(result)) { - d = iter.getValue(); - } else if (star) { - d = 0; - } else { - d = getFallbackRegionDistance(iter, startState); - star = true; - } - if (d > threshold) { - return d; - } else if (regionDistance < d) { - regionDistance = d; - } - if ((supported = *supportedPartitions++) != 0) { - iter.resetToState64(desState); - } else { - break; - } - } - } else if (!star) { - int32_t d = getFallbackRegionDistance(iter, startState); - if (d > threshold) { - return d; - } else if (regionDistance < d) { - regionDistance = d; - } - star = true; - } - if ((desired = *desiredPartitions++) != 0) { - iter.resetToState64(startState); - supportedPartitions = supportedStart; - supported = *supportedPartitions++; - } else { - break; - } - } - return regionDistance; -} - -int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) { -#if U_DEBUG - UStringTrieResult result = -#endif - iter.resetToState64(startState).next(u'*'); // <*, *> - U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); - int32_t distance = iter.getValue(); - U_ASSERT(distance >= 0); - return distance; -} - -int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) { - uint8_t c; - if ((c = *s) == 0) { - return -1; // no empty subtags in the distance data - } - for (;;) { - c = uprv_invCharToAscii(c); - // EBCDIC: If *s is not an invariant character, - // then c is now 0 and will simply not match anything, which is harmless. - uint8_t next = *++s; - if (next != 0) { - if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { - return -1; - } - } else { - // last character of this subtag - UStringTrieResult result = iter.next(c | END_OF_SUBTAG); - if (wantValue) { - if (USTRINGTRIE_HAS_VALUE(result)) { - int32_t value = iter.getValue(); - if (result == USTRINGTRIE_FINAL_VALUE) { - value |= DISTANCE_IS_FINAL; - } - return value; - } - } else { - if (USTRINGTRIE_HAS_NEXT(result)) { - return 0; - } - } - return -1; - } - c = next; - } -} - -UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { - // Linear search for a very short list (length 6 as of 2019), - // because we look for equivalence not equality, and - // because it's easy. - // If there are many paradigm LSRs we should use a hash set - // with custom comparator and hasher. - U_ASSERT(paradigmLSRsLength <= 15); - for (int32_t i = 0; i < paradigmLSRsLength; ++i) { - if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; } - } - return false; -} - -U_NAMESPACE_END +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License + +// locdistance.cpp +// created: 2019may08 Markus W. Scherer + +#include "unicode/utypes.h" +#include "unicode/bytestrie.h" +#include "unicode/localematcher.h" +#include "unicode/locid.h" +#include "unicode/uobject.h" +#include "unicode/ures.h" +#include "cstring.h" +#include "locdistance.h" +#include "loclikelysubtags.h" +#include "uassert.h" +#include "ucln_cmn.h" +#include "uinvchar.h" +#include "umutex.h" + +U_NAMESPACE_BEGIN + +namespace { + +/** + * Bit flag used on the last character of a subtag in the trie. + * Must be set consistently by the builder and the lookup code. + */ +constexpr int32_t END_OF_SUBTAG = 0x80; +/** Distance value bit flag, set by the builder. */ +constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; +/** Distance value bit flag, set by trieNext(). */ +constexpr int32_t DISTANCE_IS_FINAL = 0x100; +constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; + +constexpr int32_t ABOVE_THRESHOLD = 100; + +// Indexes into array of distances. +enum { + IX_DEF_LANG_DISTANCE, + IX_DEF_SCRIPT_DISTANCE, + IX_DEF_REGION_DISTANCE, + IX_MIN_REGION_DISTANCE, + IX_LIMIT +}; + +LocaleDistance *gLocaleDistance = nullptr; +UInitOnce gInitOnce = U_INITONCE_INITIALIZER; + +UBool U_CALLCONV cleanup() { + delete gLocaleDistance; + gLocaleDistance = nullptr; + gInitOnce.reset(); + return TRUE; +} + +} // namespace + +void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { + // This function is invoked only via umtx_initOnce(). + U_ASSERT(gLocaleDistance == nullptr); + const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode); + if (U_FAILURE(errorCode)) { return; } + const LocaleDistanceData &data = likely.getDistanceData(); + if (data.distanceTrieBytes == nullptr || + data.regionToPartitions == nullptr || data.partitions == nullptr || + // ok if no paradigms + data.distances == nullptr) { + errorCode = U_MISSING_RESOURCE_ERROR; + return; + } + gLocaleDistance = new LocaleDistance(data, likely); + if (gLocaleDistance == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup); +} + +const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } + umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode); + return gLocaleDistance; +} + +LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) : + likelySubtags(likely), + trie(data.distanceTrieBytes), + regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), + paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), + defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), + defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), + defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), + minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { + // For the default demotion value, use the + // default region distance between unrelated Englishes. + // Thus, unless demotion is turned off, + // a mere region difference for one desired locale + // is as good as a perfect match for the next following desired locale. + // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. + LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR); + LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR); + const LSR *p_enGB = &enGB; + int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1, + shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY); + defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance); +} + +int32_t LocaleDistance::getBestIndexAndDistance( + const LSR &desired, + const LSR **supportedLSRs, int32_t supportedLSRsLength, + int32_t shiftedThreshold, + ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const { + BytesTrie iter(trie); + // Look up the desired language only once for all supported LSRs. + // Its "distance" is either a match point value of 0, or a non-match negative value. + // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. + int32_t desLangDistance = trieNext(iter, desired.language, false); + uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; + // Index of the supported LSR with the lowest distance. + int32_t bestIndex = -1; + // Cached lookup info from XLikelySubtags.compareLikely(). + int32_t bestLikelyInfo = -1; + for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { + const LSR &supported = *supportedLSRs[slIndex]; + bool star = false; + int32_t distance = desLangDistance; + if (distance >= 0) { + U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); + if (slIndex != 0) { + iter.resetToState64(desLangState); + } + distance = trieNext(iter, supported.language, true); + } + // Note: The data builder verifies that there are no rules with "any" (*) language and + // real (non *) script or region subtags. + // This means that if the lookup for either language fails we can use + // the default distances without further lookups. + int32_t flags; + if (distance >= 0) { + flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; + distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; + } else { // <*, *> + if (uprv_strcmp(desired.language, supported.language) == 0) { + distance = 0; + } else { + distance = defaultLanguageDistance; + } + flags = 0; + star = true; + } + U_ASSERT(0 <= distance && distance <= 100); + // Round up the shifted threshold (if fraction bits are not 0) + // for comparison with un-shifted distances until we need fraction bits. + // (If we simply shifted non-zero fraction bits away, then we might ignore a language + // when it's really still a micro distance below the threshold.) + int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT; + // We implement "favor subtag" by reducing the language subtag distance + // (unscientifically reducing it to a quarter of the normal value), + // so that the script distance is relatively more important. + // For example, given a default language distance of 80, we reduce it to 20, + // which is below the default threshold of 50, which is the default script distance. + if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { + distance >>= 2; + } + // Let distance == roundedThreshold pass until the tie-breaker logic + // at the end of the loop. + if (distance > roundedThreshold) { + continue; + } + + int32_t scriptDistance; + if (star || flags != 0) { + if (uprv_strcmp(desired.script, supported.script) == 0) { + scriptDistance = 0; + } else { + scriptDistance = defaultScriptDistance; + } + } else { + scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), + desired.script, supported.script); + flags = scriptDistance & DISTANCE_IS_FINAL; + scriptDistance &= ~DISTANCE_IS_FINAL; + } + distance += scriptDistance; + if (distance > roundedThreshold) { + continue; + } + + if (uprv_strcmp(desired.region, supported.region) == 0) { + // regionDistance = 0 + } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { + distance += defaultRegionDistance; + } else { + int32_t remainingThreshold = roundedThreshold - distance; + if (minRegionDistance > remainingThreshold) { + continue; + } + + // From here on we know the regions are not equal. + // Map each region to zero or more partitions. (zero = one non-matching string) + // (Each array of single-character partition strings is encoded as one string.) + // If either side has more than one, then we find the maximum distance. + // This could be optimized by adding some more structure, but probably not worth it. + distance += getRegionPartitionsDistance( + iter, iter.getState64(), + partitionsForRegion(desired), + partitionsForRegion(supported), + remainingThreshold); + } + int32_t shiftedDistance = shiftDistance(distance); + if (shiftedDistance == 0) { + // Distinguish between equivalent but originally unequal locales via an + // additional micro distance. + shiftedDistance |= (desired.flags ^ supported.flags); + if (shiftedDistance < shiftedThreshold) { + if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || + // Is there also a match when we swap desired/supported? + isMatch(supported, desired, shiftedThreshold, favorSubtag)) { + if (shiftedDistance == 0) { + return slIndex << INDEX_SHIFT; + } + bestIndex = slIndex; + shiftedThreshold = shiftedDistance; + bestLikelyInfo = -1; + } + } + } else { + if (shiftedDistance < shiftedThreshold) { + if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || + // Is there also a match when we swap desired/supported? + isMatch(supported, desired, shiftedThreshold, favorSubtag)) { + bestIndex = slIndex; + shiftedThreshold = shiftedDistance; + bestLikelyInfo = -1; + } + } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) { + if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || + // Is there also a match when we swap desired/supported? + isMatch(supported, desired, shiftedThreshold, favorSubtag)) { + bestLikelyInfo = likelySubtags.compareLikely( + supported, *supportedLSRs[bestIndex], bestLikelyInfo); + if ((bestLikelyInfo & 1) != 0) { + // This supported locale matches as well as the previous best match, + // and neither matches perfectly, + // but this one is "more likely" (has more-default subtags). + bestIndex = slIndex; + } + } + } + } + } + return bestIndex >= 0 ? + (bestIndex << INDEX_SHIFT) | shiftedThreshold : + INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD); +} + +int32_t LocaleDistance::getDesSuppScriptDistance( + BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { + // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. + int32_t distance = trieNext(iter, desired, false); + if (distance >= 0) { + distance = trieNext(iter, supported, true); + } + if (distance < 0) { + UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> + U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); + if (uprv_strcmp(desired, supported) == 0) { + distance = 0; // same script + } else { + distance = iter.getValue(); + U_ASSERT(distance >= 0); + } + if (result == USTRINGTRIE_FINAL_VALUE) { + distance |= DISTANCE_IS_FINAL; + } + } + return distance; +} + +int32_t LocaleDistance::getRegionPartitionsDistance( + BytesTrie &iter, uint64_t startState, + const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { + char desired = *desiredPartitions++; + char supported = *supportedPartitions++; + U_ASSERT(desired != 0 && supported != 0); + // See if we have single desired/supported partitions, from NUL-terminated + // partition strings without explicit length. + bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character + // equivalent to: if (desLength == 1 && suppLength == 1) + if (*desiredPartitions == 0 && !suppLengthGt1) { + // Fastpath for single desired/supported partitions. + UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); + if (USTRINGTRIE_HAS_NEXT(result)) { + result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); + if (USTRINGTRIE_HAS_VALUE(result)) { + return iter.getValue(); + } + } + return getFallbackRegionDistance(iter, startState); + } + + const char *supportedStart = supportedPartitions - 1; // for restart of inner loop + int32_t regionDistance = 0; + // Fall back to * only once, not for each pair of partition strings. + bool star = false; + for (;;) { + // Look up each desired-partition string only once, + // not for each (desired, supported) pair. + UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); + if (USTRINGTRIE_HAS_NEXT(result)) { + uint64_t desState = suppLengthGt1 ? iter.getState64() : 0; + for (;;) { + result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); + int32_t d; + if (USTRINGTRIE_HAS_VALUE(result)) { + d = iter.getValue(); + } else if (star) { + d = 0; + } else { + d = getFallbackRegionDistance(iter, startState); + star = true; + } + if (d > threshold) { + return d; + } else if (regionDistance < d) { + regionDistance = d; + } + if ((supported = *supportedPartitions++) != 0) { + iter.resetToState64(desState); + } else { + break; + } + } + } else if (!star) { + int32_t d = getFallbackRegionDistance(iter, startState); + if (d > threshold) { + return d; + } else if (regionDistance < d) { + regionDistance = d; + } + star = true; + } + if ((desired = *desiredPartitions++) != 0) { + iter.resetToState64(startState); + supportedPartitions = supportedStart; + supported = *supportedPartitions++; + } else { + break; + } + } + return regionDistance; +} + +int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) { +#if U_DEBUG + UStringTrieResult result = +#endif + iter.resetToState64(startState).next(u'*'); // <*, *> + U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); + int32_t distance = iter.getValue(); + U_ASSERT(distance >= 0); + return distance; +} + +int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) { + uint8_t c; + if ((c = *s) == 0) { + return -1; // no empty subtags in the distance data + } + for (;;) { + c = uprv_invCharToAscii(c); + // EBCDIC: If *s is not an invariant character, + // then c is now 0 and will simply not match anything, which is harmless. + uint8_t next = *++s; + if (next != 0) { + if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { + return -1; + } + } else { + // last character of this subtag + UStringTrieResult result = iter.next(c | END_OF_SUBTAG); + if (wantValue) { + if (USTRINGTRIE_HAS_VALUE(result)) { + int32_t value = iter.getValue(); + if (result == USTRINGTRIE_FINAL_VALUE) { + value |= DISTANCE_IS_FINAL; + } + return value; + } + } else { + if (USTRINGTRIE_HAS_NEXT(result)) { + return 0; + } + } + return -1; + } + c = next; + } +} + +UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { + // Linear search for a very short list (length 6 as of 2019), + // because we look for equivalence not equality, and + // because it's easy. + // If there are many paradigm LSRs we should use a hash set + // with custom comparator and hasher. + U_ASSERT(paradigmLSRsLength <= 15); + for (int32_t i = 0; i < paradigmLSRsLength; ++i) { + if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; } + } + return false; +} + +U_NAMESPACE_END |