diff options
author | neksard <neksard@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
commit | 1d9c550e7c38e051d7961f576013a482003a70d9 (patch) | |
tree | b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/i18n/anytrans.cpp | |
parent | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff) | |
download | ydb-1d9c550e7c38e051d7961f576013a482003a70d9.tar.gz |
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/i18n/anytrans.cpp')
-rw-r--r-- | contrib/libs/icu/i18n/anytrans.cpp | 794 |
1 files changed, 397 insertions, 397 deletions
diff --git a/contrib/libs/icu/i18n/anytrans.cpp b/contrib/libs/icu/i18n/anytrans.cpp index aeba2435ec..167b018528 100644 --- a/contrib/libs/icu/i18n/anytrans.cpp +++ b/contrib/libs/icu/i18n/anytrans.cpp @@ -1,411 +1,411 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -***************************************************************** -* Copyright (c) 2002-2014, International Business Machines Corporation -* and others. All Rights Reserved. -***************************************************************** -* Date Name Description -* 06/06/2002 aliu Creation. -***************************************************************** -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_TRANSLITERATION - -#include "unicode/uobject.h" -#include "unicode/uscript.h" - -#include "anytrans.h" -#include "hash.h" -#include "mutex.h" -#include "nultrans.h" -#include "putilimp.h" -#include "tridpars.h" -#include "uinvchar.h" -#include "uvector.h" - -//------------------------------------------------------------ -// Constants - -static const UChar TARGET_SEP = 45; // '-' -static const UChar VARIANT_SEP = 47; // '/' +// License & terms of use: http://www.unicode.org/copyright.html +/* +***************************************************************** +* Copyright (c) 2002-2014, International Business Machines Corporation +* and others. All Rights Reserved. +***************************************************************** +* Date Name Description +* 06/06/2002 aliu Creation. +***************************************************************** +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_TRANSLITERATION + +#include "unicode/uobject.h" +#include "unicode/uscript.h" + +#include "anytrans.h" +#include "hash.h" +#include "mutex.h" +#include "nultrans.h" +#include "putilimp.h" +#include "tridpars.h" +#include "uinvchar.h" +#include "uvector.h" + +//------------------------------------------------------------ +// Constants + +static const UChar TARGET_SEP = 45; // '-' +static const UChar VARIANT_SEP = 47; // '/' static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any" -static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" +static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-" - + // initial size for an Any-XXXX transform's cache of script-XXXX transforms // (will grow as necessary, but we don't expect to have source text with more than 7 scripts) #define ANY_TRANS_CACHE_INIT_SIZE 7 -//------------------------------------------------------------ - -U_CDECL_BEGIN -/** - * Deleter function for Transliterator*. - */ -static void U_CALLCONV -_deleteTransliterator(void *obj) { - delete (icu::Transliterator*) obj; -} -U_CDECL_END - -//------------------------------------------------------------ - -U_NAMESPACE_BEGIN - -//------------------------------------------------------------ -// ScriptRunIterator - -/** - * Returns a series of ranges corresponding to scripts. They will be - * of the form: - * - * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second - * | | - first run (start, limit) - * | | - second run (start, limit) - * - * That is, the runs will overlap. The reason for this is so that a - * transliterator can consider common characters both before and after - * the scripts. - */ -class ScriptRunIterator : public UMemory { -private: - const Replaceable& text; - int32_t textStart; - int32_t textLimit; - -public: - /** - * The code of the current run, valid after next() returns. May - * be USCRIPT_INVALID_CODE if and only if the entire text is - * COMMON/INHERITED. - */ - UScriptCode scriptCode; - - /** - * The start of the run, inclusive, valid after next() returns. - */ - int32_t start; - - /** - * The end of the run, exclusive, valid after next() returns. - */ - int32_t limit; - - /** - * Constructs a run iterator over the given text from start - * (inclusive) to limit (exclusive). - */ - ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); - - /** - * Returns TRUE if there are any more runs. TRUE is always - * returned at least once. Upon return, the caller should - * examine scriptCode, start, and limit. - */ - UBool next(); - - /** - * Adjusts internal indices for a change in the limit index of the - * given delta. A positive delta means the limit has increased. - */ - void adjustLimit(int32_t delta); - -private: - ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class - ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class -}; - -ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, - int32_t myStart, int32_t myLimit) : - text(theText) -{ - textStart = myStart; - textLimit = myLimit; - limit = myStart; -} - -UBool ScriptRunIterator::next() { - UChar32 ch; - UScriptCode s; - UErrorCode ec = U_ZERO_ERROR; - - scriptCode = USCRIPT_INVALID_CODE; // don't know script yet - start = limit; - - // Are we done? - if (start == textLimit) { - return FALSE; - } - - // Move start back to include adjacent COMMON or INHERITED - // characters - while (start > textStart) { - ch = text.char32At(start - 1); // look back - s = uscript_getScript(ch, &ec); - if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { - --start; - } else { - break; - } - } - - // Move limit ahead to include COMMON, INHERITED, and characters - // of the current script. - while (limit < textLimit) { - ch = text.char32At(limit); // look ahead - s = uscript_getScript(ch, &ec); - if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { - if (scriptCode == USCRIPT_INVALID_CODE) { - scriptCode = s; - } else if (s != scriptCode) { - break; - } - } - ++limit; - } - - // Return TRUE even if the entire text is COMMON / INHERITED, in - // which case scriptCode will be USCRIPT_INVALID_CODE. - return TRUE; -} - -void ScriptRunIterator::adjustLimit(int32_t delta) { - limit += delta; - textLimit += delta; -} - -//------------------------------------------------------------ -// AnyTransliterator - -UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) - -AnyTransliterator::AnyTransliterator(const UnicodeString& id, - const UnicodeString& theTarget, - const UnicodeString& theVariant, - UScriptCode theTargetScript, - UErrorCode& ec) : - Transliterator(id, NULL), - targetScript(theTargetScript) -{ +//------------------------------------------------------------ + +U_CDECL_BEGIN +/** + * Deleter function for Transliterator*. + */ +static void U_CALLCONV +_deleteTransliterator(void *obj) { + delete (icu::Transliterator*) obj; +} +U_CDECL_END + +//------------------------------------------------------------ + +U_NAMESPACE_BEGIN + +//------------------------------------------------------------ +// ScriptRunIterator + +/** + * Returns a series of ranges corresponding to scripts. They will be + * of the form: + * + * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second + * | | - first run (start, limit) + * | | - second run (start, limit) + * + * That is, the runs will overlap. The reason for this is so that a + * transliterator can consider common characters both before and after + * the scripts. + */ +class ScriptRunIterator : public UMemory { +private: + const Replaceable& text; + int32_t textStart; + int32_t textLimit; + +public: + /** + * The code of the current run, valid after next() returns. May + * be USCRIPT_INVALID_CODE if and only if the entire text is + * COMMON/INHERITED. + */ + UScriptCode scriptCode; + + /** + * The start of the run, inclusive, valid after next() returns. + */ + int32_t start; + + /** + * The end of the run, exclusive, valid after next() returns. + */ + int32_t limit; + + /** + * Constructs a run iterator over the given text from start + * (inclusive) to limit (exclusive). + */ + ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); + + /** + * Returns TRUE if there are any more runs. TRUE is always + * returned at least once. Upon return, the caller should + * examine scriptCode, start, and limit. + */ + UBool next(); + + /** + * Adjusts internal indices for a change in the limit index of the + * given delta. A positive delta means the limit has increased. + */ + void adjustLimit(int32_t delta); + +private: + ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class + ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class +}; + +ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, + int32_t myStart, int32_t myLimit) : + text(theText) +{ + textStart = myStart; + textLimit = myLimit; + limit = myStart; +} + +UBool ScriptRunIterator::next() { + UChar32 ch; + UScriptCode s; + UErrorCode ec = U_ZERO_ERROR; + + scriptCode = USCRIPT_INVALID_CODE; // don't know script yet + start = limit; + + // Are we done? + if (start == textLimit) { + return FALSE; + } + + // Move start back to include adjacent COMMON or INHERITED + // characters + while (start > textStart) { + ch = text.char32At(start - 1); // look back + s = uscript_getScript(ch, &ec); + if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { + --start; + } else { + break; + } + } + + // Move limit ahead to include COMMON, INHERITED, and characters + // of the current script. + while (limit < textLimit) { + ch = text.char32At(limit); // look ahead + s = uscript_getScript(ch, &ec); + if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { + if (scriptCode == USCRIPT_INVALID_CODE) { + scriptCode = s; + } else if (s != scriptCode) { + break; + } + } + ++limit; + } + + // Return TRUE even if the entire text is COMMON / INHERITED, in + // which case scriptCode will be USCRIPT_INVALID_CODE. + return TRUE; +} + +void ScriptRunIterator::adjustLimit(int32_t delta) { + limit += delta; + textLimit += delta; +} + +//------------------------------------------------------------ +// AnyTransliterator + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) + +AnyTransliterator::AnyTransliterator(const UnicodeString& id, + const UnicodeString& theTarget, + const UnicodeString& theVariant, + UScriptCode theTargetScript, + UErrorCode& ec) : + Transliterator(id, NULL), + targetScript(theTargetScript) +{ cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec); - if (U_FAILURE(ec)) { - return; - } - uhash_setValueDeleter(cache, _deleteTransliterator); - - target = theTarget; - if (theVariant.length() > 0) { - target.append(VARIANT_SEP).append(theVariant); - } -} - -AnyTransliterator::~AnyTransliterator() { - uhash_close(cache); -} - -/** - * Copy constructor. - */ -AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : - Transliterator(o), - target(o.target), - targetScript(o.targetScript) -{ - // Don't copy the cache contents - UErrorCode ec = U_ZERO_ERROR; + if (U_FAILURE(ec)) { + return; + } + uhash_setValueDeleter(cache, _deleteTransliterator); + + target = theTarget; + if (theVariant.length() > 0) { + target.append(VARIANT_SEP).append(theVariant); + } +} + +AnyTransliterator::~AnyTransliterator() { + uhash_close(cache); +} + +/** + * Copy constructor. + */ +AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : + Transliterator(o), + target(o.target), + targetScript(o.targetScript) +{ + // Don't copy the cache contents + UErrorCode ec = U_ZERO_ERROR; cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec); - if (U_FAILURE(ec)) { - return; - } - uhash_setValueDeleter(cache, _deleteTransliterator); -} - -/** - * Transliterator API. - */ + if (U_FAILURE(ec)) { + return; + } + uhash_setValueDeleter(cache, _deleteTransliterator); +} + +/** + * Transliterator API. + */ AnyTransliterator* AnyTransliterator::clone() const { - return new AnyTransliterator(*this); -} - -/** - * Implements {@link Transliterator#handleTransliterate}. - */ -void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, - UBool isIncremental) const { - int32_t allStart = pos.start; - int32_t allLimit = pos.limit; - - ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); - - while (it.next()) { - // Ignore runs in the ante context - if (it.limit <= allStart) continue; - - // Try to instantiate transliterator from it.scriptCode to - // our target or target/variant - Transliterator* t = getTransliterator(it.scriptCode); - - if (t == NULL) { - // We have no transliterator. Do nothing, but keep - // pos.start up to date. - pos.start = it.limit; - continue; - } - - // If the run end is before the transliteration limit, do - // a non-incremental transliteration. Otherwise do an - // incremental one. - UBool incremental = isIncremental && (it.limit >= allLimit); - - pos.start = uprv_max(allStart, it.start); - pos.limit = uprv_min(allLimit, it.limit); - int32_t limit = pos.limit; - t->filteredTransliterate(text, pos, incremental); - int32_t delta = pos.limit - limit; - allLimit += delta; - it.adjustLimit(delta); - - // We're done if we enter the post context - if (it.limit >= allLimit) break; - } - - // Restore limit. pos.start is fine where the last transliterator - // left it, or at the end of the last run. - pos.limit = allLimit; -} - -Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { - - if (source == targetScript || source == USCRIPT_INVALID_CODE) { - return NULL; - } - - Transliterator* t = NULL; - { - Mutex m(NULL); - t = (Transliterator*) uhash_iget(cache, (int32_t) source); - } - if (t == NULL) { - UErrorCode ec = U_ZERO_ERROR; + return new AnyTransliterator(*this); +} + +/** + * Implements {@link Transliterator#handleTransliterate}. + */ +void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, + UBool isIncremental) const { + int32_t allStart = pos.start; + int32_t allLimit = pos.limit; + + ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); + + while (it.next()) { + // Ignore runs in the ante context + if (it.limit <= allStart) continue; + + // Try to instantiate transliterator from it.scriptCode to + // our target or target/variant + Transliterator* t = getTransliterator(it.scriptCode); + + if (t == NULL) { + // We have no transliterator. Do nothing, but keep + // pos.start up to date. + pos.start = it.limit; + continue; + } + + // If the run end is before the transliteration limit, do + // a non-incremental transliteration. Otherwise do an + // incremental one. + UBool incremental = isIncremental && (it.limit >= allLimit); + + pos.start = uprv_max(allStart, it.start); + pos.limit = uprv_min(allLimit, it.limit); + int32_t limit = pos.limit; + t->filteredTransliterate(text, pos, incremental); + int32_t delta = pos.limit - limit; + allLimit += delta; + it.adjustLimit(delta); + + // We're done if we enter the post context + if (it.limit >= allLimit) break; + } + + // Restore limit. pos.start is fine where the last transliterator + // left it, or at the end of the last run. + pos.limit = allLimit; +} + +Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { + + if (source == targetScript || source == USCRIPT_INVALID_CODE) { + return NULL; + } + + Transliterator* t = NULL; + { + Mutex m(NULL); + t = (Transliterator*) uhash_iget(cache, (int32_t) source); + } + if (t == NULL) { + UErrorCode ec = U_ZERO_ERROR; UnicodeString sourceName(uscript_getShortName(source), -1, US_INV); - UnicodeString id(sourceName); - id.append(TARGET_SEP).append(target); - - t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); - if (U_FAILURE(ec) || t == NULL) { - delete t; - - // Try to pivot around Latin, our most common script - id = sourceName; - id.append(LATIN_PIVOT, -1).append(target); - t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); - if (U_FAILURE(ec) || t == NULL) { - delete t; - t = NULL; - } - } - - if (t != NULL) { - Transliterator *rt = NULL; - { - Mutex m(NULL); - rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source)); - if (rt == NULL) { - // Common case, no race to cache this new transliterator. - uhash_iput(cache, (int32_t) source, t, &ec); - } else { - // Race case, some other thread beat us to caching this transliterator. - Transliterator *temp = rt; - rt = t; // Our newly created transliterator that lost the race & now needs deleting. - t = temp; // The transliterator from the cache that we will return. - } - } - delete rt; // will be non-null only in case of races. - } - } - return t; -} - -/** - * Return the script code for a given name, or -1 if not found. - */ -static UScriptCode scriptNameToCode(const UnicodeString& name) { - char buf[128]; - UScriptCode code; - UErrorCode ec = U_ZERO_ERROR; - int32_t nameLen = name.length(); - UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); - - if (isInvariant) { - name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); - buf[127] = 0; // Make sure that we NULL terminate the string. - } - if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec)) - { - code = USCRIPT_INVALID_CODE; - } - return code; -} - -/** - * Registers standard transliterators with the system. Called by - * Transliterator during initialization. Scan all current targets and - * register those that are scripts T as Any-T/V. - */ -void AnyTransliterator::registerIDs() { - - UErrorCode ec = U_ZERO_ERROR; - Hashtable seen(TRUE, ec); - - int32_t sourceCount = Transliterator::_countAvailableSources(); - for (int32_t s=0; s<sourceCount; ++s) { - UnicodeString source; - Transliterator::_getAvailableSource(s, source); - - // Ignore the "Any" source - if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue; - - int32_t targetCount = Transliterator::_countAvailableTargets(source); - for (int32_t t=0; t<targetCount; ++t) { - UnicodeString target; - Transliterator::_getAvailableTarget(t, source, target); - - // Only process each target once - if (seen.geti(target) != 0) continue; - ec = U_ZERO_ERROR; - seen.puti(target, 1, ec); - - // Get the script code for the target. If not a script, ignore. - UScriptCode targetScript = scriptNameToCode(target); - if (targetScript == USCRIPT_INVALID_CODE) continue; - - int32_t variantCount = Transliterator::_countAvailableVariants(source, target); - // assert(variantCount >= 1); - for (int32_t v=0; v<variantCount; ++v) { - UnicodeString variant; - Transliterator::_getAvailableVariant(v, source, target, variant); - - UnicodeString id; - TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id); - ec = U_ZERO_ERROR; + UnicodeString id(sourceName); + id.append(TARGET_SEP).append(target); + + t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); + if (U_FAILURE(ec) || t == NULL) { + delete t; + + // Try to pivot around Latin, our most common script + id = sourceName; + id.append(LATIN_PIVOT, -1).append(target); + t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); + if (U_FAILURE(ec) || t == NULL) { + delete t; + t = NULL; + } + } + + if (t != NULL) { + Transliterator *rt = NULL; + { + Mutex m(NULL); + rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source)); + if (rt == NULL) { + // Common case, no race to cache this new transliterator. + uhash_iput(cache, (int32_t) source, t, &ec); + } else { + // Race case, some other thread beat us to caching this transliterator. + Transliterator *temp = rt; + rt = t; // Our newly created transliterator that lost the race & now needs deleting. + t = temp; // The transliterator from the cache that we will return. + } + } + delete rt; // will be non-null only in case of races. + } + } + return t; +} + +/** + * Return the script code for a given name, or -1 if not found. + */ +static UScriptCode scriptNameToCode(const UnicodeString& name) { + char buf[128]; + UScriptCode code; + UErrorCode ec = U_ZERO_ERROR; + int32_t nameLen = name.length(); + UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); + + if (isInvariant) { + name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); + buf[127] = 0; // Make sure that we NULL terminate the string. + } + if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec)) + { + code = USCRIPT_INVALID_CODE; + } + return code; +} + +/** + * Registers standard transliterators with the system. Called by + * Transliterator during initialization. Scan all current targets and + * register those that are scripts T as Any-T/V. + */ +void AnyTransliterator::registerIDs() { + + UErrorCode ec = U_ZERO_ERROR; + Hashtable seen(TRUE, ec); + + int32_t sourceCount = Transliterator::_countAvailableSources(); + for (int32_t s=0; s<sourceCount; ++s) { + UnicodeString source; + Transliterator::_getAvailableSource(s, source); + + // Ignore the "Any" source + if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue; + + int32_t targetCount = Transliterator::_countAvailableTargets(source); + for (int32_t t=0; t<targetCount; ++t) { + UnicodeString target; + Transliterator::_getAvailableTarget(t, source, target); + + // Only process each target once + if (seen.geti(target) != 0) continue; + ec = U_ZERO_ERROR; + seen.puti(target, 1, ec); + + // Get the script code for the target. If not a script, ignore. + UScriptCode targetScript = scriptNameToCode(target); + if (targetScript == USCRIPT_INVALID_CODE) continue; + + int32_t variantCount = Transliterator::_countAvailableVariants(source, target); + // assert(variantCount >= 1); + for (int32_t v=0; v<variantCount; ++v) { + UnicodeString variant; + Transliterator::_getAvailableVariant(v, source, target, variant); + + UnicodeString id; + TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id); + ec = U_ZERO_ERROR; AnyTransliterator* tl = new AnyTransliterator(id, target, variant, - targetScript, ec); - if (U_FAILURE(ec)) { + targetScript, ec); + if (U_FAILURE(ec)) { delete tl; - } else { + } else { Transliterator::_registerInstance(tl); - Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE); - } - } - } - } -} - -U_NAMESPACE_END - -#endif /* #if !UCONFIG_NO_TRANSLITERATION */ - -//eof + Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE); + } + } + } + } +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ + +//eof |