diff options
author | neksard <neksard@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
commit | 1d9c550e7c38e051d7961f576013a482003a70d9 (patch) | |
tree | b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/common/brkeng.cpp | |
parent | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff) | |
download | ydb-1d9c550e7c38e051d7961f576013a482003a70d9.tar.gz |
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/common/brkeng.cpp')
-rw-r--r-- | contrib/libs/icu/common/brkeng.cpp | 496 |
1 files changed, 248 insertions, 248 deletions
diff --git a/contrib/libs/icu/common/brkeng.cpp b/contrib/libs/icu/common/brkeng.cpp index 6392240a03..78492db662 100644 --- a/contrib/libs/icu/common/brkeng.cpp +++ b/contrib/libs/icu/common/brkeng.cpp @@ -1,80 +1,80 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* - ************************************************************************************ - * Copyright (C) 2006-2016, International Business Machines Corporation - * and others. All Rights Reserved. - ************************************************************************************ - */ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_BREAK_ITERATION - -#include "unicode/uchar.h" -#include "unicode/uniset.h" -#include "unicode/chariter.h" -#include "unicode/ures.h" -#include "unicode/udata.h" -#include "unicode/putil.h" -#include "unicode/ustring.h" -#include "unicode/uscript.h" -#include "unicode/ucharstrie.h" -#include "unicode/bytestrie.h" +// License & terms of use: http://www.unicode.org/copyright.html +/* + ************************************************************************************ + * Copyright (C) 2006-2016, International Business Machines Corporation + * and others. All Rights Reserved. + ************************************************************************************ + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_BREAK_ITERATION + +#include "unicode/uchar.h" +#include "unicode/uniset.h" +#include "unicode/chariter.h" +#include "unicode/ures.h" +#include "unicode/udata.h" +#include "unicode/putil.h" +#include "unicode/ustring.h" +#include "unicode/uscript.h" +#include "unicode/ucharstrie.h" +#include "unicode/bytestrie.h" #include "brkeng.h" #include "cmemory.h" #include "dictbe.h" -#include "charstr.h" -#include "dictionarydata.h" -#include "mutex.h" -#include "uvector.h" -#include "umutex.h" -#include "uresimp.h" -#include "ubrkimpl.h" - -U_NAMESPACE_BEGIN - -/* - ****************************************************************** - */ - -LanguageBreakEngine::LanguageBreakEngine() { -} - -LanguageBreakEngine::~LanguageBreakEngine() { -} - -/* - ****************************************************************** - */ - -LanguageBreakFactory::LanguageBreakFactory() { -} - -LanguageBreakFactory::~LanguageBreakFactory() { -} - -/* - ****************************************************************** - */ - +#include "charstr.h" +#include "dictionarydata.h" +#include "mutex.h" +#include "uvector.h" +#include "umutex.h" +#include "uresimp.h" +#include "ubrkimpl.h" + +U_NAMESPACE_BEGIN + +/* + ****************************************************************** + */ + +LanguageBreakEngine::LanguageBreakEngine() { +} + +LanguageBreakEngine::~LanguageBreakEngine() { +} + +/* + ****************************************************************** + */ + +LanguageBreakFactory::LanguageBreakFactory() { +} + +LanguageBreakFactory::~LanguageBreakFactory() { +} + +/* + ****************************************************************** + */ + UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) { (void)status; -} - -UnhandledEngine::~UnhandledEngine() { +} + +UnhandledEngine::~UnhandledEngine() { delete fHandled; fHandled = nullptr; -} - -UBool +} + +UBool UnhandledEngine::handles(UChar32 c) const { return fHandled && fHandled->contains(c); -} - -int32_t -UnhandledEngine::findBreaks( UText *text, +} + +int32_t +UnhandledEngine::findBreaks( UText *text, int32_t /* startPos */, int32_t endPos, UVector32 &/*foundBreaks*/ ) const { @@ -82,203 +82,203 @@ UnhandledEngine::findBreaks( UText *text, while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) { utext_next32(text); // TODO: recast loop to work with post-increment operations. c = utext_current32(text); - } - return 0; -} - -void + } + return 0; +} + +void UnhandledEngine::handleCharacter(UChar32 c) { if (fHandled == nullptr) { fHandled = new UnicodeSet(); if (fHandled == nullptr) { return; - } - } + } + } if (!fHandled->contains(c)) { UErrorCode status = U_ZERO_ERROR; // Apply the entire script of the character. int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status); } -} - -/* - ****************************************************************** - */ - -ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { - fEngines = 0; -} - -ICULanguageBreakFactory::~ICULanguageBreakFactory() { - if (fEngines != 0) { - delete fEngines; - } -} - -U_NAMESPACE_END -U_CDECL_BEGIN -static void U_CALLCONV _deleteEngine(void *obj) { - delete (const icu::LanguageBreakEngine *) obj; -} -U_CDECL_END -U_NAMESPACE_BEGIN - -const LanguageBreakEngine * +} + +/* + ****************************************************************** + */ + +ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { + fEngines = 0; +} + +ICULanguageBreakFactory::~ICULanguageBreakFactory() { + if (fEngines != 0) { + delete fEngines; + } +} + +U_NAMESPACE_END +U_CDECL_BEGIN +static void U_CALLCONV _deleteEngine(void *obj) { + delete (const icu::LanguageBreakEngine *) obj; +} +U_CDECL_END +U_NAMESPACE_BEGIN + +const LanguageBreakEngine * ICULanguageBreakFactory::getEngineFor(UChar32 c) { - const LanguageBreakEngine *lbe = NULL; - UErrorCode status = U_ZERO_ERROR; - + const LanguageBreakEngine *lbe = NULL; + UErrorCode status = U_ZERO_ERROR; + static UMutex gBreakEngineMutex; - Mutex m(&gBreakEngineMutex); - - if (fEngines == NULL) { - UStack *engines = new UStack(_deleteEngine, NULL, status); - if (U_FAILURE(status) || engines == NULL) { - // Note: no way to return error code to caller. - delete engines; - return NULL; - } - fEngines = engines; - } else { - int32_t i = fEngines->size(); - while (--i >= 0) { - lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); + Mutex m(&gBreakEngineMutex); + + if (fEngines == NULL) { + UStack *engines = new UStack(_deleteEngine, NULL, status); + if (U_FAILURE(status) || engines == NULL) { + // Note: no way to return error code to caller. + delete engines; + return NULL; + } + fEngines = engines; + } else { + int32_t i = fEngines->size(); + while (--i >= 0) { + lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); if (lbe != NULL && lbe->handles(c)) { - return lbe; - } - } - } - - // We didn't find an engine. Create one. + return lbe; + } + } + } + + // We didn't find an engine. Create one. lbe = loadEngineFor(c); - if (lbe != NULL) { - fEngines->push((void *)lbe, status); - } - return lbe; -} - -const LanguageBreakEngine * + if (lbe != NULL) { + fEngines->push((void *)lbe, status); + } + return lbe; +} + +const LanguageBreakEngine * ICULanguageBreakFactory::loadEngineFor(UChar32 c) { - UErrorCode status = U_ZERO_ERROR; - UScriptCode code = uscript_getScript(c, &status); - if (U_SUCCESS(status)) { + UErrorCode status = U_ZERO_ERROR; + UScriptCode code = uscript_getScript(c, &status); + if (U_SUCCESS(status)) { DictionaryMatcher *m = loadDictionaryMatcherFor(code); - if (m != NULL) { - const LanguageBreakEngine *engine = NULL; - switch(code) { - case USCRIPT_THAI: - engine = new ThaiBreakEngine(m, status); - break; - case USCRIPT_LAO: - engine = new LaoBreakEngine(m, status); - break; - case USCRIPT_MYANMAR: - engine = new BurmeseBreakEngine(m, status); - break; - case USCRIPT_KHMER: - engine = new KhmerBreakEngine(m, status); - break; - -#if !UCONFIG_NO_NORMALIZATION - // CJK not available w/o normalization - case USCRIPT_HANGUL: - engine = new CjkBreakEngine(m, kKorean, status); - break; - - // use same BreakEngine and dictionary for both Chinese and Japanese - case USCRIPT_HIRAGANA: - case USCRIPT_KATAKANA: - case USCRIPT_HAN: - engine = new CjkBreakEngine(m, kChineseJapanese, status); - break; -#if 0 - // TODO: Have to get some characters with script=common handled - // by CjkBreakEngine (e.g. U+309B). Simply subjecting - // them to CjkBreakEngine does not work. The engine has to - // special-case them. - case USCRIPT_COMMON: - { - UBlockCode block = ublock_getCode(code); - if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) - engine = new CjkBreakEngine(dict, kChineseJapanese, status); - break; - } -#endif -#endif - - default: - break; - } - if (engine == NULL) { - delete m; - } - else if (U_FAILURE(status)) { - delete engine; - engine = NULL; - } - return engine; - } - } - return NULL; -} - -DictionaryMatcher * + if (m != NULL) { + const LanguageBreakEngine *engine = NULL; + switch(code) { + case USCRIPT_THAI: + engine = new ThaiBreakEngine(m, status); + break; + case USCRIPT_LAO: + engine = new LaoBreakEngine(m, status); + break; + case USCRIPT_MYANMAR: + engine = new BurmeseBreakEngine(m, status); + break; + case USCRIPT_KHMER: + engine = new KhmerBreakEngine(m, status); + break; + +#if !UCONFIG_NO_NORMALIZATION + // CJK not available w/o normalization + case USCRIPT_HANGUL: + engine = new CjkBreakEngine(m, kKorean, status); + break; + + // use same BreakEngine and dictionary for both Chinese and Japanese + case USCRIPT_HIRAGANA: + case USCRIPT_KATAKANA: + case USCRIPT_HAN: + engine = new CjkBreakEngine(m, kChineseJapanese, status); + break; +#if 0 + // TODO: Have to get some characters with script=common handled + // by CjkBreakEngine (e.g. U+309B). Simply subjecting + // them to CjkBreakEngine does not work. The engine has to + // special-case them. + case USCRIPT_COMMON: + { + UBlockCode block = ublock_getCode(code); + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) + engine = new CjkBreakEngine(dict, kChineseJapanese, status); + break; + } +#endif +#endif + + default: + break; + } + if (engine == NULL) { + delete m; + } + else if (U_FAILURE(status)) { + delete engine; + engine = NULL; + } + return engine; + } + } + return NULL; +} + +DictionaryMatcher * ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { - UErrorCode status = U_ZERO_ERROR; - // open root from brkitr tree. - UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); - b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); - int32_t dictnlength = 0; - const UChar *dictfname = - ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); - if (U_FAILURE(status)) { - ures_close(b); - return NULL; - } - CharString dictnbuf; - CharString ext; - const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot - if (extStart != NULL) { - int32_t len = (int32_t)(extStart - dictfname); - ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); - dictnlength = len; - } - dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); - ures_close(b); - - UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); - if (U_SUCCESS(status)) { - // build trie - const uint8_t *data = (const uint8_t *)udata_getMemory(file); - const int32_t *indexes = (const int32_t *)data; - const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; - const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; - DictionaryMatcher *m = NULL; - if (trieType == DictionaryData::TRIE_TYPE_BYTES) { - const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; - const char *characters = (const char *)(data + offset); - m = new BytesDictionaryMatcher(characters, transform, file); - } - else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { - const UChar *characters = (const UChar *)(data + offset); - m = new UCharsDictionaryMatcher(characters, file); - } - if (m == NULL) { - // no matcher exists to take ownership - either we are an invalid - // type or memory allocation failed - udata_close(file); - } - return m; - } else if (dictfname != NULL) { - // we don't have a dictionary matcher. - // returning NULL here will cause us to fail to find a dictionary break engine, as expected - status = U_ZERO_ERROR; - return NULL; - } - return NULL; -} - -U_NAMESPACE_END - -#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ + UErrorCode status = U_ZERO_ERROR; + // open root from brkitr tree. + UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); + b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); + int32_t dictnlength = 0; + const UChar *dictfname = + ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); + if (U_FAILURE(status)) { + ures_close(b); + return NULL; + } + CharString dictnbuf; + CharString ext; + const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot + if (extStart != NULL) { + int32_t len = (int32_t)(extStart - dictfname); + ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); + dictnlength = len; + } + dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); + ures_close(b); + + UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); + if (U_SUCCESS(status)) { + // build trie + const uint8_t *data = (const uint8_t *)udata_getMemory(file); + const int32_t *indexes = (const int32_t *)data; + const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; + const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; + DictionaryMatcher *m = NULL; + if (trieType == DictionaryData::TRIE_TYPE_BYTES) { + const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; + const char *characters = (const char *)(data + offset); + m = new BytesDictionaryMatcher(characters, transform, file); + } + else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { + const UChar *characters = (const UChar *)(data + offset); + m = new UCharsDictionaryMatcher(characters, file); + } + if (m == NULL) { + // no matcher exists to take ownership - either we are an invalid + // type or memory allocation failed + udata_close(file); + } + return m; + } else if (dictfname != NULL) { + // we don't have a dictionary matcher. + // returning NULL here will cause us to fail to find a dictionary break engine, as expected + status = U_ZERO_ERROR; + return NULL; + } + return NULL; +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |