diff options
author | mcheshkov <mcheshkov@yandex-team.ru> | 2022-02-10 16:46:16 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:16 +0300 |
commit | 1312621288956f199a5bd5342b0133d4395fa725 (patch) | |
tree | 1a2c5ffcf89eb53ecd79dbc9bc0a195c27404d0c /contrib/libs/icu/common/usc_impl.cpp | |
parent | e9d19cec64684c9c1e6b0c98297e5b895cf904fe (diff) | |
download | ydb-1312621288956f199a5bd5342b0133d4395fa725.tar.gz |
Restoring authorship annotation for <mcheshkov@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/common/usc_impl.cpp')
-rw-r--r-- | contrib/libs/icu/common/usc_impl.cpp | 722 |
1 files changed, 361 insertions, 361 deletions
diff --git a/contrib/libs/icu/common/usc_impl.cpp b/contrib/libs/icu/common/usc_impl.cpp index 5594962637..111029b974 100644 --- a/contrib/libs/icu/common/usc_impl.cpp +++ b/contrib/libs/icu/common/usc_impl.cpp @@ -1,361 +1,361 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -********************************************************************** -* Copyright (C) 1999-2016, International Business Machines -* Corporation and others. All Rights Reserved. -********************************************************************** -* -* File USC_IMPL.C -* -* Modification History: -* -* Date Name Description -* 07/08/2002 Eric Mader Creation. -****************************************************************************** -*/ - -#include "unicode/uscript.h" -#include "usc_impl.h" -#include "cmemory.h" - -#define PAREN_STACK_DEPTH 32 - -#define MOD(sp) ((sp) % PAREN_STACK_DEPTH) -#define LIMIT_INC(sp) (((sp) < PAREN_STACK_DEPTH)? (sp) + 1 : PAREN_STACK_DEPTH) -#define INC(sp,count) (MOD((sp) + (count))) -#define INC1(sp) (INC(sp, 1)) -#define DEC(sp,count) (MOD((sp) + PAREN_STACK_DEPTH - (count))) -#define DEC1(sp) (DEC(sp, 1)) -#define STACK_IS_EMPTY(scriptRun) ((scriptRun)->pushCount <= 0) -#define STACK_IS_NOT_EMPTY(scriptRun) (! STACK_IS_EMPTY(scriptRun)) -#define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP]) -#define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0) - -struct ParenStackEntry -{ - int32_t pairIndex; - UScriptCode scriptCode; -}; - -struct UScriptRun -{ - int32_t textLength; - const UChar *textArray; - - int32_t scriptStart; - int32_t scriptLimit; - UScriptCode scriptCode; - - struct ParenStackEntry parenStack[PAREN_STACK_DEPTH]; - int32_t parenSP; - int32_t pushCount; - int32_t fixupCount; -}; - -static int8_t highBit(int32_t value); - -static const UChar32 pairedChars[] = { - 0x0028, 0x0029, /* ascii paired punctuation */ - 0x003c, 0x003e, - 0x005b, 0x005d, - 0x007b, 0x007d, - 0x00ab, 0x00bb, /* guillemets */ - 0x2018, 0x2019, /* general punctuation */ - 0x201c, 0x201d, - 0x2039, 0x203a, - 0x3008, 0x3009, /* chinese paired punctuation */ - 0x300a, 0x300b, - 0x300c, 0x300d, - 0x300e, 0x300f, - 0x3010, 0x3011, - 0x3014, 0x3015, - 0x3016, 0x3017, - 0x3018, 0x3019, - 0x301a, 0x301b -}; - -static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode) -{ - scriptRun->pushCount = LIMIT_INC(scriptRun->pushCount); - scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount); - - scriptRun->parenSP = INC1(scriptRun->parenSP); - scriptRun->parenStack[scriptRun->parenSP].pairIndex = pairIndex; - scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptCode; -} - -static void pop(UScriptRun *scriptRun) -{ - if (STACK_IS_EMPTY(scriptRun)) { - return; - } - - if (scriptRun->fixupCount > 0) { - scriptRun->fixupCount -= 1; - } - - scriptRun->pushCount -= 1; - scriptRun->parenSP = DEC1(scriptRun->parenSP); - - /* If the stack is now empty, reset the stack - pointers to their initial values. - */ - if (STACK_IS_EMPTY(scriptRun)) { - scriptRun->parenSP = -1; - } -} - -static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode) -{ - int32_t fixupSP = DEC(scriptRun->parenSP, scriptRun->fixupCount); - - while (scriptRun->fixupCount-- > 0) { - fixupSP = INC1(fixupSP); - scriptRun->parenStack[fixupSP].scriptCode = scriptCode; - } -} - -static int8_t -highBit(int32_t value) -{ - int8_t bit = 0; - - if (value <= 0) { - return -32; - } - - if (value >= 1 << 16) { - value >>= 16; - bit += 16; - } - - if (value >= 1 << 8) { - value >>= 8; - bit += 8; - } - - if (value >= 1 << 4) { - value >>= 4; - bit += 4; - } - - if (value >= 1 << 2) { - value >>= 2; - bit += 2; - } - - if (value >= 1 << 1) { - //value >>= 1; - bit += 1; - } - - return bit; -} - -static int32_t -getPairIndex(UChar32 ch) -{ - int32_t pairedCharCount = UPRV_LENGTHOF(pairedChars); - int32_t pairedCharPower = 1 << highBit(pairedCharCount); - int32_t pairedCharExtra = pairedCharCount - pairedCharPower; - - int32_t probe = pairedCharPower; - int32_t pairIndex = 0; - - if (ch >= pairedChars[pairedCharExtra]) { - pairIndex = pairedCharExtra; - } - - while (probe > (1 << 0)) { - probe >>= 1; - - if (ch >= pairedChars[pairIndex + probe]) { - pairIndex += probe; - } - } - - if (pairedChars[pairIndex] != ch) { - pairIndex = -1; - } - - return pairIndex; -} - -static UBool -sameScript(UScriptCode scriptOne, UScriptCode scriptTwo) -{ - return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo; -} - -U_CAPI UScriptRun * U_EXPORT2 -uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode) -{ - UScriptRun *result = NULL; - - if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) { - return NULL; - } - - result = (UScriptRun *)uprv_malloc(sizeof (UScriptRun)); - - if (result == NULL) { - *pErrorCode = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - - uscript_setRunText(result, src, length, pErrorCode); - - /* Release the UScriptRun if uscript_setRunText() returns an error */ - if (U_FAILURE(*pErrorCode)) { - uprv_free(result); - result = NULL; - } - - return result; -} - -U_CAPI void U_EXPORT2 -uscript_closeRun(UScriptRun *scriptRun) -{ - if (scriptRun != NULL) { - uprv_free(scriptRun); - } -} - -U_CAPI void U_EXPORT2 -uscript_resetRun(UScriptRun *scriptRun) -{ - if (scriptRun != NULL) { - scriptRun->scriptStart = 0; - scriptRun->scriptLimit = 0; - scriptRun->scriptCode = USCRIPT_INVALID_CODE; - scriptRun->parenSP = -1; - scriptRun->pushCount = 0; - scriptRun->fixupCount = 0; - } -} - -U_CAPI void U_EXPORT2 -uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode) -{ - if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) { - return; - } - - if (scriptRun == NULL || length < 0 || ((src == NULL) != (length == 0))) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - - scriptRun->textArray = src; - scriptRun->textLength = length; - - uscript_resetRun(scriptRun); -} - -U_CAPI UBool U_EXPORT2 -uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript) -{ - UErrorCode error = U_ZERO_ERROR; - - /* if we've fallen off the end of the text, we're done */ - if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) { - return FALSE; - } - - SYNC_FIXUP(scriptRun); - scriptRun->scriptCode = USCRIPT_COMMON; - - for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) { - UChar high = scriptRun->textArray[scriptRun->scriptLimit]; - UChar32 ch = high; - UScriptCode sc; - int32_t pairIndex; - - /* - * if the character is a high surrogate and it's not the last one - * in the text, see if it's followed by a low surrogate - */ - if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) { - UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1]; - - /* - * if it is followed by a low surrogate, - * consume it and form the full character - */ - if (low >= 0xDC00 && low <= 0xDFFF) { - ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000; - scriptRun->scriptLimit += 1; - } - } - - sc = uscript_getScript(ch, &error); - pairIndex = getPairIndex(ch); - - /* - * Paired character handling: - * - * if it's an open character, push it onto the stack. - * if it's a close character, find the matching open on the - * stack, and use that script code. Any non-matching open - * characters above it on the stack will be poped. - */ - if (pairIndex >= 0) { - if ((pairIndex & 1) == 0) { - push(scriptRun, pairIndex, scriptRun->scriptCode); - } else { - int32_t pi = pairIndex & ~1; - - while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) { - pop(scriptRun); - } - - if (STACK_IS_NOT_EMPTY(scriptRun)) { - sc = TOP(scriptRun).scriptCode; - } - } - } - - if (sameScript(scriptRun->scriptCode, sc)) { - if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) { - scriptRun->scriptCode = sc; - - fixup(scriptRun, scriptRun->scriptCode); - } - - /* - * if this character is a close paired character, - * pop the matching open character from the stack - */ - if (pairIndex >= 0 && (pairIndex & 1) != 0) { - pop(scriptRun); - } - } else { - /* - * if the run broke on a surrogate pair, - * end it before the high surrogate - */ - if (ch >= 0x10000) { - scriptRun->scriptLimit -= 1; - } - - break; - } - } - - - if (pRunStart != NULL) { - *pRunStart = scriptRun->scriptStart; - } - - if (pRunLimit != NULL) { - *pRunLimit = scriptRun->scriptLimit; - } - - if (pRunScript != NULL) { - *pRunScript = scriptRun->scriptCode; - } - - return TRUE; -} +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 1999-2016, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* File USC_IMPL.C +* +* Modification History: +* +* Date Name Description +* 07/08/2002 Eric Mader Creation. +****************************************************************************** +*/ + +#include "unicode/uscript.h" +#include "usc_impl.h" +#include "cmemory.h" + +#define PAREN_STACK_DEPTH 32 + +#define MOD(sp) ((sp) % PAREN_STACK_DEPTH) +#define LIMIT_INC(sp) (((sp) < PAREN_STACK_DEPTH)? (sp) + 1 : PAREN_STACK_DEPTH) +#define INC(sp,count) (MOD((sp) + (count))) +#define INC1(sp) (INC(sp, 1)) +#define DEC(sp,count) (MOD((sp) + PAREN_STACK_DEPTH - (count))) +#define DEC1(sp) (DEC(sp, 1)) +#define STACK_IS_EMPTY(scriptRun) ((scriptRun)->pushCount <= 0) +#define STACK_IS_NOT_EMPTY(scriptRun) (! STACK_IS_EMPTY(scriptRun)) +#define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP]) +#define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0) + +struct ParenStackEntry +{ + int32_t pairIndex; + UScriptCode scriptCode; +}; + +struct UScriptRun +{ + int32_t textLength; + const UChar *textArray; + + int32_t scriptStart; + int32_t scriptLimit; + UScriptCode scriptCode; + + struct ParenStackEntry parenStack[PAREN_STACK_DEPTH]; + int32_t parenSP; + int32_t pushCount; + int32_t fixupCount; +}; + +static int8_t highBit(int32_t value); + +static const UChar32 pairedChars[] = { + 0x0028, 0x0029, /* ascii paired punctuation */ + 0x003c, 0x003e, + 0x005b, 0x005d, + 0x007b, 0x007d, + 0x00ab, 0x00bb, /* guillemets */ + 0x2018, 0x2019, /* general punctuation */ + 0x201c, 0x201d, + 0x2039, 0x203a, + 0x3008, 0x3009, /* chinese paired punctuation */ + 0x300a, 0x300b, + 0x300c, 0x300d, + 0x300e, 0x300f, + 0x3010, 0x3011, + 0x3014, 0x3015, + 0x3016, 0x3017, + 0x3018, 0x3019, + 0x301a, 0x301b +}; + +static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode) +{ + scriptRun->pushCount = LIMIT_INC(scriptRun->pushCount); + scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount); + + scriptRun->parenSP = INC1(scriptRun->parenSP); + scriptRun->parenStack[scriptRun->parenSP].pairIndex = pairIndex; + scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptCode; +} + +static void pop(UScriptRun *scriptRun) +{ + if (STACK_IS_EMPTY(scriptRun)) { + return; + } + + if (scriptRun->fixupCount > 0) { + scriptRun->fixupCount -= 1; + } + + scriptRun->pushCount -= 1; + scriptRun->parenSP = DEC1(scriptRun->parenSP); + + /* If the stack is now empty, reset the stack + pointers to their initial values. + */ + if (STACK_IS_EMPTY(scriptRun)) { + scriptRun->parenSP = -1; + } +} + +static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode) +{ + int32_t fixupSP = DEC(scriptRun->parenSP, scriptRun->fixupCount); + + while (scriptRun->fixupCount-- > 0) { + fixupSP = INC1(fixupSP); + scriptRun->parenStack[fixupSP].scriptCode = scriptCode; + } +} + +static int8_t +highBit(int32_t value) +{ + int8_t bit = 0; + + if (value <= 0) { + return -32; + } + + if (value >= 1 << 16) { + value >>= 16; + bit += 16; + } + + if (value >= 1 << 8) { + value >>= 8; + bit += 8; + } + + if (value >= 1 << 4) { + value >>= 4; + bit += 4; + } + + if (value >= 1 << 2) { + value >>= 2; + bit += 2; + } + + if (value >= 1 << 1) { + //value >>= 1; + bit += 1; + } + + return bit; +} + +static int32_t +getPairIndex(UChar32 ch) +{ + int32_t pairedCharCount = UPRV_LENGTHOF(pairedChars); + int32_t pairedCharPower = 1 << highBit(pairedCharCount); + int32_t pairedCharExtra = pairedCharCount - pairedCharPower; + + int32_t probe = pairedCharPower; + int32_t pairIndex = 0; + + if (ch >= pairedChars[pairedCharExtra]) { + pairIndex = pairedCharExtra; + } + + while (probe > (1 << 0)) { + probe >>= 1; + + if (ch >= pairedChars[pairIndex + probe]) { + pairIndex += probe; + } + } + + if (pairedChars[pairIndex] != ch) { + pairIndex = -1; + } + + return pairIndex; +} + +static UBool +sameScript(UScriptCode scriptOne, UScriptCode scriptTwo) +{ + return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo; +} + +U_CAPI UScriptRun * U_EXPORT2 +uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode) +{ + UScriptRun *result = NULL; + + if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) { + return NULL; + } + + result = (UScriptRun *)uprv_malloc(sizeof (UScriptRun)); + + if (result == NULL) { + *pErrorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + + uscript_setRunText(result, src, length, pErrorCode); + + /* Release the UScriptRun if uscript_setRunText() returns an error */ + if (U_FAILURE(*pErrorCode)) { + uprv_free(result); + result = NULL; + } + + return result; +} + +U_CAPI void U_EXPORT2 +uscript_closeRun(UScriptRun *scriptRun) +{ + if (scriptRun != NULL) { + uprv_free(scriptRun); + } +} + +U_CAPI void U_EXPORT2 +uscript_resetRun(UScriptRun *scriptRun) +{ + if (scriptRun != NULL) { + scriptRun->scriptStart = 0; + scriptRun->scriptLimit = 0; + scriptRun->scriptCode = USCRIPT_INVALID_CODE; + scriptRun->parenSP = -1; + scriptRun->pushCount = 0; + scriptRun->fixupCount = 0; + } +} + +U_CAPI void U_EXPORT2 +uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode) +{ + if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) { + return; + } + + if (scriptRun == NULL || length < 0 || ((src == NULL) != (length == 0))) { + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + scriptRun->textArray = src; + scriptRun->textLength = length; + + uscript_resetRun(scriptRun); +} + +U_CAPI UBool U_EXPORT2 +uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript) +{ + UErrorCode error = U_ZERO_ERROR; + + /* if we've fallen off the end of the text, we're done */ + if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) { + return FALSE; + } + + SYNC_FIXUP(scriptRun); + scriptRun->scriptCode = USCRIPT_COMMON; + + for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) { + UChar high = scriptRun->textArray[scriptRun->scriptLimit]; + UChar32 ch = high; + UScriptCode sc; + int32_t pairIndex; + + /* + * if the character is a high surrogate and it's not the last one + * in the text, see if it's followed by a low surrogate + */ + if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) { + UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1]; + + /* + * if it is followed by a low surrogate, + * consume it and form the full character + */ + if (low >= 0xDC00 && low <= 0xDFFF) { + ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000; + scriptRun->scriptLimit += 1; + } + } + + sc = uscript_getScript(ch, &error); + pairIndex = getPairIndex(ch); + + /* + * Paired character handling: + * + * if it's an open character, push it onto the stack. + * if it's a close character, find the matching open on the + * stack, and use that script code. Any non-matching open + * characters above it on the stack will be poped. + */ + if (pairIndex >= 0) { + if ((pairIndex & 1) == 0) { + push(scriptRun, pairIndex, scriptRun->scriptCode); + } else { + int32_t pi = pairIndex & ~1; + + while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) { + pop(scriptRun); + } + + if (STACK_IS_NOT_EMPTY(scriptRun)) { + sc = TOP(scriptRun).scriptCode; + } + } + } + + if (sameScript(scriptRun->scriptCode, sc)) { + if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) { + scriptRun->scriptCode = sc; + + fixup(scriptRun, scriptRun->scriptCode); + } + + /* + * if this character is a close paired character, + * pop the matching open character from the stack + */ + if (pairIndex >= 0 && (pairIndex & 1) != 0) { + pop(scriptRun); + } + } else { + /* + * if the run broke on a surrogate pair, + * end it before the high surrogate + */ + if (ch >= 0x10000) { + scriptRun->scriptLimit -= 1; + } + + break; + } + } + + + if (pRunStart != NULL) { + *pRunStart = scriptRun->scriptStart; + } + + if (pRunLimit != NULL) { + *pRunLimit = scriptRun->scriptLimit; + } + + if (pRunScript != NULL) { + *pRunScript = scriptRun->scriptCode; + } + + return TRUE; +} |