diff options
author | neksard <neksard@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
commit | 1d9c550e7c38e051d7961f576013a482003a70d9 (patch) | |
tree | b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/common/unames.cpp | |
parent | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff) | |
download | ydb-1d9c550e7c38e051d7961f576013a482003a70d9.tar.gz |
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/common/unames.cpp')
-rw-r--r-- | contrib/libs/icu/common/unames.cpp | 4136 |
1 files changed, 2068 insertions, 2068 deletions
diff --git a/contrib/libs/icu/common/unames.cpp b/contrib/libs/icu/common/unames.cpp index cf44d43613..5776058f95 100644 --- a/contrib/libs/icu/common/unames.cpp +++ b/contrib/libs/icu/common/unames.cpp @@ -1,1578 +1,1578 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 1999-2014, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: unames.c +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: unames.c * encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 1999oct04 -* created by: Markus W. Scherer -*/ - -#include "unicode/utypes.h" -#include "unicode/putil.h" -#include "unicode/uchar.h" -#include "unicode/udata.h" -#include "unicode/utf.h" -#include "unicode/utf16.h" -#include "uassert.h" -#include "ustr_imp.h" -#include "umutex.h" -#include "cmemory.h" -#include "cstring.h" -#include "ucln_cmn.h" -#include "udataswp.h" -#include "uprops.h" - -U_NAMESPACE_BEGIN - -/* prototypes ------------------------------------------------------------- */ - -static const char DATA_NAME[] = "unames"; -static const char DATA_TYPE[] = "icu"; - -#define GROUP_SHIFT 5 -#define LINES_PER_GROUP (1L<<GROUP_SHIFT) -#define GROUP_MASK (LINES_PER_GROUP-1) - -/* - * This struct was replaced by explicitly accessing equivalent - * fields from triples of uint16_t. 
- * The Group struct was padded to 8 bytes on compilers for early ARM CPUs, - * which broke the assumption that sizeof(Group)==6 and that the ++ operator - * would advance by 6 bytes (3 uint16_t). - * - * We can't just change the data structure because it's loaded from a data file, - * and we don't want to make it less compact, so we changed the access code. - * - * For details see ICU tickets 6331 and 6008. -typedef struct { - uint16_t groupMSB, - offsetHigh, offsetLow; / * avoid padding * / -} Group; - */ -enum { - GROUP_MSB, - GROUP_OFFSET_HIGH, - GROUP_OFFSET_LOW, - GROUP_LENGTH -}; - -/* - * Get the 32-bit group offset. - * @param group (const uint16_t *) pointer to a Group triple of uint16_t - * @return group offset (int32_t) - */ -#define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]) - -#define NEXT_GROUP(group) ((group)+GROUP_LENGTH) -#define PREV_GROUP(group) ((group)-GROUP_LENGTH) - -typedef struct { - uint32_t start, end; - uint8_t type, variant; - uint16_t size; -} AlgorithmicRange; - -typedef struct { - uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; -} UCharNames; - -/* - * Get the groups table from a UCharNames struct. - * The groups table consists of one uint16_t groupCount followed by - * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH - * and the comment for the old struct Group above. - * - * @param names (const UCharNames *) pointer to the UCharNames indexes - * @return (const uint16_t *) pointer to the groups table - */ -#define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset) - -typedef struct { - const char *otherName; - UChar32 code; -} FindName; - -#define DO_FIND_NAME NULL - -static UDataMemory *uCharNamesData=NULL; -static UCharNames *uCharNames=NULL; -static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER; - -/* - * Maximum length of character names (regular & 1.0). 
- */ -static int32_t gMaxNameLength=0; - -/* - * Set of chars used in character names (regular & 1.0). - * Chars are platform-dependent (can be EBCDIC). - */ -static uint32_t gNameSet[8]={ 0 }; - -#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT -#define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1 -#define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2 - -#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3) - -static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = { - "unassigned", - "uppercase letter", - "lowercase letter", - "titlecase letter", - "modifier letter", - "other letter", - "non spacing mark", - "enclosing mark", - "combining spacing mark", - "decimal digit number", - "letter number", - "other number", - "space separator", - "line separator", - "paragraph separator", - "control", - "format", - "private use area", - "surrogate", - "dash punctuation", - "start punctuation", - "end punctuation", - "connector punctuation", - "other punctuation", - "math symbol", - "currency symbol", - "modifier symbol", - "other symbol", - "initial punctuation", - "final punctuation", - "noncharacter", - "lead surrogate", - "trail surrogate" -}; - -/* implementation ----------------------------------------------------------- */ - -static UBool U_CALLCONV unames_cleanup(void) -{ - if(uCharNamesData) { - udata_close(uCharNamesData); - uCharNamesData = NULL; - } - if(uCharNames) { - uCharNames = NULL; - } - gCharNamesInitOnce.reset(); - gMaxNameLength=0; - return TRUE; -} - -static UBool U_CALLCONV -isAcceptable(void * /*context*/, - const char * /*type*/, const char * /*name*/, - const UDataInfo *pInfo) { - return (UBool)( - pInfo->size>=20 && - pInfo->isBigEndian==U_IS_BIG_ENDIAN && - pInfo->charsetFamily==U_CHARSET_FAMILY && - pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ - pInfo->dataFormat[1]==0x6e && - pInfo->dataFormat[2]==0x61 && - pInfo->dataFormat[3]==0x6d && - pInfo->formatVersion[0]==1); -} - -static void U_CALLCONV 
-loadCharNames(UErrorCode &status) { - U_ASSERT(uCharNamesData == NULL); - U_ASSERT(uCharNames == NULL); - - uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status); - if(U_FAILURE(status)) { - uCharNamesData = NULL; - } else { - uCharNames = (UCharNames *)udata_getMemory(uCharNamesData); - } - ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup); -} - - -static UBool -isDataLoaded(UErrorCode *pErrorCode) { - umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode); - return U_SUCCESS(*pErrorCode); -} - +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999oct04 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/uchar.h" +#include "unicode/udata.h" +#include "unicode/utf.h" +#include "unicode/utf16.h" +#include "uassert.h" +#include "ustr_imp.h" +#include "umutex.h" +#include "cmemory.h" +#include "cstring.h" +#include "ucln_cmn.h" +#include "udataswp.h" +#include "uprops.h" + +U_NAMESPACE_BEGIN + +/* prototypes ------------------------------------------------------------- */ + +static const char DATA_NAME[] = "unames"; +static const char DATA_TYPE[] = "icu"; + +#define GROUP_SHIFT 5 +#define LINES_PER_GROUP (1L<<GROUP_SHIFT) +#define GROUP_MASK (LINES_PER_GROUP-1) + +/* + * This struct was replaced by explicitly accessing equivalent + * fields from triples of uint16_t. + * The Group struct was padded to 8 bytes on compilers for early ARM CPUs, + * which broke the assumption that sizeof(Group)==6 and that the ++ operator + * would advance by 6 bytes (3 uint16_t). + * + * We can't just change the data structure because it's loaded from a data file, + * and we don't want to make it less compact, so we changed the access code. + * + * For details see ICU tickets 6331 and 6008. 
+typedef struct { + uint16_t groupMSB, + offsetHigh, offsetLow; / * avoid padding * / +} Group; + */ +enum { + GROUP_MSB, + GROUP_OFFSET_HIGH, + GROUP_OFFSET_LOW, + GROUP_LENGTH +}; + +/* + * Get the 32-bit group offset. + * @param group (const uint16_t *) pointer to a Group triple of uint16_t + * @return group offset (int32_t) + */ +#define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]) + +#define NEXT_GROUP(group) ((group)+GROUP_LENGTH) +#define PREV_GROUP(group) ((group)-GROUP_LENGTH) + +typedef struct { + uint32_t start, end; + uint8_t type, variant; + uint16_t size; +} AlgorithmicRange; + +typedef struct { + uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; +} UCharNames; + +/* + * Get the groups table from a UCharNames struct. + * The groups table consists of one uint16_t groupCount followed by + * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH + * and the comment for the old struct Group above. + * + * @param names (const UCharNames *) pointer to the UCharNames indexes + * @return (const uint16_t *) pointer to the groups table + */ +#define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset) + +typedef struct { + const char *otherName; + UChar32 code; +} FindName; + +#define DO_FIND_NAME NULL + +static UDataMemory *uCharNamesData=NULL; +static UCharNames *uCharNames=NULL; +static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER; + +/* + * Maximum length of character names (regular & 1.0). + */ +static int32_t gMaxNameLength=0; + +/* + * Set of chars used in character names (regular & 1.0). + * Chars are platform-dependent (can be EBCDIC). 
+ */ +static uint32_t gNameSet[8]={ 0 }; + +#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT +#define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1 +#define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2 + +#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3) + +static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = { + "unassigned", + "uppercase letter", + "lowercase letter", + "titlecase letter", + "modifier letter", + "other letter", + "non spacing mark", + "enclosing mark", + "combining spacing mark", + "decimal digit number", + "letter number", + "other number", + "space separator", + "line separator", + "paragraph separator", + "control", + "format", + "private use area", + "surrogate", + "dash punctuation", + "start punctuation", + "end punctuation", + "connector punctuation", + "other punctuation", + "math symbol", + "currency symbol", + "modifier symbol", + "other symbol", + "initial punctuation", + "final punctuation", + "noncharacter", + "lead surrogate", + "trail surrogate" +}; + +/* implementation ----------------------------------------------------------- */ + +static UBool U_CALLCONV unames_cleanup(void) +{ + if(uCharNamesData) { + udata_close(uCharNamesData); + uCharNamesData = NULL; + } + if(uCharNames) { + uCharNames = NULL; + } + gCharNamesInitOnce.reset(); + gMaxNameLength=0; + return TRUE; +} + +static UBool U_CALLCONV +isAcceptable(void * /*context*/, + const char * /*type*/, const char * /*name*/, + const UDataInfo *pInfo) { + return (UBool)( + pInfo->size>=20 && + pInfo->isBigEndian==U_IS_BIG_ENDIAN && + pInfo->charsetFamily==U_CHARSET_FAMILY && + pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ + pInfo->dataFormat[1]==0x6e && + pInfo->dataFormat[2]==0x61 && + pInfo->dataFormat[3]==0x6d && + pInfo->formatVersion[0]==1); +} + +static void U_CALLCONV +loadCharNames(UErrorCode &status) { + U_ASSERT(uCharNamesData == NULL); + U_ASSERT(uCharNames == NULL); + + uCharNamesData = udata_openChoice(NULL, 
DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status); + if(U_FAILURE(status)) { + uCharNamesData = NULL; + } else { + uCharNames = (UCharNames *)udata_getMemory(uCharNamesData); + } + ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup); +} + + +static UBool +isDataLoaded(UErrorCode *pErrorCode) { + umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode); + return U_SUCCESS(*pErrorCode); +} + #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) UPRV_BLOCK_MACRO_BEGIN { \ - if((bufferLength)>0) { \ - *(buffer)++=c; \ - --(bufferLength); \ - } \ - ++(bufferPos); \ + if((bufferLength)>0) { \ + *(buffer)++=c; \ + --(bufferLength); \ + } \ + ++(bufferPos); \ } UPRV_BLOCK_MACRO_END - -#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT - -/* - * Important: expandName() and compareName() are almost the same - - * apply fixes to both. - * - * UnicodeData.txt uses ';' as a field separator, so no - * field can contain ';' as part of its contents. - * In unames.dat, it is marked as token[';']==-1 only if the - * semicolon is used in the data file - which is iff we - * have Unicode 1.0 names or ISO comments or aliases. - * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases - * although we know that it will never be part of a name. - */ -static uint16_t -expandName(UCharNames *names, - const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, - char *buffer, uint16_t bufferLength) { - uint16_t *tokens=(uint16_t *)names+8; - uint16_t token, tokenCount=*tokens++, bufferPos=0; - uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; - uint8_t c; - - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - /* - * skip the modern name if it is not requested _and_ - * if the semicolon byte value is a character, not a token number - */ - if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { - int fieldIndex= nameChoice==U_ISO_COMMENT ? 
2 : nameChoice; - do { - while(nameLength>0) { - --nameLength; - if(*name++==';') { - break; - } - } - } while(--fieldIndex>0); - } else { - /* - * the semicolon byte value is a token number, therefore - * only modern names are stored in unames.dat and there is no - * such requested alternate name here - */ - nameLength=0; - } - } - - /* write each letter directly, and write a token word per token */ - while(nameLength>0) { - --nameLength; - c=*name++; - - if(c>=tokenCount) { - if(c!=';') { - /* implicit letter */ - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } else { - /* finished */ - break; - } - } else { - token=tokens[c]; - if(token==(uint16_t)(-2)) { - /* this is a lead byte for a double-byte token */ - token=tokens[c<<8|*name++]; - --nameLength; - } - if(token==(uint16_t)(-1)) { - if(c!=';') { - /* explicit letter */ - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } else { - /* stop, but skip the semicolon if we are seeking - extended names and there was no 2.0 name but there - is a 1.0 name. */ - if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { - if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { - continue; - } - } - /* finished */ - break; - } - } else { - /* write token word */ - uint8_t *tokenString=tokenStrings+token; - while((c=*tokenString++)!=0) { - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } - } - } - } - - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - - return bufferPos; -} - -/* - * compareName() is almost the same as expandName() except that it compares - * the currently expanded name to an input name. - * It returns the match/no match result as soon as possible. 
- */ -static UBool -compareName(UCharNames *names, - const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, - const char *otherName) { - uint16_t *tokens=(uint16_t *)names+8; - uint16_t token, tokenCount=*tokens++; - uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; - uint8_t c; - const char *origOtherName = otherName; - - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - /* - * skip the modern name if it is not requested _and_ - * if the semicolon byte value is a character, not a token number - */ - if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { - int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; - do { - while(nameLength>0) { - --nameLength; - if(*name++==';') { - break; - } - } - } while(--fieldIndex>0); - } else { - /* - * the semicolon byte value is a token number, therefore - * only modern names are stored in unames.dat and there is no - * such requested alternate name here - */ - nameLength=0; - } - } - - /* compare each letter directly, and compare a token word per token */ - while(nameLength>0) { - --nameLength; - c=*name++; - - if(c>=tokenCount) { - if(c!=';') { - /* implicit letter */ - if((char)c!=*otherName++) { - return FALSE; - } - } else { - /* finished */ - break; - } - } else { - token=tokens[c]; - if(token==(uint16_t)(-2)) { - /* this is a lead byte for a double-byte token */ - token=tokens[c<<8|*name++]; - --nameLength; - } - if(token==(uint16_t)(-1)) { - if(c!=';') { - /* explicit letter */ - if((char)c!=*otherName++) { - return FALSE; - } - } else { - /* stop, but skip the semicolon if we are seeking - extended names and there was no 2.0 name but there - is a 1.0 name. 
*/ - if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) { - if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { - continue; - } - } - /* finished */ - break; - } - } else { - /* write token word */ - uint8_t *tokenString=tokenStrings+token; - while((c=*tokenString++)!=0) { - if((char)c!=*otherName++) { - return FALSE; - } - } - } - } - } - - /* complete match? */ - return (UBool)(*otherName==0); -} - -static uint8_t getCharCat(UChar32 cp) { - uint8_t cat; - - if (U_IS_UNICODE_NONCHAR(cp)) { - return U_NONCHARACTER_CODE_POINT; - } - - if ((cat = u_charType(cp)) == U_SURROGATE) { - cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE; - } - - return cat; -} - -static const char *getCharCatName(UChar32 cp) { - uint8_t cat = getCharCat(cp); - - /* Return unknown if the table of names above is not up to - date. */ - - if (cat >= UPRV_LENGTHOF(charCatNames)) { - return "unknown"; - } else { - return charCatNames[cat]; - } -} - -static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { - const char *catname = getCharCatName(code); - uint16_t length = 0; - - UChar32 cp; - int ndigits, i; - - WRITE_CHAR(buffer, bufferLength, length, '<'); - while (catname[length - 1]) { - WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]); - } - WRITE_CHAR(buffer, bufferLength, length, '-'); - for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) - ; - if (ndigits < 4) - ndigits = 4; - for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) { - uint8_t v = (uint8_t)(cp & 0xf); - buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10); - } - buffer += ndigits; + +#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT + +/* + * Important: expandName() and compareName() are almost the same - + * apply fixes to both. + * + * UnicodeData.txt uses ';' as a field separator, so no + * field can contain ';' as part of its contents. 
+ * In unames.dat, it is marked as token[';']==-1 only if the + * semicolon is used in the data file - which is iff we + * have Unicode 1.0 names or ISO comments or aliases. + * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases + * although we know that it will never be part of a name. + */ +static uint16_t +expandName(UCharNames *names, + const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, + char *buffer, uint16_t bufferLength) { + uint16_t *tokens=(uint16_t *)names+8; + uint16_t token, tokenCount=*tokens++, bufferPos=0; + uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; + uint8_t c; + + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + /* + * skip the modern name if it is not requested _and_ + * if the semicolon byte value is a character, not a token number + */ + if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { + int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; + do { + while(nameLength>0) { + --nameLength; + if(*name++==';') { + break; + } + } + } while(--fieldIndex>0); + } else { + /* + * the semicolon byte value is a token number, therefore + * only modern names are stored in unames.dat and there is no + * such requested alternate name here + */ + nameLength=0; + } + } + + /* write each letter directly, and write a token word per token */ + while(nameLength>0) { + --nameLength; + c=*name++; + + if(c>=tokenCount) { + if(c!=';') { + /* implicit letter */ + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } else { + /* finished */ + break; + } + } else { + token=tokens[c]; + if(token==(uint16_t)(-2)) { + /* this is a lead byte for a double-byte token */ + token=tokens[c<<8|*name++]; + --nameLength; + } + if(token==(uint16_t)(-1)) { + if(c!=';') { + /* explicit letter */ + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } else { + /* stop, but skip the semicolon if we are seeking + extended names and there was no 2.0 name but there + is a 1.0 
name. */ + if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { + if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { + continue; + } + } + /* finished */ + break; + } + } else { + /* write token word */ + uint8_t *tokenString=tokenStrings+token; + while((c=*tokenString++)!=0) { + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } + } + } + } + + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + + return bufferPos; +} + +/* + * compareName() is almost the same as expandName() except that it compares + * the currently expanded name to an input name. + * It returns the match/no match result as soon as possible. + */ +static UBool +compareName(UCharNames *names, + const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, + const char *otherName) { + uint16_t *tokens=(uint16_t *)names+8; + uint16_t token, tokenCount=*tokens++; + uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; + uint8_t c; + const char *origOtherName = otherName; + + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + /* + * skip the modern name if it is not requested _and_ + * if the semicolon byte value is a character, not a token number + */ + if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { + int fieldIndex= nameChoice==U_ISO_COMMENT ? 
2 : nameChoice; + do { + while(nameLength>0) { + --nameLength; + if(*name++==';') { + break; + } + } + } while(--fieldIndex>0); + } else { + /* + * the semicolon byte value is a token number, therefore + * only modern names are stored in unames.dat and there is no + * such requested alternate name here + */ + nameLength=0; + } + } + + /* compare each letter directly, and compare a token word per token */ + while(nameLength>0) { + --nameLength; + c=*name++; + + if(c>=tokenCount) { + if(c!=';') { + /* implicit letter */ + if((char)c!=*otherName++) { + return FALSE; + } + } else { + /* finished */ + break; + } + } else { + token=tokens[c]; + if(token==(uint16_t)(-2)) { + /* this is a lead byte for a double-byte token */ + token=tokens[c<<8|*name++]; + --nameLength; + } + if(token==(uint16_t)(-1)) { + if(c!=';') { + /* explicit letter */ + if((char)c!=*otherName++) { + return FALSE; + } + } else { + /* stop, but skip the semicolon if we are seeking + extended names and there was no 2.0 name but there + is a 1.0 name. */ + if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) { + if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { + continue; + } + } + /* finished */ + break; + } + } else { + /* write token word */ + uint8_t *tokenString=tokenStrings+token; + while((c=*tokenString++)!=0) { + if((char)c!=*otherName++) { + return FALSE; + } + } + } + } + } + + /* complete match? */ + return (UBool)(*otherName==0); +} + +static uint8_t getCharCat(UChar32 cp) { + uint8_t cat; + + if (U_IS_UNICODE_NONCHAR(cp)) { + return U_NONCHARACTER_CODE_POINT; + } + + if ((cat = u_charType(cp)) == U_SURROGATE) { + cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE; + } + + return cat; +} + +static const char *getCharCatName(UChar32 cp) { + uint8_t cat = getCharCat(cp); + + /* Return unknown if the table of names above is not up to + date. 
*/ + + if (cat >= UPRV_LENGTHOF(charCatNames)) { + return "unknown"; + } else { + return charCatNames[cat]; + } +} + +static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { + const char *catname = getCharCatName(code); + uint16_t length = 0; + + UChar32 cp; + int ndigits, i; + + WRITE_CHAR(buffer, bufferLength, length, '<'); + while (catname[length - 1]) { + WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]); + } + WRITE_CHAR(buffer, bufferLength, length, '-'); + for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) + ; + if (ndigits < 4) + ndigits = 4; + for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) { + uint8_t v = (uint8_t)(cp & 0xf); + buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10); + } + buffer += ndigits; length += static_cast<uint16_t>(ndigits); - WRITE_CHAR(buffer, bufferLength, length, '>'); - - return length; -} - -/* - * getGroup() does a binary search for the group that contains the - * Unicode code point "code". - * The return value is always a valid Group* that may contain "code" - * or else is the highest group before "code". - * If the lowest group is after "code", then that one is returned. - */ -static const uint16_t * -getGroup(UCharNames *names, uint32_t code) { - const uint16_t *groups=GET_GROUPS(names); - uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), - start=0, - limit=*groups++, - number; - - /* binary search for the group of names that contains the one for code */ - while(start<limit-1) { - number=(uint16_t)((start+limit)/2); - if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) { - limit=number; - } else { - start=number; - } - } - - /* return this regardless of whether it is an exact match */ - return groups+start*GROUP_LENGTH; -} - -/* - * expandGroupLengths() reads a block of compressed lengths of 32 strings and - * expands them into offsets and lengths for each string. 
- * Lengths are stored with a variable-width encoding in consecutive nibbles: - * If a nibble<0xc, then it is the length itself (0=empty string). - * If a nibble>=0xc, then it forms a length value with the following nibble. - * Calculation see below. - * The offsets and lengths arrays must be at least 33 (one more) long because - * there is no check here at the end if the last nibble is still used. - */ -static const uint8_t * -expandGroupLengths(const uint8_t *s, - uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { - /* read the lengths of the 32 strings in this group and get each string's offset */ - uint16_t i=0, offset=0, length=0; - uint8_t lengthByte; - - /* all 32 lengths must be read to get the offset of the first group string */ - while(i<LINES_PER_GROUP) { - lengthByte=*s++; - - /* read even nibble - MSBs of lengthByte */ - if(length>=12) { - /* double-nibble length spread across two bytes */ - length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); - lengthByte&=0xf; - } else if((lengthByte /* &0xf0 */)>=0xc0) { - /* double-nibble length spread across this one byte */ - length=(uint16_t)((lengthByte&0x3f)+12); - } else { - /* single-nibble length in MSBs */ - length=(uint16_t)(lengthByte>>4); - lengthByte&=0xf; - } - - *offsets++=offset; - *lengths++=length; - - offset+=length; - ++i; - - /* read odd nibble - LSBs of lengthByte */ - if((lengthByte&0xf0)==0) { - /* this nibble was not consumed for a double-nibble length above */ - length=lengthByte; - if(length<12) { - /* single-nibble length in LSBs */ - *offsets++=offset; - *lengths++=length; - - offset+=length; - ++i; - } - } else { - length=0; /* prevent double-nibble detection in the next iteration */ - } - } - - /* now, s is at the first group string */ - return s; -} - -static uint16_t -expandGroupName(UCharNames *names, const uint16_t *group, - uint16_t lineNumber, UCharNameChoice nameChoice, - char *buffer, uint16_t bufferLength) { - uint16_t offsets[LINES_PER_GROUP+2], 
lengths[LINES_PER_GROUP+2]; - const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); - s=expandGroupLengths(s, offsets, lengths); - return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice, - buffer, bufferLength); -} - -static uint16_t -getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, - char *buffer, uint16_t bufferLength) { - const uint16_t *group=getGroup(names, code); - if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) { - return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice, - buffer, bufferLength); - } else { - /* group not found */ - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - return 0; - } -} - -/* - * enumGroupNames() enumerates all the names in a 32-group - * and either calls the enumerator function or finds a given input name. - */ -static UBool -enumGroupNames(UCharNames *names, const uint16_t *group, - UChar32 start, UChar32 end, - UEnumCharNamesFn *fn, void *context, - UCharNameChoice nameChoice) { - uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; - const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); - - s=expandGroupLengths(s, offsets, lengths); - if(fn!=DO_FIND_NAME) { - char buffer[200]; - uint16_t length; - - while(start<=end) { - length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer)); - if (!length && nameChoice == U_EXTENDED_CHAR_NAME) { - buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; - } - /* here, we assume that the buffer is large enough */ - if(length>0) { - if(!fn(context, start, nameChoice, buffer, length)) { - return FALSE; - } - } - ++start; - } - } else { - const char *otherName=((FindName *)context)->otherName; - while(start<=end) { - if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) { - ((FindName *)context)->code=start; - return FALSE; - } 
- ++start; - } - } - return TRUE; -} - -/* - * enumExtNames enumerate extended names. - * It only needs to do it if it is called with a real function and not - * with the dummy DO_FIND_NAME, because u_charFromName() does a check - * for extended names by itself. - */ -static UBool -enumExtNames(UChar32 start, UChar32 end, - UEnumCharNamesFn *fn, void *context) -{ - if(fn!=DO_FIND_NAME) { - char buffer[200]; - uint16_t length; - - while(start<=end) { - buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; - /* here, we assume that the buffer is large enough */ - if(length>0) { - if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { - return FALSE; - } - } - ++start; - } - } - - return TRUE; -} - -static UBool -enumNames(UCharNames *names, - UChar32 start, UChar32 limit, - UEnumCharNamesFn *fn, void *context, - UCharNameChoice nameChoice) { - uint16_t startGroupMSB, endGroupMSB, groupCount; - const uint16_t *group, *groupLimit; - - startGroupMSB=(uint16_t)(start>>GROUP_SHIFT); - endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); - - /* find the group that contains start, or the highest before it */ - group=getGroup(names, start); - - if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) { - /* enumerate synthetic names between start and the group start */ - UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT); - if(extLimit>limit) { - extLimit=limit; - } - if(!enumExtNames(start, extLimit-1, fn, context)) { - return FALSE; - } - start=extLimit; - } - - if(startGroupMSB==endGroupMSB) { - if(startGroupMSB==group[GROUP_MSB]) { - /* if start and limit-1 are in the same group, then enumerate only in that one */ - return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice); - } - } else { - const uint16_t *groups=GET_GROUPS(names); - groupCount=*groups++; - groupLimit=groups+groupCount*GROUP_LENGTH; - - if(startGroupMSB==group[GROUP_MSB]) { - /* enumerate characters in the partial start group */ - 
if((start&GROUP_MASK)!=0) { - if(!enumGroupNames(names, group, - start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1, - fn, context, nameChoice)) { - return FALSE; - } - group=NEXT_GROUP(group); /* continue with the next group */ - } - } else if(startGroupMSB>group[GROUP_MSB]) { - /* make sure that we start enumerating with the first group after start */ - const uint16_t *nextGroup=NEXT_GROUP(group); - if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) { - UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; - if (end > limit) { - end = limit; - } - if (!enumExtNames(start, end - 1, fn, context)) { - return FALSE; - } - } - group=nextGroup; - } - - /* enumerate entire groups between the start- and end-groups */ - while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) { - const uint16_t *nextGroup; - start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT; - if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) { - return FALSE; - } - nextGroup=NEXT_GROUP(group); - if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { - UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; - if (end > limit) { - end = limit; - } - if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) { - return FALSE; - } - } - group=nextGroup; - } - - /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */ - if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) { - return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice); - } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) { - UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT; - if (next > start) { - start = next; - } - } else { - return TRUE; - } - } - - /* we have not found a group, which means everything is made of - extended names. 
*/ - if (nameChoice == U_EXTENDED_CHAR_NAME) { - if (limit > UCHAR_MAX_VALUE + 1) { - limit = UCHAR_MAX_VALUE + 1; - } - return enumExtNames(start, limit - 1, fn, context); - } - - return TRUE; -} - -static uint16_t -writeFactorSuffix(const uint16_t *factors, uint16_t count, - const char *s, /* suffix elements */ - uint32_t code, - uint16_t indexes[8], /* output fields from here */ - const char *elementBases[8], const char *elements[8], - char *buffer, uint16_t bufferLength) { - uint16_t i, factor, bufferPos=0; - char c; - - /* write elements according to the factors */ - - /* - * the factorized elements are determined by modulo arithmetic - * with the factors of this algorithm - * - * note that for fewer operations, count is decremented here - */ - --count; - for(i=count; i>0; --i) { - factor=factors[i]; - indexes[i]=(uint16_t)(code%factor); - code/=factor; - } - /* - * we don't need to calculate the last modulus because start<=code<=end - * guarantees here that code<=factors[0] - */ - indexes[0]=(uint16_t)code; - - /* write each element */ - for(;;) { - if(elementBases!=NULL) { - *elementBases++=s; - } - - /* skip indexes[i] strings */ - factor=indexes[i]; - while(factor>0) { - while(*s++!=0) {} - --factor; - } - if(elements!=NULL) { - *elements++=s; - } - - /* write element */ - while((c=*s++)!=0) { - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } - - /* we do not need to perform the rest of this loop for i==count - break here */ - if(i>=count) { - break; - } - - /* skip the rest of the strings for this factors[i] */ - factor=(uint16_t)(factors[i]-indexes[i]-1); - while(factor>0) { - while(*s++!=0) {} - --factor; - } - - ++i; - } - - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - - return bufferPos; -} - -/* - * Important: - * Parts of findAlgName() are almost the same as some of getAlgName(). - * Fixes must be applied to both. 
- */ -static uint16_t -getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, - char *buffer, uint16_t bufferLength) { - uint16_t bufferPos=0; - - /* Only the normative character name can be algorithmic. */ - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - return 0; - } - - switch(range->type) { - case 0: { - /* name = prefix hex-digits */ - const char *s=(const char *)(range+1); - char c; - - uint16_t i, count; - - /* copy prefix */ - while((c=*s++)!=0) { - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } - - /* write hexadecimal code point value */ - count=range->variant; - - /* zero-terminate */ - if(count<bufferLength) { - buffer[count]=0; - } - - for(i=count; i>0;) { - if(--i<bufferLength) { - c=(char)(code&0xf); - if(c<10) { - c+='0'; - } else { - c+='A'-10; - } - buffer[i]=c; - } - code>>=4; - } - - bufferPos+=count; - break; - } - case 1: { - /* name = prefix factorized-elements */ - uint16_t indexes[8]; - const uint16_t *factors=(const uint16_t *)(range+1); - uint16_t count=range->variant; - const char *s=(const char *)(factors+count); - char c; - - /* copy prefix */ - while((c=*s++)!=0) { - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } - - bufferPos+=writeFactorSuffix(factors, count, - s, code-range->start, indexes, NULL, NULL, buffer, bufferLength); - break; - } - default: - /* undefined type */ - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - break; - } - - return bufferPos; -} - -/* - * Important: enumAlgNames() and findAlgName() are almost the same. - * Any fix must be applied to both. 
- */ -static UBool -enumAlgNames(AlgorithmicRange *range, - UChar32 start, UChar32 limit, - UEnumCharNamesFn *fn, void *context, - UCharNameChoice nameChoice) { - char buffer[200]; - uint16_t length; - - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - return TRUE; - } - - switch(range->type) { - case 0: { - char *s, *end; - char c; - - /* get the full name of the start character */ - length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer)); - if(length<=0) { - return TRUE; - } - - /* call the enumerator function with this first character */ - if(!fn(context, start, nameChoice, buffer, length)) { - return FALSE; - } - - /* go to the end of the name; all these names have the same length */ - end=buffer; - while(*end!=0) { - ++end; - } - - /* enumerate the rest of the names */ - while(++start<limit) { - /* increment the hexadecimal number on a character-basis */ - s=end; - for (;;) { - c=*--s; - if(('0'<=c && c<'9') || ('A'<=c && c<'F')) { - *s=(char)(c+1); - break; - } else if(c=='9') { - *s='A'; - break; - } else if(c=='F') { - *s='0'; - } - } - - if(!fn(context, start, nameChoice, buffer, length)) { - return FALSE; - } - } - break; - } - case 1: { - uint16_t indexes[8]; - const char *elementBases[8], *elements[8]; - const uint16_t *factors=(const uint16_t *)(range+1); - uint16_t count=range->variant; - const char *s=(const char *)(factors+count); - char *suffix, *t; - uint16_t prefixLength, i, idx; - - char c; - - /* name = prefix factorized-elements */ - - /* copy prefix */ - suffix=buffer; - prefixLength=0; - while((c=*s++)!=0) { - *suffix++=c; - ++prefixLength; - } - - /* append the suffix of the start character */ - length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, - s, (uint32_t)start-range->start, - indexes, elementBases, elements, - suffix, (uint16_t)(sizeof(buffer)-prefixLength))); - - /* call the enumerator function with this first character */ - if(!fn(context, start, nameChoice, buffer, 
length)) { - return FALSE; - } - - /* enumerate the rest of the names */ - while(++start<limit) { - /* increment the indexes in lexical order bound by the factors */ - i=count; - for (;;) { - idx=(uint16_t)(indexes[--i]+1); - if(idx<factors[i]) { - /* skip one index and its element string */ - indexes[i]=idx; - s=elements[i]; - while(*s++!=0) { - } - elements[i]=s; - break; - } else { - /* reset this index to 0 and its element string to the first one */ - indexes[i]=0; - elements[i]=elementBases[i]; - } - } - - /* to make matters a little easier, just append all elements to the suffix */ - t=suffix; - length=prefixLength; - for(i=0; i<count; ++i) { - s=elements[i]; - while((c=*s++)!=0) { - *t++=c; - ++length; - } - } - /* zero-terminate */ - *t=0; - - if(!fn(context, start, nameChoice, buffer, length)) { - return FALSE; - } - } - break; - } - default: - /* undefined type */ - break; - } - - return TRUE; -} - -/* - * findAlgName() is almost the same as enumAlgNames() except that it - * returns the code point for a name if it fits into the range. - * It returns 0xffff otherwise. + WRITE_CHAR(buffer, bufferLength, length, '>'); + + return length; +} + +/* + * getGroup() does a binary search for the group that contains the + * Unicode code point "code". + * The return value is always a valid Group* that may contain "code" + * or else is the highest group before "code". + * If the lowest group is after "code", then that one is returned. 
+ */ +static const uint16_t * +getGroup(UCharNames *names, uint32_t code) { + const uint16_t *groups=GET_GROUPS(names); + uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), + start=0, + limit=*groups++, + number; + + /* binary search for the group of names that contains the one for code */ + while(start<limit-1) { + number=(uint16_t)((start+limit)/2); + if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) { + limit=number; + } else { + start=number; + } + } + + /* return this regardless of whether it is an exact match */ + return groups+start*GROUP_LENGTH; +} + +/* + * expandGroupLengths() reads a block of compressed lengths of 32 strings and + * expands them into offsets and lengths for each string. + * Lengths are stored with a variable-width encoding in consecutive nibbles: + * If a nibble<0xc, then it is the length itself (0=empty string). + * If a nibble>=0xc, then it forms a length value with the following nibble. + * Calculation see below. + * The offsets and lengths arrays must be at least 33 (one more) long because + * there is no check here at the end if the last nibble is still used. 
+ */ +static const uint8_t * +expandGroupLengths(const uint8_t *s, + uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { + /* read the lengths of the 32 strings in this group and get each string's offset */ + uint16_t i=0, offset=0, length=0; + uint8_t lengthByte; + + /* all 32 lengths must be read to get the offset of the first group string */ + while(i<LINES_PER_GROUP) { + lengthByte=*s++; + + /* read even nibble - MSBs of lengthByte */ + if(length>=12) { + /* double-nibble length spread across two bytes */ + length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); + lengthByte&=0xf; + } else if((lengthByte /* &0xf0 */)>=0xc0) { + /* double-nibble length spread across this one byte */ + length=(uint16_t)((lengthByte&0x3f)+12); + } else { + /* single-nibble length in MSBs */ + length=(uint16_t)(lengthByte>>4); + lengthByte&=0xf; + } + + *offsets++=offset; + *lengths++=length; + + offset+=length; + ++i; + + /* read odd nibble - LSBs of lengthByte */ + if((lengthByte&0xf0)==0) { + /* this nibble was not consumed for a double-nibble length above */ + length=lengthByte; + if(length<12) { + /* single-nibble length in LSBs */ + *offsets++=offset; + *lengths++=length; + + offset+=length; + ++i; + } + } else { + length=0; /* prevent double-nibble detection in the next iteration */ + } + } + + /* now, s is at the first group string */ + return s; +} + +static uint16_t +expandGroupName(UCharNames *names, const uint16_t *group, + uint16_t lineNumber, UCharNameChoice nameChoice, + char *buffer, uint16_t bufferLength) { + uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; + const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); + s=expandGroupLengths(s, offsets, lengths); + return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice, + buffer, bufferLength); +} + +static uint16_t +getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, + char *buffer, uint16_t bufferLength) { + 
const uint16_t *group=getGroup(names, code); + if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) { + return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice, + buffer, bufferLength); + } else { + /* group not found */ + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + return 0; + } +} + +/* + * enumGroupNames() enumerates all the names in a 32-group + * and either calls the enumerator function or finds a given input name. + */ +static UBool +enumGroupNames(UCharNames *names, const uint16_t *group, + UChar32 start, UChar32 end, + UEnumCharNamesFn *fn, void *context, + UCharNameChoice nameChoice) { + uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; + const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); + + s=expandGroupLengths(s, offsets, lengths); + if(fn!=DO_FIND_NAME) { + char buffer[200]; + uint16_t length; + + while(start<=end) { + length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer)); + if (!length && nameChoice == U_EXTENDED_CHAR_NAME) { + buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; + } + /* here, we assume that the buffer is large enough */ + if(length>0) { + if(!fn(context, start, nameChoice, buffer, length)) { + return FALSE; + } + } + ++start; + } + } else { + const char *otherName=((FindName *)context)->otherName; + while(start<=end) { + if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) { + ((FindName *)context)->code=start; + return FALSE; + } + ++start; + } + } + return TRUE; +} + +/* + * enumExtNames enumerate extended names. + * It only needs to do it if it is called with a real function and not + * with the dummy DO_FIND_NAME, because u_charFromName() does a check + * for extended names by itself. 
*/ -static UChar32 -findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) { - UChar32 code; - - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - return 0xffff; - } - - switch(range->type) { - case 0: { - /* name = prefix hex-digits */ - const char *s=(const char *)(range+1); - char c; - - uint16_t i, count; - - /* compare prefix */ - while((c=*s++)!=0) { - if((char)c!=*otherName++) { - return 0xffff; - } - } - - /* read hexadecimal code point value */ - count=range->variant; - code=0; - for(i=0; i<count; ++i) { - c=*otherName++; - if('0'<=c && c<='9') { - code=(code<<4)|(c-'0'); - } else if('A'<=c && c<='F') { - code=(code<<4)|(c-'A'+10); - } else { - return 0xffff; - } - } - - /* does it fit into the range? */ - if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) { - return code; - } - break; - } - case 1: { - char buffer[64]; - uint16_t indexes[8]; - const char *elementBases[8], *elements[8]; - const uint16_t *factors=(const uint16_t *)(range+1); - uint16_t count=range->variant; - const char *s=(const char *)(factors+count), *t; - UChar32 start, limit; - uint16_t i, idx; - - char c; - - /* name = prefix factorized-elements */ - - /* compare prefix */ - while((c=*s++)!=0) { - if((char)c!=*otherName++) { - return 0xffff; - } - } - - start=(UChar32)range->start; - limit=(UChar32)(range->end+1); - - /* initialize the suffix elements for enumeration; indexes should all be set to 0 */ - writeFactorSuffix(factors, count, s, 0, - indexes, elementBases, elements, buffer, sizeof(buffer)); - - /* compare the first suffix */ - if(0==uprv_strcmp(otherName, buffer)) { - return start; - } - - /* enumerate and compare the rest of the suffixes */ - while(++start<limit) { - /* increment the indexes in lexical order bound by the factors */ - i=count; - for (;;) { - idx=(uint16_t)(indexes[--i]+1); - if(idx<factors[i]) { - /* skip one index and its element string */ - indexes[i]=idx; - 
s=elements[i]; - while(*s++!=0) {} - elements[i]=s; - break; - } else { - /* reset this index to 0 and its element string to the first one */ - indexes[i]=0; - elements[i]=elementBases[i]; - } - } - - /* to make matters a little easier, just compare all elements of the suffix */ - t=otherName; - for(i=0; i<count; ++i) { - s=elements[i]; - while((c=*s++)!=0) { - if(c!=*t++) { - s=""; /* does not match */ - i=99; - } - } - } - if(i<99 && *t==0) { - return start; - } - } - break; - } - default: - /* undefined type */ - break; - } - - return 0xffff; -} - -/* sets of name characters, maximum name lengths ---------------------------- */ - -#define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f))) -#define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0) - -static int32_t -calcStringSetLength(uint32_t set[8], const char *s) { - int32_t length=0; - char c; - - while((c=*s++)!=0) { - SET_ADD(set, c); - ++length; - } - return length; -} - -static int32_t -calcAlgNameSetsLengths(int32_t maxNameLength) { - AlgorithmicRange *range; - uint32_t *p; - uint32_t rangeCount; - int32_t length; - - /* enumerate algorithmic ranges */ - p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); - rangeCount=*p; - range=(AlgorithmicRange *)(p+1); - while(rangeCount>0) { - switch(range->type) { - case 0: - /* name = prefix + (range->variant times) hex-digits */ - /* prefix */ - length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant; - if(length>maxNameLength) { - maxNameLength=length; - } - break; - case 1: { - /* name = prefix factorized-elements */ - const uint16_t *factors=(const uint16_t *)(range+1); - const char *s; - int32_t i, count=range->variant, factor, factorLength, maxFactorLength; - - /* prefix length */ - s=(const char *)(factors+count); - length=calcStringSetLength(gNameSet, s); - s+=length+1; /* start of factor suffixes */ - - /* get the set and maximum factor suffix length for each factor 
*/ - for(i=0; i<count; ++i) { - maxFactorLength=0; - for(factor=factors[i]; factor>0; --factor) { - factorLength=calcStringSetLength(gNameSet, s); - s+=factorLength+1; - if(factorLength>maxFactorLength) { - maxFactorLength=factorLength; - } - } - length+=maxFactorLength; - } - - if(length>maxNameLength) { - maxNameLength=length; - } - break; - } - default: - /* unknown type */ - break; - } - - range=(AlgorithmicRange *)((uint8_t *)range+range->size); - --rangeCount; - } - return maxNameLength; -} - -static int32_t -calcExtNameSetsLengths(int32_t maxNameLength) { - int32_t i, length; - - for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) { - /* - * for each category, count the length of the category name - * plus 9= - * 2 for <> - * 1 for - - * 6 for most hex digits per code point - */ - length=9+calcStringSetLength(gNameSet, charCatNames[i]); - if(length>maxNameLength) { - maxNameLength=length; - } - } - return maxNameLength; -} - -static int32_t -calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths, - uint32_t set[8], - const uint8_t **pLine, const uint8_t *lineLimit) { - const uint8_t *line=*pLine; - int32_t length=0, tokenLength; - uint16_t c, token; - - while(line!=lineLimit && (c=*line++)!=(uint8_t)';') { - if(c>=tokenCount) { - /* implicit letter */ - SET_ADD(set, c); - ++length; - } else { - token=tokens[c]; - if(token==(uint16_t)(-2)) { - /* this is a lead byte for a double-byte token */ - c=c<<8|*line++; - token=tokens[c]; - } - if(token==(uint16_t)(-1)) { - /* explicit letter */ - SET_ADD(set, c); - ++length; - } else { - /* count token word */ - if(tokenLengths!=NULL) { - /* use cached token length */ - tokenLength=tokenLengths[c]; - if(tokenLength==0) { - tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); - tokenLengths[c]=(int8_t)tokenLength; - } - } else { - tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); - } - length+=tokenLength; - } - } - } - - 
*pLine=line; - return length; -} - -static void -calcGroupNameSetsLengths(int32_t maxNameLength) { - uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; - - uint16_t *tokens=(uint16_t *)uCharNames+8; - uint16_t tokenCount=*tokens++; - uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; - - int8_t *tokenLengths; - - const uint16_t *group; - const uint8_t *s, *line, *lineLimit; - - int32_t groupCount, lineNumber, length; - - tokenLengths=(int8_t *)uprv_malloc(tokenCount); - if(tokenLengths!=NULL) { - uprv_memset(tokenLengths, 0, tokenCount); - } - - group=GET_GROUPS(uCharNames); - groupCount=*group++; - - /* enumerate all groups */ - while(groupCount>0) { - s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group); - s=expandGroupLengths(s, offsets, lengths); - - /* enumerate all lines in each group */ - for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) { - line=s+offsets[lineNumber]; - length=lengths[lineNumber]; - if(length==0) { - continue; - } - - lineLimit=line+length; - - /* read regular name */ - length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); - if(length>maxNameLength) { - maxNameLength=length; - } - if(line==lineLimit) { - continue; - } - - /* read Unicode 1.0 name */ - length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); - if(length>maxNameLength) { - maxNameLength=length; - } - if(line==lineLimit) { - continue; - } - - /* read ISO comment */ - /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/ - } - - group=NEXT_GROUP(group); - --groupCount; - } - - if(tokenLengths!=NULL) { - uprv_free(tokenLengths); - } - - /* set gMax... 
- name length last for threading */ - gMaxNameLength=maxNameLength; -} - -static UBool -calcNameSetsLengths(UErrorCode *pErrorCode) { - static const char extChars[]="0123456789ABCDEF<>-"; - int32_t i, maxNameLength; - - if(gMaxNameLength!=0) { - return TRUE; - } - - if(!isDataLoaded(pErrorCode)) { - return FALSE; - } - - /* set hex digits, used in various names, and <>-, used in extended names */ - for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) { - SET_ADD(gNameSet, extChars[i]); - } - - /* set sets and lengths from algorithmic names */ - maxNameLength=calcAlgNameSetsLengths(0); - - /* set sets and lengths from extended names */ - maxNameLength=calcExtNameSetsLengths(maxNameLength); - - /* set sets and lengths from group names, set global maximum values */ - calcGroupNameSetsLengths(maxNameLength); - - return TRUE; -} - -U_NAMESPACE_END - -/* public API --------------------------------------------------------------- */ - -U_NAMESPACE_USE - -U_CAPI int32_t U_EXPORT2 -u_charName(UChar32 code, UCharNameChoice nameChoice, - char *buffer, int32_t bufferLength, - UErrorCode *pErrorCode) { - AlgorithmicRange *algRange; - uint32_t *p; - uint32_t i; - int32_t length; - - /* check the argument values */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || - bufferLength<0 || (bufferLength>0 && buffer==NULL) - ) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { - return u_terminateChars(buffer, bufferLength, 0, pErrorCode); - } - - length=0; - - /* try algorithmic names first */ - p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); - i=*p; - algRange=(AlgorithmicRange *)(p+1); - while(i>0) { - if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { - length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); - break; - } - algRange=(AlgorithmicRange *)((uint8_t 
*)algRange+algRange->size); - --i; - } - - if(i==0) { - if (nameChoice == U_EXTENDED_CHAR_NAME) { - length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength); - if (!length) { - /* extended character name */ - length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength); - } - } else { - /* normal character name */ - length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); - } - } - - return u_terminateChars(buffer, bufferLength, length, pErrorCode); -} - -U_CAPI int32_t U_EXPORT2 -u_getISOComment(UChar32 /*c*/, - char *dest, int32_t destCapacity, - UErrorCode *pErrorCode) { - /* check the argument values */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - return u_terminateChars(dest, destCapacity, 0, pErrorCode); -} - -U_CAPI UChar32 U_EXPORT2 -u_charFromName(UCharNameChoice nameChoice, - const char *name, - UErrorCode *pErrorCode) { +static UBool +enumExtNames(UChar32 start, UChar32 end, + UEnumCharNamesFn *fn, void *context) +{ + if(fn!=DO_FIND_NAME) { + char buffer[200]; + uint16_t length; + + while(start<=end) { + buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; + /* here, we assume that the buffer is large enough */ + if(length>0) { + if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { + return FALSE; + } + } + ++start; + } + } + + return TRUE; +} + +static UBool +enumNames(UCharNames *names, + UChar32 start, UChar32 limit, + UEnumCharNamesFn *fn, void *context, + UCharNameChoice nameChoice) { + uint16_t startGroupMSB, endGroupMSB, groupCount; + const uint16_t *group, *groupLimit; + + startGroupMSB=(uint16_t)(start>>GROUP_SHIFT); + endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); + + /* find the group that contains start, or the highest before it */ + group=getGroup(names, start); + + 
if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) { + /* enumerate synthetic names between start and the group start */ + UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT); + if(extLimit>limit) { + extLimit=limit; + } + if(!enumExtNames(start, extLimit-1, fn, context)) { + return FALSE; + } + start=extLimit; + } + + if(startGroupMSB==endGroupMSB) { + if(startGroupMSB==group[GROUP_MSB]) { + /* if start and limit-1 are in the same group, then enumerate only in that one */ + return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice); + } + } else { + const uint16_t *groups=GET_GROUPS(names); + groupCount=*groups++; + groupLimit=groups+groupCount*GROUP_LENGTH; + + if(startGroupMSB==group[GROUP_MSB]) { + /* enumerate characters in the partial start group */ + if((start&GROUP_MASK)!=0) { + if(!enumGroupNames(names, group, + start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1, + fn, context, nameChoice)) { + return FALSE; + } + group=NEXT_GROUP(group); /* continue with the next group */ + } + } else if(startGroupMSB>group[GROUP_MSB]) { + /* make sure that we start enumerating with the first group after start */ + const uint16_t *nextGroup=NEXT_GROUP(group); + if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) { + UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; + if (end > limit) { + end = limit; + } + if (!enumExtNames(start, end - 1, fn, context)) { + return FALSE; + } + } + group=nextGroup; + } + + /* enumerate entire groups between the start- and end-groups */ + while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) { + const uint16_t *nextGroup; + start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT; + if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) { + return FALSE; + } + nextGroup=NEXT_GROUP(group); + if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { + 
UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; + if (end > limit) { + end = limit; + } + if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) { + return FALSE; + } + } + group=nextGroup; + } + + /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */ + if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) { + return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice); + } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) { + UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT; + if (next > start) { + start = next; + } + } else { + return TRUE; + } + } + + /* we have not found a group, which means everything is made of + extended names. */ + if (nameChoice == U_EXTENDED_CHAR_NAME) { + if (limit > UCHAR_MAX_VALUE + 1) { + limit = UCHAR_MAX_VALUE + 1; + } + return enumExtNames(start, limit - 1, fn, context); + } + + return TRUE; +} + +static uint16_t +writeFactorSuffix(const uint16_t *factors, uint16_t count, + const char *s, /* suffix elements */ + uint32_t code, + uint16_t indexes[8], /* output fields from here */ + const char *elementBases[8], const char *elements[8], + char *buffer, uint16_t bufferLength) { + uint16_t i, factor, bufferPos=0; + char c; + + /* write elements according to the factors */ + + /* + * the factorized elements are determined by modulo arithmetic + * with the factors of this algorithm + * + * note that for fewer operations, count is decremented here + */ + --count; + for(i=count; i>0; --i) { + factor=factors[i]; + indexes[i]=(uint16_t)(code%factor); + code/=factor; + } + /* + * we don't need to calculate the last modulus because start<=code<=end + * guarantees here that code<=factors[0] + */ + indexes[0]=(uint16_t)code; + + /* write each element */ + for(;;) { + if(elementBases!=NULL) { + *elementBases++=s; + } + + /* skip indexes[i] strings */ + factor=indexes[i]; + while(factor>0) { + while(*s++!=0) {} + --factor; + } + 
if(elements!=NULL) { + *elements++=s; + } + + /* write element */ + while((c=*s++)!=0) { + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } + + /* we do not need to perform the rest of this loop for i==count - break here */ + if(i>=count) { + break; + } + + /* skip the rest of the strings for this factors[i] */ + factor=(uint16_t)(factors[i]-indexes[i]-1); + while(factor>0) { + while(*s++!=0) {} + --factor; + } + + ++i; + } + + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + + return bufferPos; +} + +/* + * Important: + * Parts of findAlgName() are almost the same as some of getAlgName(). + * Fixes must be applied to both. + */ +static uint16_t +getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, + char *buffer, uint16_t bufferLength) { + uint16_t bufferPos=0; + + /* Only the normative character name can be algorithmic. */ + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + return 0; + } + + switch(range->type) { + case 0: { + /* name = prefix hex-digits */ + const char *s=(const char *)(range+1); + char c; + + uint16_t i, count; + + /* copy prefix */ + while((c=*s++)!=0) { + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } + + /* write hexadecimal code point value */ + count=range->variant; + + /* zero-terminate */ + if(count<bufferLength) { + buffer[count]=0; + } + + for(i=count; i>0;) { + if(--i<bufferLength) { + c=(char)(code&0xf); + if(c<10) { + c+='0'; + } else { + c+='A'-10; + } + buffer[i]=c; + } + code>>=4; + } + + bufferPos+=count; + break; + } + case 1: { + /* name = prefix factorized-elements */ + uint16_t indexes[8]; + const uint16_t *factors=(const uint16_t *)(range+1); + uint16_t count=range->variant; + const char *s=(const char *)(factors+count); + char c; + + /* copy prefix */ + while((c=*s++)!=0) { + WRITE_CHAR(buffer, bufferLength, bufferPos, c); + } + + bufferPos+=writeFactorSuffix(factors, count, + s, 
code-range->start, indexes, NULL, NULL, buffer, bufferLength); + break; + } + default: + /* undefined type */ + /* zero-terminate */ + if(bufferLength>0) { + *buffer=0; + } + break; + } + + return bufferPos; +} + +/* + * Important: enumAlgNames() and findAlgName() are almost the same. + * Any fix must be applied to both. + */ +static UBool +enumAlgNames(AlgorithmicRange *range, + UChar32 start, UChar32 limit, + UEnumCharNamesFn *fn, void *context, + UCharNameChoice nameChoice) { + char buffer[200]; + uint16_t length; + + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + return TRUE; + } + + switch(range->type) { + case 0: { + char *s, *end; + char c; + + /* get the full name of the start character */ + length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer)); + if(length<=0) { + return TRUE; + } + + /* call the enumerator function with this first character */ + if(!fn(context, start, nameChoice, buffer, length)) { + return FALSE; + } + + /* go to the end of the name; all these names have the same length */ + end=buffer; + while(*end!=0) { + ++end; + } + + /* enumerate the rest of the names */ + while(++start<limit) { + /* increment the hexadecimal number on a character-basis */ + s=end; + for (;;) { + c=*--s; + if(('0'<=c && c<'9') || ('A'<=c && c<'F')) { + *s=(char)(c+1); + break; + } else if(c=='9') { + *s='A'; + break; + } else if(c=='F') { + *s='0'; + } + } + + if(!fn(context, start, nameChoice, buffer, length)) { + return FALSE; + } + } + break; + } + case 1: { + uint16_t indexes[8]; + const char *elementBases[8], *elements[8]; + const uint16_t *factors=(const uint16_t *)(range+1); + uint16_t count=range->variant; + const char *s=(const char *)(factors+count); + char *suffix, *t; + uint16_t prefixLength, i, idx; + + char c; + + /* name = prefix factorized-elements */ + + /* copy prefix */ + suffix=buffer; + prefixLength=0; + while((c=*s++)!=0) { + *suffix++=c; + ++prefixLength; + } + + /* append the suffix of the 
start character */ + length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, + s, (uint32_t)start-range->start, + indexes, elementBases, elements, + suffix, (uint16_t)(sizeof(buffer)-prefixLength))); + + /* call the enumerator function with this first character */ + if(!fn(context, start, nameChoice, buffer, length)) { + return FALSE; + } + + /* enumerate the rest of the names */ + while(++start<limit) { + /* increment the indexes in lexical order bound by the factors */ + i=count; + for (;;) { + idx=(uint16_t)(indexes[--i]+1); + if(idx<factors[i]) { + /* skip one index and its element string */ + indexes[i]=idx; + s=elements[i]; + while(*s++!=0) { + } + elements[i]=s; + break; + } else { + /* reset this index to 0 and its element string to the first one */ + indexes[i]=0; + elements[i]=elementBases[i]; + } + } + + /* to make matters a little easier, just append all elements to the suffix */ + t=suffix; + length=prefixLength; + for(i=0; i<count; ++i) { + s=elements[i]; + while((c=*s++)!=0) { + *t++=c; + ++length; + } + } + /* zero-terminate */ + *t=0; + + if(!fn(context, start, nameChoice, buffer, length)) { + return FALSE; + } + } + break; + } + default: + /* undefined type */ + break; + } + + return TRUE; +} + +/* + * findAlgName() is almost the same as enumAlgNames() except that it + * returns the code point for a name if it fits into the range. + * It returns 0xffff otherwise. 
+ */ +static UChar32 +findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) { + UChar32 code; + + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { + return 0xffff; + } + + switch(range->type) { + case 0: { + /* name = prefix hex-digits */ + const char *s=(const char *)(range+1); + char c; + + uint16_t i, count; + + /* compare prefix */ + while((c=*s++)!=0) { + if((char)c!=*otherName++) { + return 0xffff; + } + } + + /* read hexadecimal code point value */ + count=range->variant; + code=0; + for(i=0; i<count; ++i) { + c=*otherName++; + if('0'<=c && c<='9') { + code=(code<<4)|(c-'0'); + } else if('A'<=c && c<='F') { + code=(code<<4)|(c-'A'+10); + } else { + return 0xffff; + } + } + + /* does it fit into the range? */ + if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) { + return code; + } + break; + } + case 1: { + char buffer[64]; + uint16_t indexes[8]; + const char *elementBases[8], *elements[8]; + const uint16_t *factors=(const uint16_t *)(range+1); + uint16_t count=range->variant; + const char *s=(const char *)(factors+count), *t; + UChar32 start, limit; + uint16_t i, idx; + + char c; + + /* name = prefix factorized-elements */ + + /* compare prefix */ + while((c=*s++)!=0) { + if((char)c!=*otherName++) { + return 0xffff; + } + } + + start=(UChar32)range->start; + limit=(UChar32)(range->end+1); + + /* initialize the suffix elements for enumeration; indexes should all be set to 0 */ + writeFactorSuffix(factors, count, s, 0, + indexes, elementBases, elements, buffer, sizeof(buffer)); + + /* compare the first suffix */ + if(0==uprv_strcmp(otherName, buffer)) { + return start; + } + + /* enumerate and compare the rest of the suffixes */ + while(++start<limit) { + /* increment the indexes in lexical order bound by the factors */ + i=count; + for (;;) { + idx=(uint16_t)(indexes[--i]+1); + if(idx<factors[i]) { + /* skip one index and its element string */ + indexes[i]=idx; + 
s=elements[i]; + while(*s++!=0) {} + elements[i]=s; + break; + } else { + /* reset this index to 0 and its element string to the first one */ + indexes[i]=0; + elements[i]=elementBases[i]; + } + } + + /* to make matters a little easier, just compare all elements of the suffix */ + t=otherName; + for(i=0; i<count; ++i) { + s=elements[i]; + while((c=*s++)!=0) { + if(c!=*t++) { + s=""; /* does not match */ + i=99; + } + } + } + if(i<99 && *t==0) { + return start; + } + } + break; + } + default: + /* undefined type */ + break; + } + + return 0xffff; +} + +/* sets of name characters, maximum name lengths ---------------------------- */ + +#define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f))) +#define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0) + +static int32_t +calcStringSetLength(uint32_t set[8], const char *s) { + int32_t length=0; + char c; + + while((c=*s++)!=0) { + SET_ADD(set, c); + ++length; + } + return length; +} + +static int32_t +calcAlgNameSetsLengths(int32_t maxNameLength) { + AlgorithmicRange *range; + uint32_t *p; + uint32_t rangeCount; + int32_t length; + + /* enumerate algorithmic ranges */ + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); + rangeCount=*p; + range=(AlgorithmicRange *)(p+1); + while(rangeCount>0) { + switch(range->type) { + case 0: + /* name = prefix + (range->variant times) hex-digits */ + /* prefix */ + length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant; + if(length>maxNameLength) { + maxNameLength=length; + } + break; + case 1: { + /* name = prefix factorized-elements */ + const uint16_t *factors=(const uint16_t *)(range+1); + const char *s; + int32_t i, count=range->variant, factor, factorLength, maxFactorLength; + + /* prefix length */ + s=(const char *)(factors+count); + length=calcStringSetLength(gNameSet, s); + s+=length+1; /* start of factor suffixes */ + + /* get the set and maximum factor suffix length for each factor 
*/ + for(i=0; i<count; ++i) { + maxFactorLength=0; + for(factor=factors[i]; factor>0; --factor) { + factorLength=calcStringSetLength(gNameSet, s); + s+=factorLength+1; + if(factorLength>maxFactorLength) { + maxFactorLength=factorLength; + } + } + length+=maxFactorLength; + } + + if(length>maxNameLength) { + maxNameLength=length; + } + break; + } + default: + /* unknown type */ + break; + } + + range=(AlgorithmicRange *)((uint8_t *)range+range->size); + --rangeCount; + } + return maxNameLength; +} + +static int32_t +calcExtNameSetsLengths(int32_t maxNameLength) { + int32_t i, length; + + for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) { + /* + * for each category, count the length of the category name + * plus 9= + * 2 for <> + * 1 for - + * 6 for most hex digits per code point + */ + length=9+calcStringSetLength(gNameSet, charCatNames[i]); + if(length>maxNameLength) { + maxNameLength=length; + } + } + return maxNameLength; +} + +static int32_t +calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths, + uint32_t set[8], + const uint8_t **pLine, const uint8_t *lineLimit) { + const uint8_t *line=*pLine; + int32_t length=0, tokenLength; + uint16_t c, token; + + while(line!=lineLimit && (c=*line++)!=(uint8_t)';') { + if(c>=tokenCount) { + /* implicit letter */ + SET_ADD(set, c); + ++length; + } else { + token=tokens[c]; + if(token==(uint16_t)(-2)) { + /* this is a lead byte for a double-byte token */ + c=c<<8|*line++; + token=tokens[c]; + } + if(token==(uint16_t)(-1)) { + /* explicit letter */ + SET_ADD(set, c); + ++length; + } else { + /* count token word */ + if(tokenLengths!=NULL) { + /* use cached token length */ + tokenLength=tokenLengths[c]; + if(tokenLength==0) { + tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); + tokenLengths[c]=(int8_t)tokenLength; + } + } else { + tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); + } + length+=tokenLength; + } + } + } + + 
*pLine=line; + return length; +} + +static void +calcGroupNameSetsLengths(int32_t maxNameLength) { + uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; + + uint16_t *tokens=(uint16_t *)uCharNames+8; + uint16_t tokenCount=*tokens++; + uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; + + int8_t *tokenLengths; + + const uint16_t *group; + const uint8_t *s, *line, *lineLimit; + + int32_t groupCount, lineNumber, length; + + tokenLengths=(int8_t *)uprv_malloc(tokenCount); + if(tokenLengths!=NULL) { + uprv_memset(tokenLengths, 0, tokenCount); + } + + group=GET_GROUPS(uCharNames); + groupCount=*group++; + + /* enumerate all groups */ + while(groupCount>0) { + s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group); + s=expandGroupLengths(s, offsets, lengths); + + /* enumerate all lines in each group */ + for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) { + line=s+offsets[lineNumber]; + length=lengths[lineNumber]; + if(length==0) { + continue; + } + + lineLimit=line+length; + + /* read regular name */ + length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); + if(length>maxNameLength) { + maxNameLength=length; + } + if(line==lineLimit) { + continue; + } + + /* read Unicode 1.0 name */ + length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); + if(length>maxNameLength) { + maxNameLength=length; + } + if(line==lineLimit) { + continue; + } + + /* read ISO comment */ + /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/ + } + + group=NEXT_GROUP(group); + --groupCount; + } + + if(tokenLengths!=NULL) { + uprv_free(tokenLengths); + } + + /* set gMax... 
- name length last for threading */ + gMaxNameLength=maxNameLength; +} + +static UBool +calcNameSetsLengths(UErrorCode *pErrorCode) { + static const char extChars[]="0123456789ABCDEF<>-"; + int32_t i, maxNameLength; + + if(gMaxNameLength!=0) { + return TRUE; + } + + if(!isDataLoaded(pErrorCode)) { + return FALSE; + } + + /* set hex digits, used in various names, and <>-, used in extended names */ + for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) { + SET_ADD(gNameSet, extChars[i]); + } + + /* set sets and lengths from algorithmic names */ + maxNameLength=calcAlgNameSetsLengths(0); + + /* set sets and lengths from extended names */ + maxNameLength=calcExtNameSetsLengths(maxNameLength); + + /* set sets and lengths from group names, set global maximum values */ + calcGroupNameSetsLengths(maxNameLength); + + return TRUE; +} + +U_NAMESPACE_END + +/* public API --------------------------------------------------------------- */ + +U_NAMESPACE_USE + +U_CAPI int32_t U_EXPORT2 +u_charName(UChar32 code, UCharNameChoice nameChoice, + char *buffer, int32_t bufferLength, + UErrorCode *pErrorCode) { + AlgorithmicRange *algRange; + uint32_t *p; + uint32_t i; + int32_t length; + + /* check the argument values */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || + bufferLength<0 || (bufferLength>0 && buffer==NULL) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { + return u_terminateChars(buffer, bufferLength, 0, pErrorCode); + } + + length=0; + + /* try algorithmic names first */ + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); + i=*p; + algRange=(AlgorithmicRange *)(p+1); + while(i>0) { + if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { + length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); + break; + } + algRange=(AlgorithmicRange *)((uint8_t 
*)algRange+algRange->size); + --i; + } + + if(i==0) { + if (nameChoice == U_EXTENDED_CHAR_NAME) { + length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength); + if (!length) { + /* extended character name */ + length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength); + } + } else { + /* normal character name */ + length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); + } + } + + return u_terminateChars(buffer, bufferLength, length, pErrorCode); +} + +U_CAPI int32_t U_EXPORT2 +u_getISOComment(UChar32 /*c*/, + char *dest, int32_t destCapacity, + UErrorCode *pErrorCode) { + /* check the argument values */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + return u_terminateChars(dest, destCapacity, 0, pErrorCode); +} + +U_CAPI UChar32 U_EXPORT2 +u_charFromName(UCharNameChoice nameChoice, + const char *name, + UErrorCode *pErrorCode) { char upper[120] = {0}; char lower[120] = {0}; - FindName findName; - AlgorithmicRange *algRange; - uint32_t *p; - uint32_t i; - UChar32 cp = 0; - char c0; + FindName findName; + AlgorithmicRange *algRange; + uint32_t *p; + uint32_t i; + UChar32 cp = 0; + char c0; static constexpr UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. 
*/ - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return error; - } - - if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return error; - } - - if(!isDataLoaded(pErrorCode)) { - return error; - } - - /* construct the uppercase and lowercase of the name first */ - for(i=0; i<sizeof(upper); ++i) { - if((c0=*name++)!=0) { - upper[i]=uprv_toupper(c0); - lower[i]=uprv_tolower(c0); - } else { - upper[i]=lower[i]=0; - break; - } - } - if(i==sizeof(upper)) { - /* name too long, there is no such character */ - *pErrorCode = U_ILLEGAL_CHAR_FOUND; - return error; - } - // i==strlen(name)==strlen(lower)==strlen(upper) - - /* try extended names first */ - if (lower[0] == '<') { + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return error; + } + + if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return error; + } + + if(!isDataLoaded(pErrorCode)) { + return error; + } + + /* construct the uppercase and lowercase of the name first */ + for(i=0; i<sizeof(upper); ++i) { + if((c0=*name++)!=0) { + upper[i]=uprv_toupper(c0); + lower[i]=uprv_tolower(c0); + } else { + upper[i]=lower[i]=0; + break; + } + } + if(i==sizeof(upper)) { + /* name too long, there is no such character */ + *pErrorCode = U_ILLEGAL_CHAR_FOUND; + return error; + } + // i==strlen(name)==strlen(lower)==strlen(upper) + + /* try extended names first */ + if (lower[0] == '<') { if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') { - // Parse a string like "<category-HHHH>" where HHHH is a hex code point. + // Parse a string like "<category-HHHH>" where HHHH is a hex code point. uint32_t limit = i; while (i >= 3 && lower[--i] != '-') {} - + // There should be 1 to 8 hex digits. 
int32_t hexLength = limit - (i + 1); if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) { uint32_t cIdx; - + lower[i] = 0; - + for (++i; i < limit; ++i) { if (lower[i] >= '0' && lower[i] <= '9') { cp = (cp << 4) + lower[i] - '0'; @@ -1581,528 +1581,528 @@ u_charFromName(UCharNameChoice nameChoice, } else { *pErrorCode = U_ILLEGAL_CHAR_FOUND; return error; - } + } // Prevent signed-integer overflow and out-of-range code points. if (cp > UCHAR_MAX_VALUE) { *pErrorCode = U_ILLEGAL_CHAR_FOUND; return error; } } - + /* Now validate the category name. We could use a binary search, or a trie, if we really wanted to. */ uint8_t cat = getCharCat(cp); for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) { - + if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { if (cat == cIdx) { return cp; - } + } break; - } - } - } - } - - *pErrorCode = U_ILLEGAL_CHAR_FOUND; - return error; - } - - /* try algorithmic names now */ - p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); - i=*p; - algRange=(AlgorithmicRange *)(p+1); - while(i>0) { - if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { - return cp; - } - algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); - --i; - } - - /* normal character name */ - findName.otherName=upper; - findName.code=error; - enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice); - if (findName.code == error) { - *pErrorCode = U_ILLEGAL_CHAR_FOUND; - } - return findName.code; -} - -U_CAPI void U_EXPORT2 -u_enumCharNames(UChar32 start, UChar32 limit, - UEnumCharNamesFn *fn, - void *context, - UCharNameChoice nameChoice, - UErrorCode *pErrorCode) { - AlgorithmicRange *algRange; - uint32_t *p; - uint32_t i; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return; - } - - if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { - limit = UCHAR_MAX_VALUE + 
1; - } - if((uint32_t)start>=(uint32_t)limit) { - return; - } - - if(!isDataLoaded(pErrorCode)) { - return; - } - - /* interleave the data-driven ones with the algorithmic ones */ - /* iterate over all algorithmic ranges; assume that they are in ascending order */ - p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); - i=*p; - algRange=(AlgorithmicRange *)(p+1); - while(i>0) { - /* enumerate the character names before the current algorithmic range */ - /* here: start<limit */ - if((uint32_t)start<algRange->start) { - if((uint32_t)limit<=algRange->start) { - enumNames(uCharNames, start, limit, fn, context, nameChoice); - return; - } - if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { - return; - } - start=(UChar32)algRange->start; - } - /* enumerate the character names in the current algorithmic range */ - /* here: algRange->start<=start<limit */ - if((uint32_t)start<=algRange->end) { - if((uint32_t)limit<=(algRange->end+1)) { - enumAlgNames(algRange, start, limit, fn, context, nameChoice); - return; - } - if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { - return; - } - start=(UChar32)algRange->end+1; - } - /* continue to the next algorithmic range (here: start<limit) */ - algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); - --i; - } - /* enumerate the character names after the last algorithmic range */ - enumNames(uCharNames, start, limit, fn, context, nameChoice); -} - -U_CAPI int32_t U_EXPORT2 -uprv_getMaxCharNameLength() { - UErrorCode errorCode=U_ZERO_ERROR; - if(calcNameSetsLengths(&errorCode)) { - return gMaxNameLength; - } else { - return 0; - } -} - -/** - * Converts the char set cset into a Unicode set uset. - * @param cset Set of 256 bit flags corresponding to a set of chars. - * @param uset USet to receive characters. Existing contents are deleted. 
- */ -static void -charSetToUSet(uint32_t cset[8], const USetAdder *sa) { - UChar us[256]; - char cs[256]; - - int32_t i, length; - UErrorCode errorCode; - - errorCode=U_ZERO_ERROR; - - if(!calcNameSetsLengths(&errorCode)) { - return; - } - - /* build a char string with all chars that are used in character names */ - length=0; - for(i=0; i<256; ++i) { - if(SET_CONTAINS(cset, i)) { - cs[length++]=(char)i; - } - } - - /* convert the char string to a UChar string */ - u_charsToUChars(cs, us, length); - - /* add each UChar to the USet */ - for(i=0; i<length; ++i) { - if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */ - sa->add(sa->set, us[i]); - } - } -} - -/** - * Fills set with characters that are used in Unicode character names. - * @param set USet to receive characters. - */ -U_CAPI void U_EXPORT2 -uprv_getCharNameCharacters(const USetAdder *sa) { - charSetToUSet(gNameSet, sa); -} - -/* data swapping ------------------------------------------------------------ */ - -/* - * The token table contains non-negative entries for token bytes, - * and -1 for bytes that represent themselves in the data file's charset. - * -2 entries are used for lead bytes. - * - * Direct bytes (-1 entries) must be translated from the input charset family - * to the output charset family. - * makeTokenMap() writes a permutation mapping for this. - * Use it once for single-/lead-byte tokens and once more for all trail byte - * tokens. (';' is an unused trail byte marked with -1.) 
- */ -static void -makeTokenMap(const UDataSwapper *ds, - int16_t tokens[], uint16_t tokenCount, - uint8_t map[256], - UErrorCode *pErrorCode) { - UBool usedOutChar[256]; - uint16_t i, j; - uint8_t c1, c2; - - if(U_FAILURE(*pErrorCode)) { - return; - } - - if(ds->inCharset==ds->outCharset) { - /* Same charset family: identity permutation */ - for(i=0; i<256; ++i) { - map[i]=(uint8_t)i; - } - } else { - uprv_memset(map, 0, 256); - uprv_memset(usedOutChar, 0, 256); - - if(tokenCount>256) { - tokenCount=256; - } - - /* set the direct bytes (byte 0 always maps to itself) */ - for(i=1; i<tokenCount; ++i) { - if(tokens[i]==-1) { - /* convert the direct byte character */ - c1=(uint8_t)i; - ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n", - i, ds->inCharset); - return; - } - - /* enter the converted character into the map and mark it used */ - map[c1]=c2; - usedOutChar[c2]=TRUE; - } - } - - /* set the mappings for the rest of the permutation */ - for(i=j=1; i<tokenCount; ++i) { - /* set mappings that were not set for direct bytes */ - if(map[i]==0) { - /* set an output byte value that was not used as an output byte above */ - while(usedOutChar[j]) { - ++j; - } - map[i]=(uint8_t)j++; - } - } - - /* - * leave mappings at tokenCount and above unset if tokenCount<256 - * because they won't be used - */ - } -} - -U_CAPI int32_t U_EXPORT2 -uchar_swapNames(const UDataSwapper *ds, - const void *inData, int32_t length, void *outData, - UErrorCode *pErrorCode) { - const UDataInfo *pInfo; - int32_t headerSize; - - const uint8_t *inBytes; - uint8_t *outBytes; - - uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset, - offset, i, count, stringsCount; - - const AlgorithmicRange *inRange; - AlgorithmicRange *outRange; - - /* udata_swapDataHeader checks the arguments */ - headerSize=udata_swapDataHeader(ds, inData, length, 
outData, pErrorCode); - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } - - /* check data format and format version */ - pInfo=(const UDataInfo *)((const char *)inData+4); - if(!( - pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ - pInfo->dataFormat[1]==0x6e && - pInfo->dataFormat[2]==0x61 && - pInfo->dataFormat[3]==0x6d && - pInfo->formatVersion[0]==1 - )) { - udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n", - pInfo->dataFormat[0], pInfo->dataFormat[1], - pInfo->dataFormat[2], pInfo->dataFormat[3], - pInfo->formatVersion[0]); - *pErrorCode=U_UNSUPPORTED_ERROR; - return 0; - } - - inBytes=(const uint8_t *)inData+headerSize; - outBytes=(uint8_t *)outData+headerSize; - if(length<0) { - algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); - } else { - length-=headerSize; - if( length<20 || - (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3])) - ) { - udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n", - length); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - } - - if(length<0) { - /* preflighting: iterate through algorithmic ranges */ - offset=algNamesOffset; - count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); - offset+=4; - - for(i=0; i<count; ++i) { - inRange=(const AlgorithmicRange *)(inBytes+offset); - offset+=ds->readUInt16(inRange->size); - } - } else { - /* swap data */ - const uint16_t *p; - uint16_t *q, *temp; - - int16_t tokens[512]; - uint16_t tokenCount; - - uint8_t map[256], trailMap[256]; - - /* copy the data for inaccessible bytes */ - if(inBytes!=outBytes) { - uprv_memcpy(outBytes, inBytes, length); - } - - /* the initial 4 offsets first */ - tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]); - groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]); - groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]); - 
ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode); - - /* - * now the tokens table - * it needs to be permutated along with the compressed name strings - */ - p=(const uint16_t *)(inBytes+16); - q=(uint16_t *)(outBytes+16); - - /* read and swap the tokenCount */ - tokenCount=ds->readUInt16(*p); - ds->swapArray16(ds, p, 2, q, pErrorCode); - ++p; - ++q; - - /* read the first 512 tokens and make the token maps */ - if(tokenCount<=512) { - count=tokenCount; - } else { - count=512; - } - for(i=0; i<count; ++i) { - tokens[i]=udata_readInt16(ds, p[i]); - } - for(; i<512; ++i) { - tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */ - } - makeTokenMap(ds, tokens, tokenCount, map, pErrorCode); - makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return 0; - } - - /* - * swap and permutate the tokens - * go through a temporary array to support in-place swapping - */ - temp=(uint16_t *)uprv_malloc(tokenCount*2); - if(temp==NULL) { - udata_printError(ds, "out of memory swapping %u unames.icu tokens\n", - tokenCount); - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - return 0; - } - - /* swap and permutate single-/lead-byte tokens */ - for(i=0; i<tokenCount && i<256; ++i) { - ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode); - } - - /* swap and permutate trail-byte tokens */ - for(; i<tokenCount; ++i) { - ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode); - } - - /* copy the result into the output and free the temporary array */ - uprv_memcpy(q, temp, tokenCount*2); - uprv_free(temp); - - /* - * swap the token strings but not a possible padding byte after - * the terminating NUL of the last string - */ - udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset), - outBytes+tokenStringOffset, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - udata_printError(ds, "uchar_swapNames(token strings) failed\n"); - return 
0; - } - - /* swap the group table */ - count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset))); - ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), - outBytes+groupsOffset, pErrorCode); - - /* - * swap the group strings - * swap the string bytes but not the nibble-encoded string lengths - */ - if(ds->inCharset!=ds->outCharset) { - uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1]; - - const uint8_t *inStrings, *nextInStrings; - uint8_t *outStrings; - - uint8_t c; - - inStrings=inBytes+groupStringOffset; - outStrings=outBytes+groupStringOffset; - - stringsCount=algNamesOffset-groupStringOffset; - - /* iterate through string groups until only a few padding bytes are left */ - while(stringsCount>32) { - nextInStrings=expandGroupLengths(inStrings, offsets, lengths); - - /* move past the length bytes */ - stringsCount-=(uint32_t)(nextInStrings-inStrings); - outStrings+=nextInStrings-inStrings; - inStrings=nextInStrings; - - count=offsets[31]+lengths[31]; /* total number of string bytes in this group */ - stringsCount-=count; - - /* swap the string bytes using map[] and trailMap[] */ - while(count>0) { - c=*inStrings++; - *outStrings++=map[c]; - if(tokens[c]!=-2) { - --count; - } else { - /* token lead byte: swap the trail byte, too */ - *outStrings++=trailMap[*inStrings++]; - count-=2; - } - } - } - } - - /* swap the algorithmic ranges */ - offset=algNamesOffset; - count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); - ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode); - offset+=4; - - for(i=0; i<count; ++i) { - if(offset>(uint32_t)length) { - udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n", - length, i); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - - inRange=(const AlgorithmicRange *)(inBytes+offset); - outRange=(AlgorithmicRange *)(outBytes+offset); - offset+=ds->readUInt16(inRange->size); - - ds->swapArray32(ds, inRange, 
8, outRange, pErrorCode); - ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode); - switch(inRange->type) { - case 0: - /* swap prefix string */ - ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)), - outRange+1, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n", - i); - return 0; - } - break; - case 1: - { - /* swap factors and the prefix and factor strings */ - uint32_t factorsCount; - - factorsCount=inRange->variant; - p=(const uint16_t *)(inRange+1); - q=(uint16_t *)(outRange+1); - ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode); - - /* swap the strings, up to the last terminating NUL */ - p+=factorsCount; - q+=factorsCount; - stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p); - while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) { - --stringsCount; - } - ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode); - } - break; - default: - udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n", - inRange->type, i); - *pErrorCode=U_UNSUPPORTED_ERROR; - return 0; - } - } - } - - return headerSize+(int32_t)offset; -} - -/* - * Hey, Emacs, please set the following: - * - * Local Variables: - * indent-tabs-mode: nil - * End: - * - */ + } + } + } + } + + *pErrorCode = U_ILLEGAL_CHAR_FOUND; + return error; + } + + /* try algorithmic names now */ + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); + i=*p; + algRange=(AlgorithmicRange *)(p+1); + while(i>0) { + if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { + return cp; + } + algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); + --i; + } + + /* normal character name */ + findName.otherName=upper; + findName.code=error; + enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice); + if (findName.code == error) { + *pErrorCode = U_ILLEGAL_CHAR_FOUND; + } + 
return findName.code;
+}
+/**
+ * Enumerates character names for the code points in [start, limit).
+ *
+ * Names from the data-driven table (via enumNames()) are interleaved with
+ * algorithmically generated names (via enumAlgNames()). The algorithmic
+ * ranges are read from uCharNames at algNamesOffset and are assumed to be
+ * stored in ascending order.
+ *
+ * @param start      First code point to enumerate.
+ * @param limit      One past the last code point; values above
+ *                   UCHAR_MAX_VALUE+1 are clamped to UCHAR_MAX_VALUE+1.
+ *                   Nothing is enumerated if start>=limit (compared unsigned).
+ * @param fn         Callback forwarded to enumNames()/enumAlgNames();
+ *                   must not be NULL.
+ * @param context    Opaque pointer forwarded unchanged to the helpers.
+ * @param nameChoice Must be below U_CHAR_NAME_CHOICE_COUNT.
+ * @param pErrorCode In/out error code. If it is NULL or already indicates a
+ *                   failure the function returns immediately. It is set to
+ *                   U_ILLEGAL_ARGUMENT_ERROR for a bad nameChoice or NULL fn.
+ *
+ * The function also returns without enumerating when isDataLoaded() reports
+ * false (presumably with *pErrorCode set by that helper; not visible here),
+ * and stops early as soon as enumNames() or enumAlgNames() returns false
+ * (presumably because the callback asked to stop; TODO confirm in helpers).
+ */
+U_CAPI void U_EXPORT2
+u_enumCharNames(UChar32 start, UChar32 limit,
+                UEnumCharNamesFn *fn,
+                void *context,
+                UCharNameChoice nameChoice,
+                UErrorCode *pErrorCode) {
+    AlgorithmicRange *algRange;
+    uint32_t *p;
+    uint32_t i;
+
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return;
+    }
+
+    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { /* unsigned compare: a negative limit is clamped as well */
+        limit = UCHAR_MAX_VALUE + 1;
+    }
+    if((uint32_t)start>=(uint32_t)limit) {
+        return;
+    }
+
+    if(!isDataLoaded(pErrorCode)) {
+        return;
+    }
+
+    /* interleave the data-driven ones with the algorithmic ones */
+    /* iterate over all algorithmic ranges; assume that they are in ascending order */
+    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
+    i=*p; /* first 32-bit word at algNamesOffset is used as the number of ranges */
+    algRange=(AlgorithmicRange *)(p+1); /* the first range record follows that count */
+    while(i>0) {
+        /* enumerate the character names before the current algorithmic range */
+        /* here: start<limit */
+        if((uint32_t)start<algRange->start) {
+            if((uint32_t)limit<=algRange->start) {
+                enumNames(uCharNames, start, limit, fn, context, nameChoice);
+                return;
+            }
+            if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
+                return;
+            }
+            start=(UChar32)algRange->start;
+        }
+        /* enumerate the character names in the current algorithmic range */
+        /* here: algRange->start<=start<limit */
+        if((uint32_t)start<=algRange->end) { /* algRange->end is inclusive, hence the +1 below */
+            if((uint32_t)limit<=(algRange->end+1)) {
+                enumAlgNames(algRange, start, limit, fn, context, nameChoice);
+                return;
+            }
+            if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
+                return;
+            }
+            start=(UChar32)algRange->end+1;
+        }
+        /* continue to the next algorithmic range (here: start<limit) */
+        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); /* records are variable-sized; size is a byte count */
+        --i;
+    }
+    /* enumerate the character names after the last algorithmic range */
+    enumNames(uCharNames, start, limit, fn, context, nameChoice);
+}
+/**
+ * Returns gMaxNameLength if calcNameSetsLengths() succeeds, otherwise 0.
+ * The local error code passed to calcNameSetsLengths() is not reported
+ * to the caller.
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_getMaxCharNameLength() {
+    UErrorCode errorCode=U_ZERO_ERROR;
+    if(calcNameSetsLengths(&errorCode)) {
+        return gMaxNameLength;
+    } else {
+        return 0;
+    }
+}
+
+/**
+ * Adds the characters flagged in the char set cset to a set, through the
+ * USetAdder sa.
+ *
+ * The flagged byte values are collected into a char string, converted with
+ * u_charsToUChars(), and each resulting UChar is passed to
+ * sa->add(sa->set, ...). A result of (UChar)0 is skipped unless the source
+ * char itself was 0. This function only adds; it does not clear the set.
+ * Nothing is added if calcNameSetsLengths() fails.
+ *
+ * @param cset Set of 256 bit flags (8 x uint32_t), tested with SET_CONTAINS()
+ *             for the byte values 0..255.
+ * @param sa   USetAdder that receives the characters.
+ */
+static void
+charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
+    UChar us[256];
+    char cs[256];
+
+    int32_t i, length;
+    UErrorCode errorCode;
+
+    errorCode=U_ZERO_ERROR;
+
+    if(!calcNameSetsLengths(&errorCode)) {
+        return;
+    }
+
+    /* build a char string with all chars that are used in character names */
+    length=0;
+    for(i=0; i<256; ++i) {
+        if(SET_CONTAINS(cset, i)) {
+            cs[length++]=(char)i;
+        }
+    }
+
+    /* convert the char string to a UChar string */
+    u_charsToUChars(cs, us, length);
+
+    /* add each UChar to the USet */
+    for(i=0; i<length; ++i) {
+        if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
+            sa->add(sa->set, us[i]);
+        }
+    }
+}
+
+/**
+ * Adds the characters that are used in Unicode character names, i.e. the
+ * ones flagged in gNameSet, to a set by calling charSetToUSet().
+ * @param sa USetAdder that receives the characters.
+ */
+U_CAPI void U_EXPORT2
+uprv_getCharNameCharacters(const USetAdder *sa) {
+    charSetToUSet(gNameSet, sa);
+}
+
+/* data swapping ------------------------------------------------------------ */
+
+/*
+ * The token table contains non-negative entries for token bytes,
+ * and -1 for bytes that represent themselves in the data file's charset.
+ * -2 entries are used for lead bytes.
+ *
+ * Direct bytes (-1 entries) must be translated from the input charset family
+ * to the output charset family.
+ * makeTokenMap() writes a permutation mapping for this.
+ * Use it once for single-/lead-byte tokens and once more for all trail byte
+ * tokens. (';' is an unused trail byte marked with -1.)
+ *
+ * Parameters:
+ *   ds         - swapper; supplies inCharset/outCharset and swapInvChars().
+ *   tokens     - token table entries, indexed by byte value.
+ *   tokenCount - number of valid entries in tokens[]; only the first 256
+ *                are considered.
+ *   map        - output, 256 entries. Identity if both charset families are
+ *                equal; otherwise only entries below tokenCount are set and
+ *                the remaining ones stay 0.
+ *   pErrorCode - in/out; nothing is done if it already indicates a failure.
+ *                A failure from swapInvChars() is reported with
+ *                udata_printError() and leaves map[] partially filled.
+ */
+static void
+makeTokenMap(const UDataSwapper *ds,
+             int16_t tokens[], uint16_t tokenCount,
+             uint8_t map[256],
+             UErrorCode *pErrorCode) {
+    UBool usedOutChar[256];
+    uint16_t i, j;
+    uint8_t c1, c2;
+
+    if(U_FAILURE(*pErrorCode)) {
+        return;
+    }
+
+    if(ds->inCharset==ds->outCharset) {
+        /* Same charset family: identity permutation */
+        for(i=0; i<256; ++i) {
+            map[i]=(uint8_t)i;
+        }
+    } else {
+        uprv_memset(map, 0, 256); /* 0 doubles as the "not yet mapped" marker for indexes >=1 */
+        uprv_memset(usedOutChar, 0, 256);
+
+        if(tokenCount>256) {
+            tokenCount=256;
+        }
+
+        /* set the direct bytes (byte 0 always maps to itself) */
+        for(i=1; i<tokenCount; ++i) {
+            if(tokens[i]==-1) {
+                /* convert the direct byte character */
+                c1=(uint8_t)i;
+                ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
+                if(U_FAILURE(*pErrorCode)) {
+                    udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
+                                     i, ds->inCharset);
+                    return;
+                }
+
+                /* enter the converted character into the map and mark it used */
+                map[c1]=c2;
+                usedOutChar[c2]=TRUE;
+            }
+        }
+
+        /* set the mappings for the rest of the permutation */
+        for(i=j=1; i<tokenCount; ++i) {
+            /* set mappings that were not set for direct bytes */
+            if(map[i]==0) {
+                /* set an output byte value that was not used as an output byte above */
+                while(usedOutChar[j]) { /* NOTE(review): j is not bounds-checked; assumes a free output value always remains */
+                    ++j;
+                }
+                map[i]=(uint8_t)j++;
+            }
+        }
+
+        /*
+         * leave mappings at tokenCount and above unset if tokenCount<256
+         * because they won't be used
+         */
+    }
+}
+/**
+ * Swaps a unames.icu data block between byte orders and charset families.
+ *
+ * The header is handled by udata_swapDataHeader(). The data format must be
+ * "unam" with format version 1, otherwise U_UNSUPPORTED_ERROR is set.
+ *
+ * If length<0 (preflighting) nothing is written; the function only walks the
+ * algorithmic ranges to find the end of the data. Otherwise it swaps, in this
+ * order: the four initial offsets, the token table (permuted through the maps
+ * from makeTokenMap()), the token strings, the group table, the group strings
+ * (only when the charset families differ) and the algorithmic ranges.
+ *
+ * @param ds         Swapper providing the read/swap callbacks and charsets.
+ * @param inData     Input data, starting with the data header.
+ * @param length     Number of input bytes including the header, or negative
+ *                   for preflighting.
+ * @param outData    Output buffer; the code supports inData==outData
+ *                   (in-place swapping).
+ * @param pErrorCode In/out error code. Set here to U_UNSUPPORTED_ERROR
+ *                   (format or unknown range type), U_INDEX_OUTOFBOUNDS_ERROR
+ *                   (too few bytes) or U_MEMORY_ALLOCATION_ERROR.
+ * @return Header size plus the offset just past the last algorithmic range,
+ *         or 0 on failure.
+ */
+U_CAPI int32_t U_EXPORT2
+uchar_swapNames(const UDataSwapper *ds,
+                const void *inData, int32_t length, void *outData,
+                UErrorCode *pErrorCode) {
+    const UDataInfo *pInfo;
+    int32_t headerSize;
+
+    const uint8_t *inBytes;
+    uint8_t *outBytes;
+
+    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
+             offset, i, count, stringsCount;
+
+    const AlgorithmicRange *inRange;
+    AlgorithmicRange *outRange;
+
+    /* udata_swapDataHeader checks the arguments */
+    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    /* check data format and format version */
+    pInfo=(const UDataInfo *)((const char *)inData+4); /* UDataInfo is read at byte offset 4 of the header */
+    if(!(
+        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
+        pInfo->dataFormat[1]==0x6e &&
+        pInfo->dataFormat[2]==0x61 &&
+        pInfo->dataFormat[3]==0x6d &&
+        pInfo->formatVersion[0]==1
+    )) {
+        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
+                         pInfo->dataFormat[0], pInfo->dataFormat[1],
+                         pInfo->dataFormat[2], pInfo->dataFormat[3],
+                         pInfo->formatVersion[0]);
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    inBytes=(const uint8_t *)inData+headerSize;
+    outBytes=(uint8_t *)outData+headerSize;
+    if(length<0) {
+        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); /* fourth 32-bit word after the header */
+    } else {
+        length-=headerSize; /* from here on, length counts only the bytes after the header */
+        if( length<20 ||
+            (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
+        ) {
+            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
+                             length);
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+    }
+
+    if(length<0) {
+        /* preflighting: iterate through algorithmic ranges */
+        offset=algNamesOffset;
+        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
+        offset+=4;
+
+        for(i=0; i<count; ++i) {
+            inRange=(const AlgorithmicRange *)(inBytes+offset);
+            offset+=ds->readUInt16(inRange->size);
+        }
+    } else {
+        /* swap data */
+        const uint16_t *p;
+        uint16_t *q, *temp;
+
+        int16_t tokens[512];
+        uint16_t tokenCount;
+
+        uint8_t map[256], trailMap[256];
+
+        /* copy the data for inaccessible bytes */
+        if(inBytes!=outBytes) {
+            uprv_memcpy(outBytes, inBytes, length);
+        }
+
+        /* the initial 4 offsets first */
+        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
+        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
+        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
+        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
+
+        /*
+         * now the tokens table
+         * it needs to be permutated along with the compressed name strings
+         */
+        p=(const uint16_t *)(inBytes+16);
+        q=(uint16_t *)(outBytes+16);
+
+        /* read and swap the tokenCount */
+        tokenCount=ds->readUInt16(*p);
+        ds->swapArray16(ds, p, 2, q, pErrorCode);
+        ++p;
+        ++q;
+
+        /* read the first 512 tokens and make the token maps */
+        if(tokenCount<=512) {
+            count=tokenCount;
+        } else {
+            count=512;
+        }
+        for(i=0; i<count; ++i) {
+            tokens[i]=udata_readInt16(ds, p[i]);
+        }
+        for(; i<512; ++i) {
+            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
+        }
+        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode); /* single-/lead-byte map from tokens[0..255] */
+        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode); /* trail-byte map from tokens[256..511] */
+        if(U_FAILURE(*pErrorCode)) {
+            return 0;
+        }
+
+        /*
+         * swap and permutate the tokens
+         * go through a temporary array to support in-place swapping
+         */
+        temp=(uint16_t *)uprv_malloc(tokenCount*2); /* tokenCount 16-bit entries */
+        if(temp==NULL) {
+            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
+                             tokenCount);
+            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+            return 0;
+        }
+
+        /* swap and permutate single-/lead-byte tokens */
+        for(i=0; i<tokenCount && i<256; ++i) {
+            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode); /* NOTE(review): map[i] is not checked against tokenCount before indexing temp */
+        }
+
+        /* swap and permutate trail-byte tokens */
+        for(; i<tokenCount; ++i) {
+            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode); /* keeps the 256-entry page of i, permutes the low byte */
+        }
+
+        /* copy the result into the output and free the temporary array */
+        uprv_memcpy(q, temp, tokenCount*2);
+        uprv_free(temp);
+
+        /*
+         * swap the token strings but not a possible padding byte after
+         * the terminating NUL of the last string
+         */
+        udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
+                                 outBytes+tokenStringOffset, pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            udata_printError(ds, "uchar_swapNames(token strings) failed\n");
+            return 0;
+        }
+
+        /* swap the group table */
+        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
+        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), /* one count word plus three 16-bit words per group */
+                        outBytes+groupsOffset, pErrorCode);
+
+        /*
+         * swap the group strings
+         * swap the string bytes but not the nibble-encoded string lengths
+         */
+        if(ds->inCharset!=ds->outCharset) {
+            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
+
+            const uint8_t *inStrings, *nextInStrings;
+            uint8_t *outStrings;
+
+            uint8_t c;
+
+            inStrings=inBytes+groupStringOffset;
+            outStrings=outBytes+groupStringOffset;
+
+            stringsCount=algNamesOffset-groupStringOffset;
+
+            /* iterate through string groups until only a few padding bytes are left */
+            while(stringsCount>32) {
+                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
+
+                /* move past the length bytes */
+                stringsCount-=(uint32_t)(nextInStrings-inStrings);
+                outStrings+=nextInStrings-inStrings;
+                inStrings=nextInStrings;
+
+                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
+                stringsCount-=count;
+
+                /* swap the string bytes using map[] and trailMap[] */
+                while(count>0) {
+                    c=*inStrings++;
+                    *outStrings++=map[c];
+                    if(tokens[c]!=-2) { /* -2 marks a lead byte, see the token table comment before makeTokenMap() */
+                        --count;
+                    } else {
+                        /* token lead byte: swap the trail byte, too */
+                        *outStrings++=trailMap[*inStrings++];
+                        count-=2; /* NOTE(review): count is unsigned; this would wrap if a lead byte were the last counted byte */
+                    }
+                }
+            }
+        }
+
+        /* swap the algorithmic ranges */
+        offset=algNamesOffset;
+        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
+        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
+        offset+=4;
+
+        for(i=0; i<count; ++i) {
+            if(offset>(uint32_t)length) {
+                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
+                                 length, i);
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
+            }
+
+            inRange=(const AlgorithmicRange *)(inBytes+offset);
+            outRange=(AlgorithmicRange *)(outBytes+offset);
+            offset+=ds->readUInt16(inRange->size); /* read before the size field is swapped below; matters for in-place swapping */
+
+            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); /* the first 8 bytes of the record as two 32-bit values */
+            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
+            switch(inRange->type) {
+            case 0:
+                /* swap prefix string */
+                ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)), /* NOTE(review): strlen is not bounded by length; assumes a NUL-terminated prefix */
+                                    outRange+1, pErrorCode);
+                if(U_FAILURE(*pErrorCode)) {
+                    udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
+                                     i);
+                    return 0;
+                }
+                break;
+            case 1:
+                {
+                    /* swap factors and the prefix and factor strings */
+                    uint32_t factorsCount;
+
+                    factorsCount=inRange->variant;
+                    p=(const uint16_t *)(inRange+1);
+                    q=(uint16_t *)(outRange+1);
+                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
+
+                    /* swap the strings, up to the last terminating NUL */
+                    p+=factorsCount;
+                    q+=factorsCount;
+                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p); /* bytes from the end of the factors to the end of this record */
+                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
+                        --stringsCount;
+                    }
+                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
+                }
+                break;
+            default:
+                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
+                                 inRange->type, i);
+                *pErrorCode=U_UNSUPPORTED_ERROR;
+                return 0;
+            }
+        }
+    }
+
+    return headerSize+(int32_t)offset;
+}
+
+/*
+ * Hey, Emacs, please set the following:
+ *
+ * Local Variables:
+ * indent-tabs-mode: nil
+ * End:
+ *
+ */ |