diff options
author | mcheshkov <mcheshkov@yandex-team.ru> | 2022-02-10 16:46:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:15 +0300 |
commit | e9d19cec64684c9c1e6b0c98297e5b895cf904fe (patch) | |
tree | 2768b1223e96a8a0610a93d18425d9647c1123c8 /contrib/libs/icu/common/ucnv_u8.cpp | |
parent | 60040c91ffe701a84689b2c6310ff845e65cff42 (diff) | |
download | ydb-e9d19cec64684c9c1e6b0c98297e5b895cf904fe.tar.gz |
Restoring authorship annotation for <mcheshkov@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/common/ucnv_u8.cpp')
-rw-r--r-- | contrib/libs/icu/common/ucnv_u8.cpp | 1886 |
1 files changed, 943 insertions, 943 deletions
diff --git a/contrib/libs/icu/common/ucnv_u8.cpp b/contrib/libs/icu/common/ucnv_u8.cpp index 9b518e08df..4cc2fb181a 100644 --- a/contrib/libs/icu/common/ucnv_u8.cpp +++ b/contrib/libs/icu/common/ucnv_u8.cpp @@ -1,944 +1,944 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -********************************************************************** -* Copyright (C) 2002-2016, International Business Machines -* Corporation and others. All Rights Reserved. -********************************************************************** -* file name: ucnv_u8.c -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2002jul01 -* created by: Markus W. Scherer -* -* UTF-8 converter implementation. Used to be in ucnv_utf.c. -* -* Also, CESU-8 implementation, see UTR 26. -* The CESU-8 converter uses all the same functions as the -* UTF-8 converter, with a branch for converting supplementary code points. -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_CONVERSION - -#include "unicode/ucnv.h" -#include "unicode/utf.h" -#include "unicode/utf8.h" -#include "unicode/utf16.h" -#include "uassert.h" -#include "ucnv_bld.h" -#include "ucnv_cnv.h" -#include "cmemory.h" -#include "ustr_imp.h" - -/* Prototypes --------------------------------------------------------------- */ - -/* Keep these here to make finicky compilers happy */ - -U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, - UErrorCode *err); -U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, - UErrorCode *err); - - -/* UTF-8 -------------------------------------------------------------------- */ - -#define MAXIMUM_UCS2 0x0000FFFF - -static const uint32_t offsetsFromUTF8[5] = {0, - (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, - (uint32_t) 0x03C82080 -}; - -static UBool hasCESU8Data(const UConverter *cnv) -{ -#if UCONFIG_ONLY_HTML_CONVERSION - return FALSE; -#else - return (UBool)(cnv->sharedData == &_CESU8Data); -#endif -} -U_CDECL_BEGIN -static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, - UErrorCode * err) -{ - UConverter *cnv = args->converter; - const unsigned char *mySource = (unsigned char *) args->source; - UChar *myTarget = args->target; - const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; - const UChar *targetLimit = args->targetLimit; - unsigned char *toUBytes = cnv->toUBytes; - UBool isCESU8 = hasCESU8Data(cnv); - uint32_t ch, ch2 = 0; - int32_t i, inBytes; - - /* Restore size of current sequence */ - if (cnv->toULength > 0 && myTarget < targetLimit) - { - inBytes = cnv->mode; /* restore # of bytes to consume */ - i = cnv->toULength; /* restore # of bytes consumed */ - cnv->toULength = 0; - - ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ - cnv->toUnicodeStatus = 0; - goto morebytes; - } - - - while (mySource < sourceLimit && myTarget < targetLimit) - { - ch = *(mySource++); - if (U8_IS_SINGLE(ch)) /* Simple case */ - { - *(myTarget++) = (UChar) ch; - } - else - { - /* store the first char */ - toUBytes[0] = (char)ch; - inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */ - i = 1; - -morebytes: - while (i < inBytes) - { - if (mySource < sourceLimit) - { - toUBytes[i] = (char) (ch2 = *mySource); - if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) && - !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) - { - break; /* i < inBytes */ - } - ch = (ch << 6) + ch2; - ++mySource; - i++; - } - else - { - /* stores a partially calculated target*/ - cnv->toUnicodeStatus = ch; - cnv->mode = inBytes; - cnv->toULength = (int8_t) i; - goto donefornow; - } - } - - // In CESU-8, only surrogates, not supplementary code points, are encoded directly. - if (i == inBytes && (!isCESU8 || i <= 3)) - { - /* Remove the accumulated high bits */ - ch -= offsetsFromUTF8[inBytes]; - - /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ - if (ch <= MAXIMUM_UCS2) - { - /* fits in 16 bits */ - *(myTarget++) = (UChar) ch; - } - else - { - /* write out the surrogates */ - *(myTarget++) = U16_LEAD(ch); - ch = U16_TRAIL(ch); - if (myTarget < targetLimit) - { - *(myTarget++) = (UChar)ch; - } - else - { - /* Put in overflow buffer (not handled here) */ - cnv->UCharErrorBuffer[0] = (UChar) ch; - cnv->UCharErrorBufferLength = 1; - *err = U_BUFFER_OVERFLOW_ERROR; - break; - } - } - } - else - { - cnv->toULength = (int8_t)i; - *err = U_ILLEGAL_CHAR_FOUND; - break; - } - } - } - -donefornow: - if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) - { - /* End of target buffer */ - *err = U_BUFFER_OVERFLOW_ERROR; - } - - args->target = myTarget; - args->source = (const char *) mySource; -} - -static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, - UErrorCode * err) -{ - UConverter *cnv = args->converter; - const unsigned char *mySource = (unsigned char *) args->source; - UChar *myTarget = args->target; - int32_t *myOffsets = args->offsets; - int32_t offsetNum = 0; - const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; - const UChar *targetLimit = args->targetLimit; - unsigned char *toUBytes = cnv->toUBytes; - UBool isCESU8 = hasCESU8Data(cnv); - uint32_t ch, ch2 = 0; - int32_t i, inBytes; - - /* Restore size of current sequence */ - if (cnv->toULength > 0 && myTarget < targetLimit) - { - inBytes = cnv->mode; /* restore # of bytes to consume */ - i = cnv->toULength; /* restore # of bytes consumed */ - cnv->toULength = 0; - - ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ - cnv->toUnicodeStatus = 0; - goto morebytes; - } - - while (mySource < sourceLimit && myTarget < targetLimit) - { - ch = *(mySource++); - if (U8_IS_SINGLE(ch)) /* Simple case */ - { - *(myTarget++) = (UChar) ch; - *(myOffsets++) = offsetNum++; - } - else - { - toUBytes[0] = (char)ch; - inBytes = U8_COUNT_BYTES_NON_ASCII(ch); - i = 1; - -morebytes: - while (i < inBytes) - { - if (mySource < sourceLimit) - { - toUBytes[i] = (char) (ch2 = *mySource); - if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) && - !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) - { - break; /* i < inBytes */ - } - ch = (ch << 6) + ch2; - ++mySource; - i++; - } - else - { - cnv->toUnicodeStatus = ch; - cnv->mode = inBytes; - cnv->toULength = (int8_t)i; - goto donefornow; - } - } - - // In CESU-8, only surrogates, not supplementary code points, are encoded directly. - if (i == inBytes && (!isCESU8 || i <= 3)) - { - /* Remove the accumulated high bits */ - ch -= offsetsFromUTF8[inBytes]; - - /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ - if (ch <= MAXIMUM_UCS2) - { - /* fits in 16 bits */ - *(myTarget++) = (UChar) ch; - *(myOffsets++) = offsetNum; - } - else - { - /* write out the surrogates */ - *(myTarget++) = U16_LEAD(ch); - *(myOffsets++) = offsetNum; - ch = U16_TRAIL(ch); - if (myTarget < targetLimit) - { - *(myTarget++) = (UChar)ch; - *(myOffsets++) = offsetNum; - } - else - { - cnv->UCharErrorBuffer[0] = (UChar) ch; - cnv->UCharErrorBufferLength = 1; - *err = U_BUFFER_OVERFLOW_ERROR; - } - } - offsetNum += i; - } - else - { - cnv->toULength = (int8_t)i; - *err = U_ILLEGAL_CHAR_FOUND; - break; - } - } - } - -donefornow: - if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) - { /* End of target buffer */ - *err = U_BUFFER_OVERFLOW_ERROR; - } - - args->target = myTarget; - args->source = (const char *) mySource; - args->offsets = myOffsets; -} -U_CDECL_END - -U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, - UErrorCode * err) -{ - UConverter *cnv = args->converter; - const UChar *mySource = args->source; - const UChar *sourceLimit = args->sourceLimit; - uint8_t *myTarget = (uint8_t *) args->target; - const uint8_t *targetLimit = (uint8_t *) args->targetLimit; - uint8_t *tempPtr; - UChar32 ch; - uint8_t tempBuf[4]; - int32_t indexToWrite; - UBool isNotCESU8 = !hasCESU8Data(cnv); - - if (cnv->fromUChar32 && myTarget < targetLimit) - { - ch = cnv->fromUChar32; - cnv->fromUChar32 = 0; - goto lowsurrogate; - } - - while (mySource < sourceLimit && myTarget < targetLimit) - { - ch = *(mySource++); - - if (ch < 0x80) /* Single byte */ - { - *(myTarget++) = (uint8_t) ch; - } - else if (ch < 0x800) /* Double byte */ - { - *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); - if (myTarget < targetLimit) - { - *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); - } - else - { - cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); - cnv->charErrorBufferLength = 1; - *err = U_BUFFER_OVERFLOW_ERROR; - } - } - else { - /* Check for surrogates */ - if(U16_IS_SURROGATE(ch) && isNotCESU8) { -lowsurrogate: - if (mySource < sourceLimit) { - /* test both code units */ - if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { - /* convert and consume this supplementary code point */ - ch=U16_GET_SUPPLEMENTARY(ch, *mySource); - ++mySource; - /* exit this condition tree */ - } - else { - /* this is an unpaired trail or lead code unit */ - /* callback(illegal) */ - cnv->fromUChar32 = ch; - *err = U_ILLEGAL_CHAR_FOUND; - break; - } - } - else { - /* no more input */ - cnv->fromUChar32 = ch; - break; - } - } - - /* Do we write the buffer directly for speed, - or do we have to be careful about target buffer space? */ - tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); - - if (ch <= MAXIMUM_UCS2) { - indexToWrite = 2; - tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); - } - else { - indexToWrite = 3; - tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); - tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); - } - tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); - tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); - - if (tempPtr == myTarget) { - /* There was enough space to write the codepoint directly. */ - myTarget += (indexToWrite + 1); - } - else { - /* We might run out of room soon. Write it slowly. */ - for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { - if (myTarget < targetLimit) { - *(myTarget++) = *tempPtr; - } - else { - cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; - *err = U_BUFFER_OVERFLOW_ERROR; - } - } - } - } - } - - if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) - { - *err = U_BUFFER_OVERFLOW_ERROR; - } - - args->target = (char *) myTarget; - args->source = mySource; -} - -U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, - UErrorCode * err) -{ - UConverter *cnv = args->converter; - const UChar *mySource = args->source; - int32_t *myOffsets = args->offsets; - const UChar *sourceLimit = args->sourceLimit; - uint8_t *myTarget = (uint8_t *) args->target; - const uint8_t *targetLimit = (uint8_t *) args->targetLimit; - uint8_t *tempPtr; - UChar32 ch; - int32_t offsetNum, nextSourceIndex; - int32_t indexToWrite; - uint8_t tempBuf[4]; - UBool isNotCESU8 = !hasCESU8Data(cnv); - - if (cnv->fromUChar32 && myTarget < targetLimit) - { - ch = cnv->fromUChar32; - cnv->fromUChar32 = 0; - offsetNum = -1; - nextSourceIndex = 0; - goto lowsurrogate; - } else { - offsetNum = 0; - } - - while (mySource < sourceLimit && myTarget < targetLimit) - { - ch = *(mySource++); - - if (ch < 0x80) /* Single byte */ - { - *(myOffsets++) = offsetNum++; - *(myTarget++) = (char) ch; - } - else if (ch < 0x800) /* Double byte */ - { - *(myOffsets++) = offsetNum; - *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); - if (myTarget < targetLimit) - { - *(myOffsets++) = offsetNum++; - *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); - } - else - { - cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); - cnv->charErrorBufferLength = 1; - *err = U_BUFFER_OVERFLOW_ERROR; - } - } - else - /* Check for surrogates */ - { - nextSourceIndex = offsetNum + 1; - - if(U16_IS_SURROGATE(ch) && isNotCESU8) { -lowsurrogate: - if (mySource < sourceLimit) { - /* test both code units */ - if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { - /* convert and consume this supplementary code point */ - ch=U16_GET_SUPPLEMENTARY(ch, *mySource); - ++mySource; - ++nextSourceIndex; - /* exit this condition tree */ - } - else { - /* this is an unpaired trail or lead code unit */ - /* callback(illegal) */ - cnv->fromUChar32 = ch; - *err = U_ILLEGAL_CHAR_FOUND; - break; - } - } - else { - /* no more input */ - cnv->fromUChar32 = ch; - break; - } - } - - /* Do we write the buffer directly for speed, - or do we have to be careful about target buffer space? */ - tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); - - if (ch <= MAXIMUM_UCS2) { - indexToWrite = 2; - tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); - } - else { - indexToWrite = 3; - tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); - tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); - } - tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); - tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); - - if (tempPtr == myTarget) { - /* There was enough space to write the codepoint directly. */ - myTarget += (indexToWrite + 1); - myOffsets[0] = offsetNum; - myOffsets[1] = offsetNum; - myOffsets[2] = offsetNum; - if (indexToWrite >= 3) { - myOffsets[3] = offsetNum; - } - myOffsets += (indexToWrite + 1); - } - else { - /* We might run out of room soon. Write it slowly. */ - for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { - if (myTarget < targetLimit) - { - *(myOffsets++) = offsetNum; - *(myTarget++) = *tempPtr; - } - else - { - cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; - *err = U_BUFFER_OVERFLOW_ERROR; - } - } - } - offsetNum = nextSourceIndex; - } - } - - if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) - { - *err = U_BUFFER_OVERFLOW_ERROR; - } - - args->target = (char *) myTarget; - args->source = mySource; - args->offsets = myOffsets; -} - -U_CDECL_BEGIN -static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, - UErrorCode *err) { - UConverter *cnv; - const uint8_t *sourceInitial; - const uint8_t *source; - uint8_t myByte; - UChar32 ch; - int8_t i; - - /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ - - cnv = args->converter; - sourceInitial = source = (const uint8_t *)args->source; - if (source >= (const uint8_t *)args->sourceLimit) - { - /* no input */ - *err = U_INDEX_OUTOFBOUNDS_ERROR; - return 0xffff; - } - - myByte = (uint8_t)*(source++); - if (U8_IS_SINGLE(myByte)) - { - args->source = (const char *)source; - return (UChar32)myByte; - } - - uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte); - if (countTrailBytes == 0) { - cnv->toUBytes[0] = myByte; - cnv->toULength = 1; - *err = U_ILLEGAL_CHAR_FOUND; - args->source = (const char *)source; - return 0xffff; - } - - /*The byte sequence is longer than the buffer area passed*/ - if (((const char *)source + countTrailBytes) > args->sourceLimit) - { - /* check if all of the remaining bytes are trail bytes */ - uint16_t extraBytesToWrite = countTrailBytes + 1; - cnv->toUBytes[0] = myByte; - i = 1; - *err = U_TRUNCATED_CHAR_FOUND; - while(source < (const uint8_t *)args->sourceLimit) { - uint8_t b = *source; - if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) { - cnv->toUBytes[i++] = b; - ++source; - } else { - /* error even before we run out of input */ - *err = U_ILLEGAL_CHAR_FOUND; - break; - } - } - cnv->toULength = i; - args->source = (const char *)source; - return 0xffff; - } - - ch = myByte << 6; - if(countTrailBytes == 2) { - uint8_t t1 = *source, t2; - if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) { - args->source = (const char *)(source + 1); - return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3]; - } - } else if(countTrailBytes == 1) { - uint8_t t1 = *source; - if(U8_IS_TRAIL(t1)) { - args->source = (const char *)(source + 1); - return (ch + t1) - offsetsFromUTF8[2]; - } - } else { // countTrailBytes == 3 - uint8_t t1 = *source, t2, t3; - if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) && - U8_IS_TRAIL(t3 = *++source)) { - args->source = (const char *)(source + 1); - return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4]; - } - } - args->source = (const char *)source; - - for(i = 0; sourceInitial < source; ++i) { - cnv->toUBytes[i] = *sourceInitial++; - } - cnv->toULength = i; - *err = U_ILLEGAL_CHAR_FOUND; - return 0xffff; +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2002-2016, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* file name: ucnv_u8.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002jul01 +* created by: Markus W. Scherer +* +* UTF-8 converter implementation. Used to be in ucnv_utf.c. +* +* Also, CESU-8 implementation, see UTR 26. +* The CESU-8 converter uses all the same functions as the +* UTF-8 converter, with a branch for converting supplementary code points. +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "unicode/ucnv.h" +#include "unicode/utf.h" +#include "unicode/utf8.h" +#include "unicode/utf16.h" +#include "uassert.h" +#include "ucnv_bld.h" +#include "ucnv_cnv.h" +#include "cmemory.h" +#include "ustr_imp.h" + +/* Prototypes --------------------------------------------------------------- */ + +/* Keep these here to make finicky compilers happy */ + +U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, + UErrorCode *err); +U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, + UErrorCode *err); + + +/* UTF-8 -------------------------------------------------------------------- */ + +#define MAXIMUM_UCS2 0x0000FFFF + +static const uint32_t offsetsFromUTF8[5] = {0, + (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, + (uint32_t) 0x03C82080 +}; + +static UBool hasCESU8Data(const UConverter *cnv) +{ +#if UCONFIG_ONLY_HTML_CONVERSION + return FALSE; +#else + return (UBool)(cnv->sharedData == &_CESU8Data); +#endif } -U_CDECL_END - -/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ - -U_CDECL_BEGIN -/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ -static void U_CALLCONV -ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, - UConverterToUnicodeArgs *pToUArgs, - UErrorCode *pErrorCode) { - UConverter *utf8; - const uint8_t *source, *sourceLimit; - uint8_t *target; - int32_t targetCapacity; - int32_t count; - - int8_t oldToULength, toULength, toULimit; - - UChar32 c; - uint8_t b, t1, t2; - - /* set up the local pointers */ - utf8=pToUArgs->converter; - source=(uint8_t *)pToUArgs->source; - sourceLimit=(uint8_t *)pToUArgs->sourceLimit; - target=(uint8_t *)pFromUArgs->target; - targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); - - /* get the converter state from the UTF-8 UConverter */ - if(utf8->toULength > 0) { - toULength=oldToULength=utf8->toULength; - toULimit=(int8_t)utf8->mode; - c=(UChar32)utf8->toUnicodeStatus; - } else { - toULength=oldToULength=toULimit=0; - c = 0; - } - - count=(int32_t)(sourceLimit-source)+oldToULength; - if(count<toULimit) { - /* - * Not enough input to complete the partial character. - * Jump to moreBytes below - it will not output to target. - */ - } else if(targetCapacity<toULimit) { - /* - * Not enough target capacity to output the partial character. - * Let the standard converter handle this. - */ - *pErrorCode=U_USING_DEFAULT_WARNING; - return; - } else { - // Use a single counter for source and target, counting the minimum of - // the source length and the target capacity. - // Let the standard converter handle edge cases. - if(count>targetCapacity) { - count=targetCapacity; - } - - // The conversion loop checks count>0 only once per character. - // If the buffer ends with a truncated sequence, - // then we reduce the count to stop before that, - // and collect the remaining bytes after the conversion loop. - - // Do not go back into the bytes that will be read for finishing a partial - // sequence from the previous buffer. - int32_t length=count-toULimit; - U8_TRUNCATE_IF_INCOMPLETE(source, 0, length); - count=toULimit+length; - } - - if(c!=0) { - utf8->toUnicodeStatus=0; - utf8->toULength=0; - goto moreBytes; - /* See note in ucnv_SBCSFromUTF8() about this goto. */ - } - - /* conversion loop */ - while(count>0) { - b=*source++; - if(U8_IS_SINGLE(b)) { - /* convert ASCII */ - *target++=b; - --count; - continue; - } else { - if(b>=0xe0) { - if( /* handle U+0800..U+FFFF inline */ - b<0xf0 && - U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) && - U8_IS_TRAIL(t2=source[1]) - ) { - source+=2; - *target++=b; - *target++=t1; - *target++=t2; - count-=3; - continue; - } - } else { - if( /* handle U+0080..U+07FF inline */ - b>=0xc2 && - U8_IS_TRAIL(t1=*source) - ) { - ++source; - *target++=b; - *target++=t1; - count-=2; - continue; - } - } - - /* handle "complicated" and error cases, and continuing partial characters */ - oldToULength=0; - toULength=1; - toULimit=U8_COUNT_BYTES_NON_ASCII(b); - c=b; -moreBytes: - while(toULength<toULimit) { - if(source<sourceLimit) { - b=*source; - if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) { - ++source; - ++toULength; - c=(c<<6)+b; - } else { - break; /* sequence too short, stop with toULength<toULimit */ - } - } else { - /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ - source-=(toULength-oldToULength); - while(oldToULength<toULength) { - utf8->toUBytes[oldToULength++]=*source++; - } - utf8->toUnicodeStatus=c; - utf8->toULength=toULength; - utf8->mode=toULimit; - pToUArgs->source=(char *)source; - pFromUArgs->target=(char *)target; - return; - } - } - - if(toULength!=toULimit) { - /* error handling: illegal UTF-8 byte sequence */ - source-=(toULength-oldToULength); - while(oldToULength<toULength) { - utf8->toUBytes[oldToULength++]=*source++; - } - utf8->toULength=toULength; - pToUArgs->source=(char *)source; - pFromUArgs->target=(char *)target; - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - return; - } - - /* copy the legal byte sequence to the target */ - { - int8_t i; - - for(i=0; i<oldToULength; ++i) { - *target++=utf8->toUBytes[i]; - } - source-=(toULength-oldToULength); - for(; i<toULength; ++i) { - *target++=*source++; - } - count-=toULength; - } - } - } - U_ASSERT(count>=0); - - if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { - if(target==(const uint8_t *)pFromUArgs->targetLimit) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } else { - b=*source; - toULimit=U8_COUNT_BYTES(b); - if(toULimit>(sourceLimit-source)) { - /* collect a truncated byte sequence */ - toULength=0; - c=b; - for(;;) { - utf8->toUBytes[toULength++]=b; - if(++source==sourceLimit) { - /* partial byte sequence at end of source */ - utf8->toUnicodeStatus=c; - utf8->toULength=toULength; - utf8->mode=toULimit; - break; - } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) { - utf8->toULength=toULength; - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - break; - } - c=(c<<6)+b; - } - } else { - /* partial-sequence target overflow: fall back to the pivoting implementation */ - *pErrorCode=U_USING_DEFAULT_WARNING; - } - } - } - - /* write back the updated pointers */ - pToUArgs->source=(char *)source; - pFromUArgs->target=(char *)target; -} - -U_CDECL_END - -/* UTF-8 converter data ----------------------------------------------------- */ - -static const UConverterImpl _UTF8Impl={ - UCNV_UTF8, - - NULL, - NULL, - - NULL, - NULL, - NULL, - - ucnv_toUnicode_UTF8, - ucnv_toUnicode_UTF8_OFFSETS_LOGIC, - ucnv_fromUnicode_UTF8, - ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, - ucnv_getNextUChar_UTF8, - - NULL, - NULL, - NULL, - NULL, - ucnv_getNonSurrogateUnicodeSet, - - ucnv_UTF8FromUTF8, - ucnv_UTF8FromUTF8 -}; - -/* The 1208 CCSID refers to any version of Unicode of UTF-8 */ -static const UConverterStaticData _UTF8StaticData={ - sizeof(UConverterStaticData), - "UTF-8", - 1208, UCNV_IBM, UCNV_UTF8, - 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ - { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, - 0, - 0, - { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ -}; - - -const UConverterSharedData _UTF8Data= - UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl); - -/* CESU-8 converter data ---------------------------------------------------- */ - -static const UConverterImpl _CESU8Impl={ - UCNV_CESU8, - - NULL, - NULL, - - NULL, - NULL, - NULL, - - ucnv_toUnicode_UTF8, - ucnv_toUnicode_UTF8_OFFSETS_LOGIC, - ucnv_fromUnicode_UTF8, - ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, - NULL, - - NULL, - NULL, - NULL, - NULL, - ucnv_getCompleteUnicodeSet, - - NULL, - NULL -}; - -static const UConverterStaticData _CESU8StaticData={ - sizeof(UConverterStaticData), - "CESU-8", - 9400, /* CCSID for CESU-8 */ - UCNV_UNKNOWN, UCNV_CESU8, 1, 3, - { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, - 0, - 0, - { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ -}; - - -const UConverterSharedData _CESU8Data= - UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl); - -#endif +U_CDECL_BEGIN +static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, + UErrorCode * err) +{ + UConverter *cnv = args->converter; + const unsigned char *mySource = (unsigned char *) args->source; + UChar *myTarget = args->target; + const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; + const UChar *targetLimit = args->targetLimit; + unsigned char *toUBytes = cnv->toUBytes; + UBool isCESU8 = hasCESU8Data(cnv); + uint32_t ch, ch2 = 0; + int32_t i, inBytes; + + /* Restore size of current sequence */ + if (cnv->toULength > 0 && myTarget < targetLimit) + { + inBytes = cnv->mode; /* restore # of bytes to consume */ + i = cnv->toULength; /* restore # of bytes consumed */ + cnv->toULength = 0; + + ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ + cnv->toUnicodeStatus = 0; + goto morebytes; + } + + + while (mySource < sourceLimit && myTarget < targetLimit) + { + ch = *(mySource++); + if (U8_IS_SINGLE(ch)) /* Simple case */ + { + *(myTarget++) = (UChar) ch; + } + else + { + /* store the first char */ + toUBytes[0] = (char)ch; + inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */ + i = 1; + +morebytes: + while (i < inBytes) + { + if (mySource < sourceLimit) + { + toUBytes[i] = (char) (ch2 = *mySource); + if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) && + !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) + { + break; /* i < inBytes */ + } + ch = (ch << 6) + ch2; + ++mySource; + i++; + } + else + { + /* stores a partially calculated target*/ + cnv->toUnicodeStatus = ch; + cnv->mode = inBytes; + cnv->toULength = (int8_t) i; + goto donefornow; + } + } + + // In CESU-8, only surrogates, not supplementary code points, are encoded directly. + if (i == inBytes && (!isCESU8 || i <= 3)) + { + /* Remove the accumulated high bits */ + ch -= offsetsFromUTF8[inBytes]; + + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ + if (ch <= MAXIMUM_UCS2) + { + /* fits in 16 bits */ + *(myTarget++) = (UChar) ch; + } + else + { + /* write out the surrogates */ + *(myTarget++) = U16_LEAD(ch); + ch = U16_TRAIL(ch); + if (myTarget < targetLimit) + { + *(myTarget++) = (UChar)ch; + } + else + { + /* Put in overflow buffer (not handled here) */ + cnv->UCharErrorBuffer[0] = (UChar) ch; + cnv->UCharErrorBufferLength = 1; + *err = U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } + else + { + cnv->toULength = (int8_t)i; + *err = U_ILLEGAL_CHAR_FOUND; + break; + } + } + } + +donefornow: + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) + { + /* End of target buffer */ + *err = U_BUFFER_OVERFLOW_ERROR; + } + + args->target = myTarget; + args->source = (const char *) mySource; +} + +static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, + UErrorCode * err) +{ + UConverter *cnv = args->converter; + const unsigned char *mySource = (unsigned char *) args->source; + UChar *myTarget = args->target; + int32_t *myOffsets = args->offsets; + int32_t offsetNum = 0; + const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; + const UChar *targetLimit = args->targetLimit; + unsigned char *toUBytes = cnv->toUBytes; + UBool isCESU8 = hasCESU8Data(cnv); + uint32_t ch, ch2 = 0; + int32_t i, inBytes; + + /* Restore size of current sequence */ + if (cnv->toULength > 0 && myTarget < targetLimit) + { + inBytes = cnv->mode; /* restore # of bytes to consume */ + i = cnv->toULength; /* restore # of bytes consumed */ + cnv->toULength = 0; + + ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ + cnv->toUnicodeStatus = 0; + goto morebytes; + } + + while (mySource < sourceLimit && myTarget < targetLimit) + { + ch = *(mySource++); + if (U8_IS_SINGLE(ch)) /* Simple case */ + { + *(myTarget++) = (UChar) ch; + *(myOffsets++) = offsetNum++; + } + else + { + toUBytes[0] = (char)ch; + inBytes = U8_COUNT_BYTES_NON_ASCII(ch); + i = 1; + +morebytes: + while (i < inBytes) + { + if (mySource < sourceLimit) + { + toUBytes[i] = (char) (ch2 = *mySource); + if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) && + !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) + { + break; /* i < inBytes */ + } + ch = (ch << 6) + ch2; + ++mySource; + i++; + } + else + { + cnv->toUnicodeStatus = ch; + cnv->mode = inBytes; + cnv->toULength = (int8_t)i; + goto donefornow; + } + } + + // In CESU-8, only surrogates, not supplementary code points, are encoded directly. + if (i == inBytes && (!isCESU8 || i <= 3)) + { + /* Remove the accumulated high bits */ + ch -= offsetsFromUTF8[inBytes]; + + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ + if (ch <= MAXIMUM_UCS2) + { + /* fits in 16 bits */ + *(myTarget++) = (UChar) ch; + *(myOffsets++) = offsetNum; + } + else + { + /* write out the surrogates */ + *(myTarget++) = U16_LEAD(ch); + *(myOffsets++) = offsetNum; + ch = U16_TRAIL(ch); + if (myTarget < targetLimit) + { + *(myTarget++) = (UChar)ch; + *(myOffsets++) = offsetNum; + } + else + { + cnv->UCharErrorBuffer[0] = (UChar) ch; + cnv->UCharErrorBufferLength = 1; + *err = U_BUFFER_OVERFLOW_ERROR; + } + } + offsetNum += i; + } + else + { + cnv->toULength = (int8_t)i; + *err = U_ILLEGAL_CHAR_FOUND; + break; + } + } + } + +donefornow: + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) + { /* End of target buffer */ + *err = U_BUFFER_OVERFLOW_ERROR; + } + + args->target = myTarget; + args->source = (const char *) mySource; + args->offsets = myOffsets; +} +U_CDECL_END + +U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, + UErrorCode * err) +{ + UConverter *cnv = args->converter; + const UChar *mySource = args->source; + const UChar *sourceLimit = args->sourceLimit; + uint8_t *myTarget = (uint8_t *) args->target; + const uint8_t *targetLimit = (uint8_t *) args->targetLimit; + uint8_t *tempPtr; + UChar32 ch; + uint8_t tempBuf[4]; + int32_t indexToWrite; + UBool isNotCESU8 = !hasCESU8Data(cnv); + + if (cnv->fromUChar32 && myTarget < targetLimit) + { + ch = cnv->fromUChar32; + cnv->fromUChar32 = 0; + goto lowsurrogate; + } + + while (mySource < sourceLimit && myTarget < targetLimit) + { + ch = *(mySource++); + + if (ch < 0x80) /* Single byte */ + { + *(myTarget++) = (uint8_t) ch; + } + else if (ch < 0x800) /* Double byte */ + { + *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); + if (myTarget < targetLimit) + { + *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); + } + else + { + cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); + cnv->charErrorBufferLength = 1; + *err = U_BUFFER_OVERFLOW_ERROR; + } + } + else { + /* Check for surrogates */ + if(U16_IS_SURROGATE(ch) && isNotCESU8) { +lowsurrogate: + if (mySource < sourceLimit) { + /* test both code units */ + if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { + /* convert and consume this supplementary code point */ + ch=U16_GET_SUPPLEMENTARY(ch, *mySource); + ++mySource; + /* exit this condition tree */ + } + else { + /* this is an unpaired trail or lead code unit */ + /* callback(illegal) */ + cnv->fromUChar32 = ch; + *err = U_ILLEGAL_CHAR_FOUND; + break; + } + } + else { + /* no more input */ + cnv->fromUChar32 = ch; + break; + } + } + + /* Do we write the buffer directly for speed, + or do we have to be careful about target buffer space? */ + tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); + + if (ch <= MAXIMUM_UCS2) { + indexToWrite = 2; + tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); + } + else { + indexToWrite = 3; + tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); + tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); + } + tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); + tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); + + if (tempPtr == myTarget) { + /* There was enough space to write the codepoint directly. */ + myTarget += (indexToWrite + 1); + } + else { + /* We might run out of room soon. Write it slowly. */ + for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { + if (myTarget < targetLimit) { + *(myTarget++) = *tempPtr; + } + else { + cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; + *err = U_BUFFER_OVERFLOW_ERROR; + } + } + } + } + } + + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) + { + *err = U_BUFFER_OVERFLOW_ERROR; + } + + args->target = (char *) myTarget; + args->source = mySource; +} + +U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, + UErrorCode * err) +{ + UConverter *cnv = args->converter; + const UChar *mySource = args->source; + int32_t *myOffsets = args->offsets; + const UChar *sourceLimit = args->sourceLimit; + uint8_t *myTarget = (uint8_t *) args->target; + const uint8_t *targetLimit = (uint8_t *) args->targetLimit; + uint8_t *tempPtr; + UChar32 ch; + int32_t offsetNum, nextSourceIndex; + int32_t indexToWrite; + uint8_t tempBuf[4]; + UBool isNotCESU8 = !hasCESU8Data(cnv); + + if (cnv->fromUChar32 && myTarget < targetLimit) + { + ch = cnv->fromUChar32; + cnv->fromUChar32 = 0; + offsetNum = -1; + nextSourceIndex = 0; + goto lowsurrogate; + } else { + offsetNum = 0; + } + + while (mySource < sourceLimit && myTarget < targetLimit) + { + ch = *(mySource++); + + if (ch < 0x80) /* Single byte */ + { + *(myOffsets++) = offsetNum++; + *(myTarget++) = (char) ch; + } + else if (ch < 0x800) /* Double byte */ + { + *(myOffsets++) = offsetNum; + *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); + if (myTarget < targetLimit) + { + *(myOffsets++) = offsetNum++; + *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); + } + else + { + cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); + cnv->charErrorBufferLength = 1; + *err = U_BUFFER_OVERFLOW_ERROR; + } + } + else + /* Check for surrogates */ + { + nextSourceIndex = offsetNum + 1; + + if(U16_IS_SURROGATE(ch) && isNotCESU8) { +lowsurrogate: + if (mySource < sourceLimit) { + /* test both code units */ + if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { + /* convert and consume this supplementary code point */ + ch=U16_GET_SUPPLEMENTARY(ch, *mySource); + ++mySource; + ++nextSourceIndex; + /* exit this condition tree */ + } + else { + /* this is an unpaired trail or lead code unit */ + /* callback(illegal) */ + cnv->fromUChar32 = ch; + *err = U_ILLEGAL_CHAR_FOUND; + break; + } + } + else { + /* no more input */ + cnv->fromUChar32 = ch; + break; + } + } + + /* Do we write the buffer directly for speed, + or do we have to be careful about target buffer space? */ + tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); + + if (ch <= MAXIMUM_UCS2) { + indexToWrite = 2; + tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); + } + else { + indexToWrite = 3; + tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); + tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); + } + tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); + tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); + + if (tempPtr == myTarget) { + /* There was enough space to write the codepoint directly. */ + myTarget += (indexToWrite + 1); + myOffsets[0] = offsetNum; + myOffsets[1] = offsetNum; + myOffsets[2] = offsetNum; + if (indexToWrite >= 3) { + myOffsets[3] = offsetNum; + } + myOffsets += (indexToWrite + 1); + } + else { + /* We might run out of room soon. Write it slowly. */ + for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { + if (myTarget < targetLimit) + { + *(myOffsets++) = offsetNum; + *(myTarget++) = *tempPtr; + } + else + { + cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; + *err = U_BUFFER_OVERFLOW_ERROR; + } + } + } + offsetNum = nextSourceIndex; + } + } + + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) + { + *err = U_BUFFER_OVERFLOW_ERROR; + } + + args->target = (char *) myTarget; + args->source = mySource; + args->offsets = myOffsets; +} + +U_CDECL_BEGIN +static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, + UErrorCode *err) { + UConverter *cnv; + const uint8_t *sourceInitial; + const uint8_t *source; + uint8_t myByte; + UChar32 ch; + int8_t i; + + /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ + + cnv = args->converter; + sourceInitial = source = (const uint8_t *)args->source; + if (source >= (const uint8_t *)args->sourceLimit) + { + /* no input */ + *err = U_INDEX_OUTOFBOUNDS_ERROR; + return 0xffff; + } + + myByte = (uint8_t)*(source++); + if (U8_IS_SINGLE(myByte)) + { + args->source = (const char *)source; + return (UChar32)myByte; + } + + uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte); + if (countTrailBytes == 0) { + cnv->toUBytes[0] = myByte; + cnv->toULength = 1; + *err = U_ILLEGAL_CHAR_FOUND; + args->source = (const char *)source; + return 0xffff; + } + + /*The byte sequence is longer than the buffer area passed*/ + if (((const char *)source + countTrailBytes) > args->sourceLimit) + { + /* check if all of the remaining bytes are trail bytes */ + uint16_t extraBytesToWrite = countTrailBytes + 1; + cnv->toUBytes[0] = myByte; + i = 1; + *err = U_TRUNCATED_CHAR_FOUND; + while(source < (const uint8_t *)args->sourceLimit) { + uint8_t b = *source; + if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) { + cnv->toUBytes[i++] = b; + ++source; + } else { + /* error even before we run out of input */ + *err = U_ILLEGAL_CHAR_FOUND; + break; + } + } + cnv->toULength = i; + args->source = (const char *)source; + return 0xffff; + } + + ch = myByte << 6; + if(countTrailBytes == 2) { + uint8_t t1 = *source, t2; + if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) { + args->source = (const char *)(source + 1); + return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3]; + } + } else if(countTrailBytes == 1) { + uint8_t t1 = *source; + if(U8_IS_TRAIL(t1)) { + args->source = (const char *)(source + 1); + return (ch + t1) - offsetsFromUTF8[2]; + } + } else { // countTrailBytes == 3 + uint8_t t1 = *source, t2, t3; + if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) && + U8_IS_TRAIL(t3 = *++source)) { + args->source = (const char *)(source + 1); + return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4]; + } + } + args->source = (const char *)source; + + for(i = 0; sourceInitial < source; ++i) { + cnv->toUBytes[i] = *sourceInitial++; + } + cnv->toULength = i; + *err = U_ILLEGAL_CHAR_FOUND; + return 0xffff; +} +U_CDECL_END + +/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ + +U_CDECL_BEGIN +/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ +static void U_CALLCONV +ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, + UConverterToUnicodeArgs *pToUArgs, + UErrorCode *pErrorCode) { + UConverter *utf8; + const uint8_t *source, *sourceLimit; + uint8_t *target; + int32_t targetCapacity; + int32_t count; + + int8_t oldToULength, toULength, toULimit; + + UChar32 c; + uint8_t b, t1, t2; + + /* set up the local pointers */ + utf8=pToUArgs->converter; + source=(uint8_t *)pToUArgs->source; + sourceLimit=(uint8_t *)pToUArgs->sourceLimit; + target=(uint8_t *)pFromUArgs->target; + targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); + + /* get the converter state from the UTF-8 UConverter */ + if(utf8->toULength > 0) { + toULength=oldToULength=utf8->toULength; + toULimit=(int8_t)utf8->mode; + c=(UChar32)utf8->toUnicodeStatus; + } else { + toULength=oldToULength=toULimit=0; + c = 0; + } + + count=(int32_t)(sourceLimit-source)+oldToULength; + if(count<toULimit) { + /* + * Not enough input to complete the partial character. + * Jump to moreBytes below - it will not output to target. + */ + } else if(targetCapacity<toULimit) { + /* + * Not enough target capacity to output the partial character. + * Let the standard converter handle this. + */ + *pErrorCode=U_USING_DEFAULT_WARNING; + return; + } else { + // Use a single counter for source and target, counting the minimum of + // the source length and the target capacity. + // Let the standard converter handle edge cases. + if(count>targetCapacity) { + count=targetCapacity; + } + + // The conversion loop checks count>0 only once per character. + // If the buffer ends with a truncated sequence, + // then we reduce the count to stop before that, + // and collect the remaining bytes after the conversion loop. + + // Do not go back into the bytes that will be read for finishing a partial + // sequence from the previous buffer. + int32_t length=count-toULimit; + U8_TRUNCATE_IF_INCOMPLETE(source, 0, length); + count=toULimit+length; + } + + if(c!=0) { + utf8->toUnicodeStatus=0; + utf8->toULength=0; + goto moreBytes; + /* See note in ucnv_SBCSFromUTF8() about this goto. */ + } + + /* conversion loop */ + while(count>0) { + b=*source++; + if(U8_IS_SINGLE(b)) { + /* convert ASCII */ + *target++=b; + --count; + continue; + } else { + if(b>=0xe0) { + if( /* handle U+0800..U+FFFF inline */ + b<0xf0 && + U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) && + U8_IS_TRAIL(t2=source[1]) + ) { + source+=2; + *target++=b; + *target++=t1; + *target++=t2; + count-=3; + continue; + } + } else { + if( /* handle U+0080..U+07FF inline */ + b>=0xc2 && + U8_IS_TRAIL(t1=*source) + ) { + ++source; + *target++=b; + *target++=t1; + count-=2; + continue; + } + } + + /* handle "complicated" and error cases, and continuing partial characters */ + oldToULength=0; + toULength=1; + toULimit=U8_COUNT_BYTES_NON_ASCII(b); + c=b; +moreBytes: + while(toULength<toULimit) { + if(source<sourceLimit) { + b=*source; + if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) { + ++source; + ++toULength; + c=(c<<6)+b; + } else { + break; /* sequence too short, stop with toULength<toULimit */ + } + } else { + /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ + source-=(toULength-oldToULength); + while(oldToULength<toULength) { + utf8->toUBytes[oldToULength++]=*source++; + } + utf8->toUnicodeStatus=c; + utf8->toULength=toULength; + utf8->mode=toULimit; + pToUArgs->source=(char *)source; + pFromUArgs->target=(char *)target; + return; + } + } + + if(toULength!=toULimit) { + /* error handling: illegal UTF-8 byte sequence */ + source-=(toULength-oldToULength); + while(oldToULength<toULength) { + utf8->toUBytes[oldToULength++]=*source++; + } + utf8->toULength=toULength; + pToUArgs->source=(char *)source; + pFromUArgs->target=(char *)target; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + return; + } + + /* copy the legal byte sequence to the target */ + { + int8_t i; + + for(i=0; i<oldToULength; ++i) { + *target++=utf8->toUBytes[i]; + } + source-=(toULength-oldToULength); + for(; i<toULength; ++i) { + *target++=*source++; + } + count-=toULength; + } + } + } + U_ASSERT(count>=0); + + if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { + if(target==(const uint8_t *)pFromUArgs->targetLimit) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } else { + b=*source; + toULimit=U8_COUNT_BYTES(b); + if(toULimit>(sourceLimit-source)) { + /* collect a truncated byte sequence */ + toULength=0; + c=b; + for(;;) { + utf8->toUBytes[toULength++]=b; + if(++source==sourceLimit) { + /* partial byte sequence at end of source */ + utf8->toUnicodeStatus=c; + utf8->toULength=toULength; + utf8->mode=toULimit; + break; + } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) { + utf8->toULength=toULength; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; + } + c=(c<<6)+b; + } + } else { + /* partial-sequence target overflow: fall back to the pivoting implementation */ + *pErrorCode=U_USING_DEFAULT_WARNING; + } + } + } + + /* write back the updated pointers */ + pToUArgs->source=(char *)source; + pFromUArgs->target=(char *)target; +} + +U_CDECL_END + +/* UTF-8 converter data ----------------------------------------------------- */ + +static const UConverterImpl _UTF8Impl={ + UCNV_UTF8, + + NULL, + NULL, + + NULL, + NULL, + NULL, + + ucnv_toUnicode_UTF8, + ucnv_toUnicode_UTF8_OFFSETS_LOGIC, + ucnv_fromUnicode_UTF8, + ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, + ucnv_getNextUChar_UTF8, + + NULL, + NULL, + NULL, + NULL, + ucnv_getNonSurrogateUnicodeSet, + + ucnv_UTF8FromUTF8, + ucnv_UTF8FromUTF8 +}; + +/* The 1208 CCSID refers to any version of Unicode of UTF-8 */ +static const UConverterStaticData _UTF8StaticData={ + sizeof(UConverterStaticData), + "UTF-8", + 1208, UCNV_IBM, UCNV_UTF8, + 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ + { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + + +const UConverterSharedData _UTF8Data= + UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl); + +/* CESU-8 converter data ---------------------------------------------------- */ + +static const UConverterImpl _CESU8Impl={ + UCNV_CESU8, + + NULL, + NULL, + + NULL, + NULL, + NULL, + + ucnv_toUnicode_UTF8, + ucnv_toUnicode_UTF8_OFFSETS_LOGIC, + ucnv_fromUnicode_UTF8, + ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, + NULL, + + NULL, + NULL, + NULL, + NULL, + ucnv_getCompleteUnicodeSet, + + NULL, + NULL +}; + +static const UConverterStaticData _CESU8StaticData={ + sizeof(UConverterStaticData), + "CESU-8", + 9400, /* CCSID for CESU-8 */ + UCNV_UNKNOWN, UCNV_CESU8, 1, 3, + { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + + +const UConverterSharedData _CESU8Data= + UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl); + +#endif |