diff options
author | mcheshkov <mcheshkov@yandex-team.ru> | 2022-02-10 16:46:16 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:16 +0300 |
commit | 1312621288956f199a5bd5342b0133d4395fa725 (patch) | |
tree | 1a2c5ffcf89eb53ecd79dbc9bc0a195c27404d0c /contrib/libs/icu/common/ucnv_u16.cpp | |
parent | e9d19cec64684c9c1e6b0c98297e5b895cf904fe (diff) | |
download | ydb-1312621288956f199a5bd5342b0133d4395fa725.tar.gz |
Restoring authorship annotation for <mcheshkov@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/common/ucnv_u16.cpp')
-rw-r--r-- | contrib/libs/icu/common/ucnv_u16.cpp | 3154 |
1 files changed, 1577 insertions, 1577 deletions
diff --git a/contrib/libs/icu/common/ucnv_u16.cpp b/contrib/libs/icu/common/ucnv_u16.cpp index 0f76feb25e..a5e8367400 100644 --- a/contrib/libs/icu/common/ucnv_u16.cpp +++ b/contrib/libs/icu/common/ucnv_u16.cpp @@ -1,1579 +1,1579 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -********************************************************************** -* Copyright (C) 2002-2015, International Business Machines -* Corporation and others. All Rights Reserved. -********************************************************************** -* file name: ucnv_u16.c -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2002jul01 -* created by: Markus W. Scherer -* -* UTF-16 converter implementation. Used to be in ucnv_utf.c. -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_CONVERSION - -#include "unicode/ucnv.h" -#include "unicode/uversion.h" -#include "ucnv_bld.h" -#include "ucnv_cnv.h" -#include "cmemory.h" - -enum { - UCNV_NEED_TO_WRITE_BOM=1 -}; - -U_CDECL_BEGIN -/* - * The UTF-16 toUnicode implementation is also used for the Java-specific - * "with BOM" variants of UTF-16BE and UTF-16LE. - */ -static void U_CALLCONV -_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode); - -/* UTF-16BE ----------------------------------------------------------------- */ - -#if U_IS_BIG_ENDIAN -# define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets -#else -# define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets -#endif - - -static void U_CALLCONV -_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const UChar *source; - char *target; - int32_t *offsets; - - uint32_t targetCapacity, length, sourceIndex; - UChar c, trail; - char overflow[4]; - - source=pArgs->source; - length=(int32_t)(pArgs->sourceLimit-source); - if(length<=0) { - /* no input, nothing to do */ - return; - } - - cnv=pArgs->converter; - - /* write the BOM if necessary */ - if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { - static const char bom[]={ (char)0xfeu, (char)0xffu }; - ucnv_fromUWriteBytes(cnv, - bom, 2, - &pArgs->target, pArgs->targetLimit, - &pArgs->offsets, -1, - pErrorCode); - cnv->fromUnicodeStatus=0; - } - - target=pArgs->target; - if(target >= pArgs->targetLimit) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - return; - } - - targetCapacity=(uint32_t)(pArgs->targetLimit-target); - offsets=pArgs->offsets; - sourceIndex=0; - - /* c!=0 indicates in several places outside the main loops that a surrogate was found */ - - if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { - /* the last buffer ended with a lead surrogate, output the surrogate pair */ - ++source; - --length; - target[0]=(uint8_t)(c>>8); - target[1]=(uint8_t)c; - target[2]=(uint8_t)(trail>>8); - target[3]=(uint8_t)trail; - target+=4; - targetCapacity-=4; - if(offsets!=NULL) { - *offsets++=-1; - *offsets++=-1; - *offsets++=-1; - *offsets++=-1; - } - sourceIndex=1; - cnv->fromUChar32=c=0; - } - - if(c==0) { - /* copy an even number of bytes for complete UChars */ - uint32_t count=2*length; - if(count>targetCapacity) { - count=targetCapacity&~1; - } - /* count is even */ - targetCapacity-=count; - count>>=1; - length-=count; - - if(offsets==NULL) { - while(count>0) { - c=*source++; - if(U16_IS_SINGLE(c)) { - target[0]=(uint8_t)(c>>8); - target[1]=(uint8_t)c; - target+=2; - } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { - ++source; - --count; - target[0]=(uint8_t)(c>>8); - target[1]=(uint8_t)c; - target[2]=(uint8_t)(trail>>8); - target[3]=(uint8_t)trail; - target+=4; - } else { - break; - } - --count; - } - } else { - while(count>0) { - c=*source++; - if(U16_IS_SINGLE(c)) { - target[0]=(uint8_t)(c>>8); - target[1]=(uint8_t)c; - target+=2; - *offsets++=sourceIndex; - *offsets++=sourceIndex++; - } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { - ++source; - --count; - target[0]=(uint8_t)(c>>8); - target[1]=(uint8_t)c; - target[2]=(uint8_t)(trail>>8); - target[3]=(uint8_t)trail; - target+=4; - *offsets++=sourceIndex; - *offsets++=sourceIndex; - *offsets++=sourceIndex; - *offsets++=sourceIndex; - sourceIndex+=2; - } else { - break; - } - --count; - } - } - - if(count==0) { - /* done with the loop for complete UChars */ - if(length>0 && targetCapacity>0) { - /* - * there is more input and some target capacity - - * it must be targetCapacity==1 because otherwise - * the above would have copied more; - * prepare for overflow output - */ - if(U16_IS_SINGLE(c=*source++)) { - overflow[0]=(char)(c>>8); - overflow[1]=(char)c; - length=2; /* 2 bytes to output */ - c=0; - /* } else { keep c for surrogate handling, length will be set there */ - } - } else { - length=0; - c=0; - } - } else { - /* keep c for surrogate handling, length will be set there */ - targetCapacity+=2*count; - } - } else { - length=0; /* from here on, length counts the bytes in overflow[] */ - } - - if(c!=0) { - /* - * c is a surrogate, and - * - source or target too short - * - or the surrogate is unmatched - */ - length=0; - if(U16_IS_SURROGATE_LEAD(c)) { - if(source<pArgs->sourceLimit) { - if(U16_IS_TRAIL(trail=*source)) { - /* output the surrogate pair, will overflow (see conditions comment above) */ - ++source; - overflow[0]=(char)(c>>8); - overflow[1]=(char)c; - overflow[2]=(char)(trail>>8); - overflow[3]=(char)trail; - length=4; /* 4 bytes to output */ - c=0; - } else { - /* unmatched lead surrogate */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - } else { - /* see if the trail surrogate is in the next buffer */ - } - } else { - /* unmatched trail surrogate */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - cnv->fromUChar32=c; - } - - if(length>0) { - /* output length bytes with overflow (length>targetCapacity>0) */ - ucnv_fromUWriteBytes(cnv, - overflow, length, - (char **)&target, pArgs->targetLimit, - &offsets, sourceIndex, - pErrorCode); - targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); - } - - if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } - - /* write back the updated pointers */ - pArgs->source=source; - pArgs->target=(char *)target; - pArgs->offsets=offsets; +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2002-2015, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* file name: ucnv_u16.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002jul01 +* created by: Markus W. Scherer +* +* UTF-16 converter implementation. Used to be in ucnv_utf.c. +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "unicode/ucnv.h" +#include "unicode/uversion.h" +#include "ucnv_bld.h" +#include "ucnv_cnv.h" +#include "cmemory.h" + +enum { + UCNV_NEED_TO_WRITE_BOM=1 +}; + +U_CDECL_BEGIN +/* + * The UTF-16 toUnicode implementation is also used for the Java-specific + * "with BOM" variants of UTF-16BE and UTF-16LE. + */ +static void U_CALLCONV +_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode); + +/* UTF-16BE ----------------------------------------------------------------- */ + +#if U_IS_BIG_ENDIAN +# define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets +#else +# define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets +#endif + + +static void U_CALLCONV +_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const UChar *source; + char *target; + int32_t *offsets; + + uint32_t targetCapacity, length, sourceIndex; + UChar c, trail; + char overflow[4]; + + source=pArgs->source; + length=(int32_t)(pArgs->sourceLimit-source); + if(length<=0) { + /* no input, nothing to do */ + return; + } + + cnv=pArgs->converter; + + /* write the BOM if necessary */ + if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { + static const char bom[]={ (char)0xfeu, (char)0xffu }; + ucnv_fromUWriteBytes(cnv, + bom, 2, + &pArgs->target, pArgs->targetLimit, + &pArgs->offsets, -1, + pErrorCode); + cnv->fromUnicodeStatus=0; + } + + target=pArgs->target; + if(target >= pArgs->targetLimit) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return; + } + + targetCapacity=(uint32_t)(pArgs->targetLimit-target); + offsets=pArgs->offsets; + sourceIndex=0; + + /* c!=0 indicates in several places outside the main loops that a surrogate was found */ + + if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { + /* the last buffer ended with a lead surrogate, output the surrogate pair */ + ++source; + --length; + target[0]=(uint8_t)(c>>8); + target[1]=(uint8_t)c; + target[2]=(uint8_t)(trail>>8); + target[3]=(uint8_t)trail; + target+=4; + targetCapacity-=4; + if(offsets!=NULL) { + *offsets++=-1; + *offsets++=-1; + *offsets++=-1; + *offsets++=-1; + } + sourceIndex=1; + cnv->fromUChar32=c=0; + } + + if(c==0) { + /* copy an even number of bytes for complete UChars */ + uint32_t count=2*length; + if(count>targetCapacity) { + count=targetCapacity&~1; + } + /* count is even */ + targetCapacity-=count; + count>>=1; + length-=count; + + if(offsets==NULL) { + while(count>0) { + c=*source++; + if(U16_IS_SINGLE(c)) { + target[0]=(uint8_t)(c>>8); + target[1]=(uint8_t)c; + target+=2; + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { + ++source; + --count; + target[0]=(uint8_t)(c>>8); + target[1]=(uint8_t)c; + target[2]=(uint8_t)(trail>>8); + target[3]=(uint8_t)trail; + target+=4; + } else { + break; + } + --count; + } + } else { + while(count>0) { + c=*source++; + if(U16_IS_SINGLE(c)) { + target[0]=(uint8_t)(c>>8); + target[1]=(uint8_t)c; + target+=2; + *offsets++=sourceIndex; + *offsets++=sourceIndex++; + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { + ++source; + --count; + target[0]=(uint8_t)(c>>8); + target[1]=(uint8_t)c; + target[2]=(uint8_t)(trail>>8); + target[3]=(uint8_t)trail; + target+=4; + *offsets++=sourceIndex; + *offsets++=sourceIndex; + *offsets++=sourceIndex; + *offsets++=sourceIndex; + sourceIndex+=2; + } else { + break; + } + --count; + } + } + + if(count==0) { + /* done with the loop for complete UChars */ + if(length>0 && targetCapacity>0) { + /* + * there is more input and some target capacity - + * it must be targetCapacity==1 because otherwise + * the above would have copied more; + * prepare for overflow output + */ + if(U16_IS_SINGLE(c=*source++)) { + overflow[0]=(char)(c>>8); + overflow[1]=(char)c; + length=2; /* 2 bytes to output */ + c=0; + /* } else { keep c for surrogate handling, length will be set there */ + } + } else { + length=0; + c=0; + } + } else { + /* keep c for surrogate handling, length will be set there */ + targetCapacity+=2*count; + } + } else { + length=0; /* from here on, length counts the bytes in overflow[] */ + } + + if(c!=0) { + /* + * c is a surrogate, and + * - source or target too short + * - or the surrogate is unmatched + */ + length=0; + if(U16_IS_SURROGATE_LEAD(c)) { + if(source<pArgs->sourceLimit) { + if(U16_IS_TRAIL(trail=*source)) { + /* output the surrogate pair, will overflow (see conditions comment above) */ + ++source; + overflow[0]=(char)(c>>8); + overflow[1]=(char)c; + overflow[2]=(char)(trail>>8); + overflow[3]=(char)trail; + length=4; /* 4 bytes to output */ + c=0; + } else { + /* unmatched lead surrogate */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + } else { + /* see if the trail surrogate is in the next buffer */ + } + } else { + /* unmatched trail surrogate */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + cnv->fromUChar32=c; + } + + if(length>0) { + /* output length bytes with overflow (length>targetCapacity>0) */ + ucnv_fromUWriteBytes(cnv, + overflow, length, + (char **)&target, pArgs->targetLimit, + &offsets, sourceIndex, + pErrorCode); + targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); + } + + if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + + /* write back the updated pointers */ + pArgs->source=source; + pArgs->target=(char *)target; + pArgs->offsets=offsets; +} + +static void U_CALLCONV +_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const uint8_t *source; + UChar *target; + int32_t *offsets; + + uint32_t targetCapacity, length, count, sourceIndex; + UChar c, trail; + + if(pArgs->converter->mode<8) { + _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); + return; + } + + cnv=pArgs->converter; + source=(const uint8_t *)pArgs->source; + length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); + if(length<=0 && cnv->toUnicodeStatus==0) { + /* no input, nothing to do */ + return; + } + + target=pArgs->target; + if(target >= pArgs->targetLimit) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return; + } + + targetCapacity=(uint32_t)(pArgs->targetLimit-target); + offsets=pArgs->offsets; + sourceIndex=0; + c=0; + + /* complete a partial UChar or pair from the last call */ + if(cnv->toUnicodeStatus!=0) { + /* + * special case: single byte from a previous buffer, + * where the byte turned out not to belong to a trail surrogate + * and the preceding, unmatched lead surrogate was put into toUBytes[] + * for error handling + */ + cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; + cnv->toULength=1; + cnv->toUnicodeStatus=0; + } + if((count=cnv->toULength)!=0) { + uint8_t *p=cnv->toUBytes; + do { + p[count++]=*source++; + ++sourceIndex; + --length; + if(count==2) { + c=((UChar)p[0]<<8)|p[1]; + if(U16_IS_SINGLE(c)) { + /* output the BMP code point */ + *target++=c; + if(offsets!=NULL) { + *offsets++=-1; + } + --targetCapacity; + count=0; + c=0; + break; + } else if(U16_IS_SURROGATE_LEAD(c)) { + /* continue collecting bytes for the trail surrogate */ + c=0; /* avoid unnecessary surrogate handling below */ + } else { + /* fall through to error handling for an unmatched trail surrogate */ + break; + } + } else if(count==4) { + c=((UChar)p[0]<<8)|p[1]; + trail=((UChar)p[2]<<8)|p[3]; + if(U16_IS_TRAIL(trail)) { + /* output the surrogate pair */ + *target++=c; + if(targetCapacity>=2) { + *target++=trail; + if(offsets!=NULL) { + *offsets++=-1; + *offsets++=-1; + } + targetCapacity-=2; + } else /* targetCapacity==1 */ { + targetCapacity=0; + cnv->UCharErrorBuffer[0]=trail; + cnv->UCharErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + count=0; + c=0; + break; + } else { + /* unmatched lead surrogate, handle here for consistent toUBytes[] */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + + /* back out reading the code unit after it */ + if(((const uint8_t *)pArgs->source-source)>=2) { + source-=2; + } else { + /* + * if the trail unit's first byte was in a previous buffer, then + * we need to put it into a special place because toUBytes[] will be + * used for the lead unit's bytes + */ + cnv->toUnicodeStatus=0x100|p[2]; + --source; + } + cnv->toULength=2; + + /* write back the updated pointers */ + pArgs->source=(const char *)source; + pArgs->target=target; + pArgs->offsets=offsets; + return; + } + } + } while(length>0); + cnv->toULength=(int8_t)count; + } + + /* copy an even number of bytes for complete UChars */ + count=2*targetCapacity; + if(count>length) { + count=length&~1; + } + if(c==0 && count>0) { + length-=count; + count>>=1; + targetCapacity-=count; + if(offsets==NULL) { + do { + c=((UChar)source[0]<<8)|source[1]; + source+=2; + if(U16_IS_SINGLE(c)) { + *target++=c; + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && + U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) + ) { + source+=2; + --count; + *target++=c; + *target++=trail; + } else { + break; + } + } while(--count>0); + } else { + do { + c=((UChar)source[0]<<8)|source[1]; + source+=2; + if(U16_IS_SINGLE(c)) { + *target++=c; + *offsets++=sourceIndex; + sourceIndex+=2; + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && + U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) + ) { + source+=2; + --count; + *target++=c; + *target++=trail; + *offsets++=sourceIndex; + *offsets++=sourceIndex; + sourceIndex+=4; + } else { + break; + } + } while(--count>0); + } + + if(count==0) { + /* done with the loop for complete UChars */ + c=0; + } else { + /* keep c for surrogate handling, trail will be set there */ + length+=2*(count-1); /* one more byte pair was consumed than count decremented */ + targetCapacity+=count; + } + } + + if(c!=0) { + /* + * c is a surrogate, and + * - source or target too short + * - or the surrogate is unmatched + */ + cnv->toUBytes[0]=(uint8_t)(c>>8); + cnv->toUBytes[1]=(uint8_t)c; + cnv->toULength=2; + + if(U16_IS_SURROGATE_LEAD(c)) { + if(length>=2) { + if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { + /* output the surrogate pair, will overflow (see conditions comment above) */ + source+=2; + length-=2; + *target++=c; + if(offsets!=NULL) { + *offsets++=sourceIndex; + } + cnv->UCharErrorBuffer[0]=trail; + cnv->UCharErrorBufferLength=1; + cnv->toULength=0; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } else { + /* unmatched lead surrogate */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + } else { + /* see if the trail surrogate is in the next buffer */ + } + } else { + /* unmatched trail surrogate */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + } + + if(U_SUCCESS(*pErrorCode)) { + /* check for a remaining source byte */ + if(length>0) { + if(targetCapacity==0) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } else { + /* it must be length==1 because otherwise the above would have copied more */ + cnv->toUBytes[cnv->toULength++]=*source++; + } + } + } + + /* write back the updated pointers */ + pArgs->source=(const char *)source; + pArgs->target=target; + pArgs->offsets=offsets; +} + +static UChar32 U_CALLCONV +_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { + const uint8_t *s, *sourceLimit; + UChar32 c; + + if(pArgs->converter->mode<8) { + return UCNV_GET_NEXT_UCHAR_USE_TO_U; + } + + s=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + + if(s>=sourceLimit) { + /* no input */ + *err=U_INDEX_OUTOFBOUNDS_ERROR; + return 0xffff; + } + + if(s+2>sourceLimit) { + /* only one byte: truncated UChar */ + pArgs->converter->toUBytes[0]=*s++; + pArgs->converter->toULength=1; + pArgs->source=(const char *)s; + *err = U_TRUNCATED_CHAR_FOUND; + return 0xffff; + } + + /* get one UChar */ + c=((UChar32)*s<<8)|s[1]; + s+=2; + + /* check for a surrogate pair */ + if(U_IS_SURROGATE(c)) { + if(U16_IS_SURROGATE_LEAD(c)) { + if(s+2<=sourceLimit) { + UChar trail; + + /* get a second UChar and see if it is a trail surrogate */ + trail=((UChar)*s<<8)|s[1]; + if(U16_IS_TRAIL(trail)) { + c=U16_GET_SUPPLEMENTARY(c, trail); + s+=2; + } else { + /* unmatched lead surrogate */ + c=-2; + } + } else { + /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ + uint8_t *bytes=pArgs->converter->toUBytes; + s-=2; + pArgs->converter->toULength=(int8_t)(sourceLimit-s); + do { + *bytes++=*s++; + } while(s<sourceLimit); + + c=0xffff; + *err=U_TRUNCATED_CHAR_FOUND; + } + } else { + /* unmatched trail surrogate */ + c=-2; + } + + if(c<0) { + /* write the unmatched surrogate */ + uint8_t *bytes=pArgs->converter->toUBytes; + pArgs->converter->toULength=2; + *bytes=*(s-2); + bytes[1]=*(s-1); + + c=0xffff; + *err=U_ILLEGAL_CHAR_FOUND; + } + } + + pArgs->source=(const char *)s; + return c; } - -static void U_CALLCONV -_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const uint8_t *source; - UChar *target; - int32_t *offsets; - - uint32_t targetCapacity, length, count, sourceIndex; - UChar c, trail; - - if(pArgs->converter->mode<8) { - _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); - return; - } - - cnv=pArgs->converter; - source=(const uint8_t *)pArgs->source; - length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); - if(length<=0 && cnv->toUnicodeStatus==0) { - /* no input, nothing to do */ - return; - } - - target=pArgs->target; - if(target >= pArgs->targetLimit) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - return; - } - - targetCapacity=(uint32_t)(pArgs->targetLimit-target); - offsets=pArgs->offsets; - sourceIndex=0; - c=0; - - /* complete a partial UChar or pair from the last call */ - if(cnv->toUnicodeStatus!=0) { - /* - * special case: single byte from a previous buffer, - * where the byte turned out not to belong to a trail surrogate - * and the preceding, unmatched lead surrogate was put into toUBytes[] - * for error handling - */ - cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; - cnv->toULength=1; - cnv->toUnicodeStatus=0; - } - if((count=cnv->toULength)!=0) { - uint8_t *p=cnv->toUBytes; - do { - p[count++]=*source++; - ++sourceIndex; - --length; - if(count==2) { - c=((UChar)p[0]<<8)|p[1]; - if(U16_IS_SINGLE(c)) { - /* output the BMP code point */ - *target++=c; - if(offsets!=NULL) { - *offsets++=-1; - } - --targetCapacity; - count=0; - c=0; - break; - } else if(U16_IS_SURROGATE_LEAD(c)) { - /* continue collecting bytes for the trail surrogate */ - c=0; /* avoid unnecessary surrogate handling below */ - } else { - /* fall through to error handling for an unmatched trail surrogate */ - break; - } - } else if(count==4) { - c=((UChar)p[0]<<8)|p[1]; - trail=((UChar)p[2]<<8)|p[3]; - if(U16_IS_TRAIL(trail)) { - /* output the surrogate pair */ - *target++=c; - if(targetCapacity>=2) { - *target++=trail; - if(offsets!=NULL) { - *offsets++=-1; - *offsets++=-1; - } - targetCapacity-=2; - } else /* targetCapacity==1 */ { - targetCapacity=0; - cnv->UCharErrorBuffer[0]=trail; - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } - count=0; - c=0; - break; - } else { - /* unmatched lead surrogate, handle here for consistent toUBytes[] */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - - /* back out reading the code unit after it */ - if(((const uint8_t *)pArgs->source-source)>=2) { - source-=2; - } else { - /* - * if the trail unit's first byte was in a previous buffer, then - * we need to put it into a special place because toUBytes[] will be - * used for the lead unit's bytes - */ - cnv->toUnicodeStatus=0x100|p[2]; - --source; - } - cnv->toULength=2; - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - pArgs->offsets=offsets; - return; - } - } - } while(length>0); - cnv->toULength=(int8_t)count; - } - - /* copy an even number of bytes for complete UChars */ - count=2*targetCapacity; - if(count>length) { - count=length&~1; - } - if(c==0 && count>0) { - length-=count; - count>>=1; - targetCapacity-=count; - if(offsets==NULL) { - do { - c=((UChar)source[0]<<8)|source[1]; - source+=2; - if(U16_IS_SINGLE(c)) { - *target++=c; - } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && - U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) - ) { - source+=2; - --count; - *target++=c; - *target++=trail; - } else { - break; - } - } while(--count>0); - } else { - do { - c=((UChar)source[0]<<8)|source[1]; - source+=2; - if(U16_IS_SINGLE(c)) { - *target++=c; - *offsets++=sourceIndex; - sourceIndex+=2; - } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && - U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) - ) { - source+=2; - --count; - *target++=c; - *target++=trail; - *offsets++=sourceIndex; - *offsets++=sourceIndex; - sourceIndex+=4; - } else { - break; - } - } while(--count>0); - } - - if(count==0) { - /* done with the loop for complete UChars */ - c=0; - } else { - /* keep c for surrogate handling, trail will be set there */ - length+=2*(count-1); /* one more byte pair was consumed than count decremented */ - targetCapacity+=count; - } - } - - if(c!=0) { - /* - * c is a surrogate, and - * - source or target too short - * - or the surrogate is unmatched - */ - cnv->toUBytes[0]=(uint8_t)(c>>8); - cnv->toUBytes[1]=(uint8_t)c; - cnv->toULength=2; - - if(U16_IS_SURROGATE_LEAD(c)) { - if(length>=2) { - if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { - /* output the surrogate pair, will overflow (see conditions comment above) */ - source+=2; - length-=2; - *target++=c; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - cnv->UCharErrorBuffer[0]=trail; - cnv->UCharErrorBufferLength=1; - cnv->toULength=0; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } else { - /* unmatched lead surrogate */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - } else { - /* see if the trail surrogate is in the next buffer */ - } - } else { - /* unmatched trail surrogate */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - } - - if(U_SUCCESS(*pErrorCode)) { - /* check for a remaining source byte */ - if(length>0) { - if(targetCapacity==0) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } else { - /* it must be length==1 because otherwise the above would have copied more */ - cnv->toUBytes[cnv->toULength++]=*source++; - } - } - } - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - pArgs->offsets=offsets; + +static void U_CALLCONV +_UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) { + if(choice<=UCNV_RESET_TO_UNICODE) { + /* reset toUnicode state */ + if(UCNV_GET_VERSION(cnv)==0) { + cnv->mode=8; /* no BOM handling */ + } else { + cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */ + } + } + if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { + /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */ + cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; + } +} + +static void U_CALLCONV +_UTF16BEOpen(UConverter *cnv, + UConverterLoadArgs *pArgs, + UErrorCode *pErrorCode) { + (void)pArgs; + if(UCNV_GET_VERSION(cnv)<=1) { + _UTF16BEReset(cnv, UCNV_RESET_BOTH); + } else { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + } +} + +static const char * U_CALLCONV +_UTF16BEGetName(const UConverter *cnv) { + if(UCNV_GET_VERSION(cnv)==0) { + return "UTF-16BE"; + } else { + return "UTF-16BE,version=1"; + } +} +U_CDECL_END + +static const UConverterImpl _UTF16BEImpl={ + UCNV_UTF16_BigEndian, + + NULL, + NULL, + + _UTF16BEOpen, + NULL, + _UTF16BEReset, + + _UTF16BEToUnicodeWithOffsets, + _UTF16BEToUnicodeWithOffsets, + _UTF16BEFromUnicodeWithOffsets, + _UTF16BEFromUnicodeWithOffsets, + _UTF16BEGetNextUChar, + + NULL, + _UTF16BEGetName, + NULL, + NULL, + ucnv_getNonSurrogateUnicodeSet, + + NULL, + NULL +}; + +static const UConverterStaticData _UTF16BEStaticData={ + sizeof(UConverterStaticData), + "UTF-16BE", + 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, + { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + + +const UConverterSharedData _UTF16BEData= + UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl); + +/* UTF-16LE ----------------------------------------------------------------- */ +U_CDECL_BEGIN +static void U_CALLCONV +_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const UChar *source; + char *target; + int32_t *offsets; + + uint32_t targetCapacity, length, sourceIndex; + UChar c, trail; + char overflow[4]; + + source=pArgs->source; + length=(int32_t)(pArgs->sourceLimit-source); + if(length<=0) { + /* no input, nothing to do */ + return; + } + + cnv=pArgs->converter; + + /* write the BOM if necessary */ + if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { + static const char bom[]={ (char)0xffu, (char)0xfeu }; + ucnv_fromUWriteBytes(cnv, + bom, 2, + &pArgs->target, pArgs->targetLimit, + &pArgs->offsets, -1, + pErrorCode); + cnv->fromUnicodeStatus=0; + } + + target=pArgs->target; + if(target >= pArgs->targetLimit) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return; + } + + targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); + offsets=pArgs->offsets; + sourceIndex=0; + + /* c!=0 indicates in several places outside the main loops that a surrogate was found */ + + if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { + /* the last buffer ended with a lead surrogate, output the surrogate pair */ + ++source; + --length; + target[0]=(uint8_t)c; + target[1]=(uint8_t)(c>>8); + target[2]=(uint8_t)trail; + target[3]=(uint8_t)(trail>>8); + target+=4; + targetCapacity-=4; + if(offsets!=NULL) { + *offsets++=-1; + *offsets++=-1; + *offsets++=-1; + *offsets++=-1; + } + sourceIndex=1; + cnv->fromUChar32=c=0; + } + + if(c==0) { + /* copy an even number of bytes for complete UChars */ + uint32_t count=2*length; + if(count>targetCapacity) { + count=targetCapacity&~1; + } + /* count is even */ + targetCapacity-=count; + count>>=1; + length-=count; + + if(offsets==NULL) { + while(count>0) { + c=*source++; + if(U16_IS_SINGLE(c)) { + target[0]=(uint8_t)c; + target[1]=(uint8_t)(c>>8); + target+=2; + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { + ++source; + --count; + target[0]=(uint8_t)c; + target[1]=(uint8_t)(c>>8); + target[2]=(uint8_t)trail; + target[3]=(uint8_t)(trail>>8); + target+=4; + } else { + break; + } + --count; + } + } else { + while(count>0) { + c=*source++; + if(U16_IS_SINGLE(c)) { + target[0]=(uint8_t)c; + target[1]=(uint8_t)(c>>8); + target+=2; + *offsets++=sourceIndex; + *offsets++=sourceIndex++; + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { + ++source; + --count; + target[0]=(uint8_t)c; + target[1]=(uint8_t)(c>>8); + target[2]=(uint8_t)trail; + target[3]=(uint8_t)(trail>>8); + target+=4; + *offsets++=sourceIndex; + *offsets++=sourceIndex; + *offsets++=sourceIndex; + *offsets++=sourceIndex; + sourceIndex+=2; + } else { + break; + } + --count; + } + } + + if(count==0) { + /* done with the loop for complete UChars */ + if(length>0 && targetCapacity>0) { + /* + * there is more input and some target capacity - + * it must be targetCapacity==1 because otherwise + * the above would have copied more; + * prepare for overflow output + */ + if(U16_IS_SINGLE(c=*source++)) { + overflow[0]=(char)c; + overflow[1]=(char)(c>>8); + length=2; /* 2 bytes to output */ + c=0; + /* } else { keep c for surrogate handling, length will be set there */ + } + } else { + length=0; + c=0; + } + } else { + /* keep c for surrogate handling, length will be set there */ + targetCapacity+=2*count; + } + } else { + length=0; /* from here on, length counts the bytes in overflow[] */ + } + + if(c!=0) { + /* + * c is a surrogate, and + * - source or target too short + * - or the surrogate is unmatched + */ + length=0; + if(U16_IS_SURROGATE_LEAD(c)) { + if(source<pArgs->sourceLimit) { + if(U16_IS_TRAIL(trail=*source)) { + /* output the surrogate pair, will overflow (see conditions comment above) */ + ++source; + overflow[0]=(char)c; + overflow[1]=(char)(c>>8); + overflow[2]=(char)trail; + overflow[3]=(char)(trail>>8); + length=4; /* 4 bytes to output */ + c=0; + } else { + /* unmatched lead surrogate */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + } else { + /* see if the trail surrogate is in the next buffer */ + } + } else { + /* unmatched trail surrogate */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + cnv->fromUChar32=c; + } + + if(length>0) { + /* output length bytes with overflow (length>targetCapacity>0) */ + ucnv_fromUWriteBytes(cnv, + overflow, length, + &target, pArgs->targetLimit, + &offsets, sourceIndex, + pErrorCode); + targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); + } + + if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + + /* write back the updated pointers */ + pArgs->source=source; + pArgs->target=target; + pArgs->offsets=offsets; +} + +static void U_CALLCONV +_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const uint8_t *source; + UChar *target; + int32_t *offsets; + + uint32_t targetCapacity, length, count, sourceIndex; + UChar c, trail; + + if(pArgs->converter->mode<8) { + _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); + return; + } + + cnv=pArgs->converter; + source=(const uint8_t *)pArgs->source; + length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); + if(length<=0 && cnv->toUnicodeStatus==0) { + /* no input, nothing to do */ + return; + } + + target=pArgs->target; + if(target >= pArgs->targetLimit) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return; + } + + targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); + offsets=pArgs->offsets; + sourceIndex=0; + c=0; + + /* complete a partial UChar or pair from the last call */ + if(cnv->toUnicodeStatus!=0) { + /* + * special case: single byte from a previous buffer, + * where the byte turned out not to belong to a trail surrogate + * and the preceding, unmatched lead surrogate was put into toUBytes[] + * for error handling + */ + cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; + cnv->toULength=1; + cnv->toUnicodeStatus=0; + } + if((count=cnv->toULength)!=0) { + uint8_t *p=cnv->toUBytes; + do { + p[count++]=*source++; + ++sourceIndex; + --length; + if(count==2) { + c=((UChar)p[1]<<8)|p[0]; + if(U16_IS_SINGLE(c)) { + /* output the BMP code point */ + *target++=c; + if(offsets!=NULL) { + *offsets++=-1; + } + --targetCapacity; + count=0; + c=0; + break; + } else if(U16_IS_SURROGATE_LEAD(c)) { + /* continue collecting bytes for the trail surrogate */ + c=0; /* avoid unnecessary surrogate handling below */ + } else { + /* fall through to error handling for an unmatched trail surrogate */ + break; + } + } else if(count==4) { + c=((UChar)p[1]<<8)|p[0]; + trail=((UChar)p[3]<<8)|p[2]; + if(U16_IS_TRAIL(trail)) { + /* output the surrogate pair */ + *target++=c; + if(targetCapacity>=2) { + *target++=trail; + if(offsets!=NULL) { + *offsets++=-1; + *offsets++=-1; + } + targetCapacity-=2; + } else /* targetCapacity==1 */ { + targetCapacity=0; + cnv->UCharErrorBuffer[0]=trail; + cnv->UCharErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + count=0; + c=0; + break; + } else { + /* unmatched lead surrogate, handle here for consistent toUBytes[] */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + + /* back out reading the code unit after it */ + if(((const uint8_t *)pArgs->source-source)>=2) { + source-=2; + } else { + /* + * if the trail unit's first byte was in a previous buffer, then + * we need to put it into a special place because toUBytes[] will be + * used for the lead unit's bytes + */ + cnv->toUnicodeStatus=0x100|p[2]; + --source; + } + cnv->toULength=2; + + /* write back the updated pointers */ + pArgs->source=(const char *)source; + pArgs->target=target; + pArgs->offsets=offsets; + return; + } + } + } while(length>0); + cnv->toULength=(int8_t)count; + } + + /* copy an even number of bytes for complete UChars */ + count=2*targetCapacity; + if(count>length) { + count=length&~1; + } + if(c==0 && count>0) { + length-=count; + count>>=1; + targetCapacity-=count; + if(offsets==NULL) { + do { + c=((UChar)source[1]<<8)|source[0]; + source+=2; + if(U16_IS_SINGLE(c)) { + *target++=c; + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && + U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) + ) { + source+=2; + --count; + *target++=c; + *target++=trail; + } else { + break; + } + } while(--count>0); + } else { + do { + c=((UChar)source[1]<<8)|source[0]; + source+=2; + if(U16_IS_SINGLE(c)) { + *target++=c; + *offsets++=sourceIndex; + sourceIndex+=2; + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && + U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) + ) { + source+=2; + --count; + *target++=c; + *target++=trail; + *offsets++=sourceIndex; + *offsets++=sourceIndex; + sourceIndex+=4; + } else { + break; + } + } while(--count>0); + } + + if(count==0) { + /* done with the loop for complete UChars */ + c=0; + } else { + /* keep c for surrogate handling, trail will be set there */ + length+=2*(count-1); /* one more byte pair was consumed than count decremented */ + targetCapacity+=count; + } + } + + if(c!=0) { + /* + * c is a surrogate, and + * - source or target too short + * - or the surrogate is unmatched + */ + cnv->toUBytes[0]=(uint8_t)c; + cnv->toUBytes[1]=(uint8_t)(c>>8); + cnv->toULength=2; + + if(U16_IS_SURROGATE_LEAD(c)) { + if(length>=2) { + if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { + /* output the surrogate pair, will overflow (see conditions comment above) */ + source+=2; + length-=2; + *target++=c; + if(offsets!=NULL) { + *offsets++=sourceIndex; + } + cnv->UCharErrorBuffer[0]=trail; + cnv->UCharErrorBufferLength=1; + cnv->toULength=0; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } else { + /* unmatched lead surrogate */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + } else { + /* see if the trail surrogate is in the next buffer */ + } + } else { + /* unmatched trail surrogate */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + } + + if(U_SUCCESS(*pErrorCode)) { + /* check for a remaining source byte */ + if(length>0) { + if(targetCapacity==0) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } else { + /* it must be length==1 because otherwise the above would have copied more */ + cnv->toUBytes[cnv->toULength++]=*source++; + } + } + } + + /* write back the updated pointers */ + pArgs->source=(const char *)source; + pArgs->target=target; + pArgs->offsets=offsets; +} + +static UChar32 U_CALLCONV +_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { + const uint8_t *s, *sourceLimit; + UChar32 c; + + if(pArgs->converter->mode<8) { + return UCNV_GET_NEXT_UCHAR_USE_TO_U; + } + + s=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + + if(s>=sourceLimit) { + /* no input */ + *err=U_INDEX_OUTOFBOUNDS_ERROR; + return 0xffff; + } + + if(s+2>sourceLimit) { + /* only one byte: truncated UChar */ + pArgs->converter->toUBytes[0]=*s++; + pArgs->converter->toULength=1; + pArgs->source=(const char *)s; + *err = U_TRUNCATED_CHAR_FOUND; + return 0xffff; + } + + /* get one UChar */ + c=((UChar32)s[1]<<8)|*s; + s+=2; + + /* check for a surrogate pair */ + if(U_IS_SURROGATE(c)) { + if(U16_IS_SURROGATE_LEAD(c)) { + if(s+2<=sourceLimit) { + UChar trail; + + /* get a second UChar and see if it is a trail surrogate */ + trail=((UChar)s[1]<<8)|*s; + if(U16_IS_TRAIL(trail)) { + c=U16_GET_SUPPLEMENTARY(c, trail); + s+=2; + } else { + /* unmatched lead surrogate */ + c=-2; + } + } else { + /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ + uint8_t *bytes=pArgs->converter->toUBytes; + s-=2; + pArgs->converter->toULength=(int8_t)(sourceLimit-s); + do { + *bytes++=*s++; + } while(s<sourceLimit); + + c=0xffff; + *err=U_TRUNCATED_CHAR_FOUND; + } + } else { + /* unmatched trail surrogate */ + c=-2; + } + + if(c<0) { + /* write the unmatched surrogate */ + uint8_t *bytes=pArgs->converter->toUBytes; + pArgs->converter->toULength=2; + *bytes=*(s-2); + bytes[1]=*(s-1); + + c=0xffff; + *err=U_ILLEGAL_CHAR_FOUND; + } + } + + pArgs->source=(const char *)s; + return c; } - -static UChar32 U_CALLCONV -_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { - const uint8_t *s, *sourceLimit; - UChar32 c; - - if(pArgs->converter->mode<8) { - return UCNV_GET_NEXT_UCHAR_USE_TO_U; - } - - s=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - - if(s>=sourceLimit) { - /* no input */ - *err=U_INDEX_OUTOFBOUNDS_ERROR; - return 0xffff; - } - - if(s+2>sourceLimit) { - /* only one byte: truncated UChar */ - pArgs->converter->toUBytes[0]=*s++; - pArgs->converter->toULength=1; - pArgs->source=(const char *)s; - *err = U_TRUNCATED_CHAR_FOUND; - return 0xffff; - } - - /* get one UChar */ - c=((UChar32)*s<<8)|s[1]; - s+=2; - - /* check for a surrogate pair */ - if(U_IS_SURROGATE(c)) { - if(U16_IS_SURROGATE_LEAD(c)) { - if(s+2<=sourceLimit) { - UChar trail; - - /* get a second UChar and see if it is a trail surrogate */ - trail=((UChar)*s<<8)|s[1]; - if(U16_IS_TRAIL(trail)) { - c=U16_GET_SUPPLEMENTARY(c, trail); - s+=2; - } else { - /* unmatched lead surrogate */ - c=-2; - } - } else { - /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ - uint8_t *bytes=pArgs->converter->toUBytes; - s-=2; - pArgs->converter->toULength=(int8_t)(sourceLimit-s); - do { - *bytes++=*s++; - } while(s<sourceLimit); - - c=0xffff; - *err=U_TRUNCATED_CHAR_FOUND; - } - } else { - /* unmatched trail surrogate */ - c=-2; - } - - if(c<0) { - /* write the unmatched surrogate */ - uint8_t *bytes=pArgs->converter->toUBytes; - pArgs->converter->toULength=2; - *bytes=*(s-2); - bytes[1]=*(s-1); - - c=0xffff; - *err=U_ILLEGAL_CHAR_FOUND; - } - } - - pArgs->source=(const char *)s; - return c; -} - -static void U_CALLCONV -_UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) { - if(choice<=UCNV_RESET_TO_UNICODE) { - /* reset toUnicode state */ - if(UCNV_GET_VERSION(cnv)==0) { - cnv->mode=8; /* no BOM handling */ - } else { - cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */ - } - } - if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { - /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */ - cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; - } -} - -static void U_CALLCONV -_UTF16BEOpen(UConverter *cnv, - UConverterLoadArgs *pArgs, - UErrorCode *pErrorCode) { - (void)pArgs; - if(UCNV_GET_VERSION(cnv)<=1) { - _UTF16BEReset(cnv, UCNV_RESET_BOTH); - } else { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - } -} - -static const char * U_CALLCONV -_UTF16BEGetName(const UConverter *cnv) { - if(UCNV_GET_VERSION(cnv)==0) { - return "UTF-16BE"; - } else { - return "UTF-16BE,version=1"; - } -} -U_CDECL_END - -static const UConverterImpl _UTF16BEImpl={ - UCNV_UTF16_BigEndian, - - NULL, - NULL, - - _UTF16BEOpen, - NULL, - _UTF16BEReset, - - _UTF16BEToUnicodeWithOffsets, - _UTF16BEToUnicodeWithOffsets, - _UTF16BEFromUnicodeWithOffsets, - _UTF16BEFromUnicodeWithOffsets, - _UTF16BEGetNextUChar, - - NULL, - _UTF16BEGetName, - NULL, - NULL, - ucnv_getNonSurrogateUnicodeSet, - - NULL, - NULL -}; - -static const UConverterStaticData _UTF16BEStaticData={ - sizeof(UConverterStaticData), - "UTF-16BE", - 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, - { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, - 0, - 0, - { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ -}; - - -const UConverterSharedData _UTF16BEData= - UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl); - -/* UTF-16LE ----------------------------------------------------------------- */ -U_CDECL_BEGIN -static void U_CALLCONV -_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const UChar *source; - char *target; - int32_t *offsets; - - uint32_t targetCapacity, length, sourceIndex; - UChar c, trail; - char overflow[4]; - - source=pArgs->source; - length=(int32_t)(pArgs->sourceLimit-source); - if(length<=0) { - /* no input, nothing to do */ - return; - } - - cnv=pArgs->converter; - - /* write the BOM if necessary */ - if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { - static const char bom[]={ (char)0xffu, (char)0xfeu }; - ucnv_fromUWriteBytes(cnv, - bom, 2, - &pArgs->target, pArgs->targetLimit, - &pArgs->offsets, -1, - pErrorCode); - cnv->fromUnicodeStatus=0; - } - - target=pArgs->target; - if(target >= pArgs->targetLimit) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - return; - } - - targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); - offsets=pArgs->offsets; - sourceIndex=0; - - /* c!=0 indicates in several places outside the main loops that a surrogate was found */ - - if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { - /* the last buffer ended with a lead surrogate, output the surrogate pair */ - ++source; - --length; - target[0]=(uint8_t)c; - target[1]=(uint8_t)(c>>8); - target[2]=(uint8_t)trail; - target[3]=(uint8_t)(trail>>8); - target+=4; - targetCapacity-=4; - if(offsets!=NULL) { - *offsets++=-1; - *offsets++=-1; - *offsets++=-1; - *offsets++=-1; - } - sourceIndex=1; - cnv->fromUChar32=c=0; - } - - if(c==0) { - /* copy an even number of bytes for complete UChars */ - uint32_t count=2*length; - if(count>targetCapacity) { - count=targetCapacity&~1; - } - /* count is even */ - targetCapacity-=count; - count>>=1; - length-=count; - - if(offsets==NULL) { - while(count>0) { - c=*source++; - if(U16_IS_SINGLE(c)) { - target[0]=(uint8_t)c; - target[1]=(uint8_t)(c>>8); - target+=2; - } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { - ++source; - --count; - target[0]=(uint8_t)c; - target[1]=(uint8_t)(c>>8); - target[2]=(uint8_t)trail; - target[3]=(uint8_t)(trail>>8); - target+=4; - } else { - break; - } - --count; - } - } else { - while(count>0) { - c=*source++; - if(U16_IS_SINGLE(c)) { - target[0]=(uint8_t)c; - target[1]=(uint8_t)(c>>8); - target+=2; - *offsets++=sourceIndex; - *offsets++=sourceIndex++; - } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { - ++source; - --count; - target[0]=(uint8_t)c; - target[1]=(uint8_t)(c>>8); - target[2]=(uint8_t)trail; - target[3]=(uint8_t)(trail>>8); - target+=4; - *offsets++=sourceIndex; - *offsets++=sourceIndex; - *offsets++=sourceIndex; - *offsets++=sourceIndex; - sourceIndex+=2; - } else { - break; - } - --count; - } - } - - if(count==0) { - /* done with the loop for complete UChars */ - if(length>0 && targetCapacity>0) { - /* - * there is more input and some target capacity - - * it must be targetCapacity==1 because otherwise - * the above would have copied more; - * prepare for overflow output - */ - if(U16_IS_SINGLE(c=*source++)) { - overflow[0]=(char)c; - overflow[1]=(char)(c>>8); - length=2; /* 2 bytes to output */ - c=0; - /* } else { keep c for surrogate handling, length will be set there */ - } - } else { - length=0; - c=0; - } - } else { - /* keep c for surrogate handling, length will be set there */ - targetCapacity+=2*count; - } - } else { - length=0; /* from here on, length counts the bytes in overflow[] */ - } - - if(c!=0) { - /* - * c is a surrogate, and - * - source or target too short - * - or the surrogate is unmatched - */ - length=0; - if(U16_IS_SURROGATE_LEAD(c)) { - if(source<pArgs->sourceLimit) { - if(U16_IS_TRAIL(trail=*source)) { - /* output the surrogate pair, will overflow (see conditions comment above) */ - ++source; - overflow[0]=(char)c; - overflow[1]=(char)(c>>8); - overflow[2]=(char)trail; - overflow[3]=(char)(trail>>8); - length=4; /* 4 bytes to output */ - c=0; - } else { - /* unmatched lead surrogate */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - } else { - /* see if the trail surrogate is in the next buffer */ - } - } else { - /* unmatched trail surrogate */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - cnv->fromUChar32=c; - } - - if(length>0) { - /* output length bytes with overflow (length>targetCapacity>0) */ - ucnv_fromUWriteBytes(cnv, - overflow, length, - &target, pArgs->targetLimit, - &offsets, sourceIndex, - pErrorCode); - targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); - } - - if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } - - /* write back the updated pointers */ - pArgs->source=source; - pArgs->target=target; - pArgs->offsets=offsets; -} - -static void U_CALLCONV -_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const uint8_t *source; - UChar *target; - int32_t *offsets; - - uint32_t targetCapacity, length, count, sourceIndex; - UChar c, trail; - - if(pArgs->converter->mode<8) { - _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); - return; - } - - cnv=pArgs->converter; - source=(const uint8_t *)pArgs->source; - length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); - if(length<=0 && cnv->toUnicodeStatus==0) { - /* no input, nothing to do */ - return; - } - - target=pArgs->target; - if(target >= pArgs->targetLimit) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - return; - } - - targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); - offsets=pArgs->offsets; - sourceIndex=0; - c=0; - - /* complete a partial UChar or pair from the last call */ - if(cnv->toUnicodeStatus!=0) { - /* - * special case: single byte from a previous buffer, - * where the byte turned out not to belong to a trail surrogate - * and the preceding, unmatched lead surrogate was put into toUBytes[] - * for error handling - */ - cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; - cnv->toULength=1; - cnv->toUnicodeStatus=0; - } - if((count=cnv->toULength)!=0) { - uint8_t *p=cnv->toUBytes; - do { - p[count++]=*source++; - ++sourceIndex; - --length; - if(count==2) { - c=((UChar)p[1]<<8)|p[0]; - if(U16_IS_SINGLE(c)) { - /* output the BMP code point */ - *target++=c; - if(offsets!=NULL) { - *offsets++=-1; - } - --targetCapacity; - count=0; - c=0; - break; - } else if(U16_IS_SURROGATE_LEAD(c)) { - /* continue collecting bytes for the trail surrogate */ - c=0; /* avoid unnecessary surrogate handling below */ - } else { - /* fall through to error handling for an unmatched trail surrogate */ - break; - } - } else if(count==4) { - c=((UChar)p[1]<<8)|p[0]; - trail=((UChar)p[3]<<8)|p[2]; - if(U16_IS_TRAIL(trail)) { - /* output the surrogate pair */ - *target++=c; - if(targetCapacity>=2) { - *target++=trail; - if(offsets!=NULL) { - *offsets++=-1; - *offsets++=-1; - } - targetCapacity-=2; - } else /* targetCapacity==1 */ { - targetCapacity=0; - cnv->UCharErrorBuffer[0]=trail; - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } - count=0; - c=0; - break; - } else { - /* unmatched lead surrogate, handle here for consistent toUBytes[] */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - - /* back out reading the code unit after it */ - if(((const uint8_t *)pArgs->source-source)>=2) { - source-=2; - } else { - /* - * if the trail unit's first byte was in a previous buffer, then - * we need to put it into a special place because toUBytes[] will be - * used for the lead unit's bytes - */ - cnv->toUnicodeStatus=0x100|p[2]; - --source; - } - cnv->toULength=2; - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - pArgs->offsets=offsets; - return; - } - } - } while(length>0); - cnv->toULength=(int8_t)count; - } - - /* copy an even number of bytes for complete UChars */ - count=2*targetCapacity; - if(count>length) { - count=length&~1; - } - if(c==0 && count>0) { - length-=count; - count>>=1; - targetCapacity-=count; - if(offsets==NULL) { - do { - c=((UChar)source[1]<<8)|source[0]; - source+=2; - if(U16_IS_SINGLE(c)) { - *target++=c; - } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && - U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) - ) { - source+=2; - --count; - *target++=c; - *target++=trail; - } else { - break; - } - } while(--count>0); - } else { - do { - c=((UChar)source[1]<<8)|source[0]; - source+=2; - if(U16_IS_SINGLE(c)) { - *target++=c; - *offsets++=sourceIndex; - sourceIndex+=2; - } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && - U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) - ) { - source+=2; - --count; - *target++=c; - *target++=trail; - *offsets++=sourceIndex; - *offsets++=sourceIndex; - sourceIndex+=4; - } else { - break; - } - } while(--count>0); - } - - if(count==0) { - /* done with the loop for complete UChars */ - c=0; - } else { - /* keep c for surrogate handling, trail will be set there */ - length+=2*(count-1); /* one more byte pair was consumed than count decremented */ - targetCapacity+=count; - } - } - - if(c!=0) { - /* - * c is a surrogate, and - * - source or target too short - * - or the surrogate is unmatched - */ - cnv->toUBytes[0]=(uint8_t)c; - cnv->toUBytes[1]=(uint8_t)(c>>8); - cnv->toULength=2; - - if(U16_IS_SURROGATE_LEAD(c)) { - if(length>=2) { - if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { - /* output the surrogate pair, will overflow (see conditions comment above) */ - source+=2; - length-=2; - *target++=c; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - cnv->UCharErrorBuffer[0]=trail; - cnv->UCharErrorBufferLength=1; - cnv->toULength=0; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } else { - /* unmatched lead surrogate */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - } else { - /* see if the trail surrogate is in the next buffer */ - } - } else { - /* unmatched trail surrogate */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - } - - if(U_SUCCESS(*pErrorCode)) { - /* check for a remaining source byte */ - if(length>0) { - if(targetCapacity==0) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } else { - /* it must be length==1 because otherwise the above would have copied more */ - cnv->toUBytes[cnv->toULength++]=*source++; - } - } - } - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - pArgs->offsets=offsets; -} - -static UChar32 U_CALLCONV -_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { - const uint8_t *s, *sourceLimit; - UChar32 c; - - if(pArgs->converter->mode<8) { - return UCNV_GET_NEXT_UCHAR_USE_TO_U; - } - - s=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - - if(s>=sourceLimit) { - /* no input */ - *err=U_INDEX_OUTOFBOUNDS_ERROR; - return 0xffff; - } - - if(s+2>sourceLimit) { - /* only one byte: truncated UChar */ - pArgs->converter->toUBytes[0]=*s++; - pArgs->converter->toULength=1; - pArgs->source=(const char *)s; - *err = U_TRUNCATED_CHAR_FOUND; - return 0xffff; - } - - /* get one UChar */ - c=((UChar32)s[1]<<8)|*s; - s+=2; - - /* check for a surrogate pair */ - if(U_IS_SURROGATE(c)) { - if(U16_IS_SURROGATE_LEAD(c)) { - if(s+2<=sourceLimit) { - UChar trail; - - /* get a second UChar and see if it is a trail surrogate */ - trail=((UChar)s[1]<<8)|*s; - if(U16_IS_TRAIL(trail)) { - c=U16_GET_SUPPLEMENTARY(c, trail); - s+=2; - } else { - /* unmatched lead surrogate */ - c=-2; - } - } else { - /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ - uint8_t *bytes=pArgs->converter->toUBytes; - s-=2; - pArgs->converter->toULength=(int8_t)(sourceLimit-s); - do { - *bytes++=*s++; - } while(s<sourceLimit); - - c=0xffff; - *err=U_TRUNCATED_CHAR_FOUND; - } - } else { - /* unmatched trail surrogate */ - c=-2; - } - - if(c<0) { - /* write the unmatched surrogate */ - uint8_t *bytes=pArgs->converter->toUBytes; - pArgs->converter->toULength=2; - *bytes=*(s-2); - bytes[1]=*(s-1); - - c=0xffff; - *err=U_ILLEGAL_CHAR_FOUND; - } - } - - pArgs->source=(const char *)s; - return c; -} - -static void U_CALLCONV -_UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) { - if(choice<=UCNV_RESET_TO_UNICODE) { - /* reset toUnicode state */ - if(UCNV_GET_VERSION(cnv)==0) { - cnv->mode=8; /* no BOM handling */ - } else { - cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */ - } - } - if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { - /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */ - cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; - } -} - -static void U_CALLCONV -_UTF16LEOpen(UConverter *cnv, - UConverterLoadArgs *pArgs, - UErrorCode *pErrorCode) { - (void)pArgs; - if(UCNV_GET_VERSION(cnv)<=1) { - _UTF16LEReset(cnv, UCNV_RESET_BOTH); - } else { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - } -} - -static const char * U_CALLCONV -_UTF16LEGetName(const UConverter *cnv) { - if(UCNV_GET_VERSION(cnv)==0) { - return "UTF-16LE"; - } else { - return "UTF-16LE,version=1"; - } -} -U_CDECL_END - -static const UConverterImpl _UTF16LEImpl={ - UCNV_UTF16_LittleEndian, - - NULL, - NULL, - - _UTF16LEOpen, - NULL, - _UTF16LEReset, - - _UTF16LEToUnicodeWithOffsets, - _UTF16LEToUnicodeWithOffsets, - _UTF16LEFromUnicodeWithOffsets, - _UTF16LEFromUnicodeWithOffsets, - _UTF16LEGetNextUChar, - - NULL, - _UTF16LEGetName, - NULL, - NULL, - ucnv_getNonSurrogateUnicodeSet, - - NULL, - NULL -}; - - -static const UConverterStaticData _UTF16LEStaticData={ - sizeof(UConverterStaticData), - "UTF-16LE", - 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, - { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, - 0, - 0, - { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ -}; - - -const UConverterSharedData _UTF16LEData= - UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl); - -/* UTF-16 (Detect BOM) ------------------------------------------------------ */ - -/* - * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE - * accordingly. - * This is a simpler version of the UTF-32 converter, with - * fewer states for shorter BOMs. - * - * State values: - * 0 initial state - * 1 saw first byte - * 2..5 - - * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1 - * 8 UTF-16BE mode - * 9 UTF-16LE mode - * - * During detection: state==number of initial bytes seen so far. - * - * On output, emit U+FEFF as the first code point. - * - * Variants: - * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error. - * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and - * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error. - */ -U_CDECL_BEGIN -static void U_CALLCONV -_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { - if(choice<=UCNV_RESET_TO_UNICODE) { - /* reset toUnicode: state=0 */ - cnv->mode=0; - } - if(choice!=UCNV_RESET_TO_UNICODE) { - /* reset fromUnicode: prepare to output the UTF-16PE BOM */ - cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; - } -} -U_CDECL_END -extern const UConverterSharedData _UTF16v2Data; -U_CDECL_BEGIN -static void U_CALLCONV -_UTF16Open(UConverter *cnv, - UConverterLoadArgs *pArgs, - UErrorCode *pErrorCode) { - if(UCNV_GET_VERSION(cnv)<=2) { - if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) { - /* - * Switch implementation, and switch the staticData that's different - * and was copied into the UConverter. - * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.) - * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream. - */ - cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data; - uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN); - } - _UTF16Reset(cnv, UCNV_RESET_BOTH); - } else { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - } -} - -static const char * U_CALLCONV -_UTF16GetName(const UConverter *cnv) { - if(UCNV_GET_VERSION(cnv)==0) { - return "UTF-16"; - } else if(UCNV_GET_VERSION(cnv)==1) { - return "UTF-16,version=1"; - } else { - return "UTF-16,version=2"; - } -} -U_CDECL_END -extern const UConverterSharedData _UTF16Data; - -static inline bool IS_UTF16BE(const UConverter *cnv) { - return ((cnv)->sharedData == &_UTF16BEData); -} - -static inline bool IS_UTF16LE(const UConverter *cnv) { - return ((cnv)->sharedData == &_UTF16LEData); -} - -static inline bool IS_UTF16(const UConverter *cnv) { - return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data); -} - -U_CDECL_BEGIN -static void U_CALLCONV -_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv=pArgs->converter; - const char *source=pArgs->source; - const char *sourceLimit=pArgs->sourceLimit; - int32_t *offsets=pArgs->offsets; - - int32_t state, offsetDelta; - uint8_t b; - - state=cnv->mode; - - /* - * If we detect a BOM in this buffer, then we must add the BOM size to the - * offsets because the actual converter function will not see and count the BOM. - * offsetDelta will have the number of the BOM bytes that are in the current buffer. - */ - offsetDelta=0; - - while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { - switch(state) { - case 0: - cnv->toUBytes[0]=(uint8_t)*source++; - cnv->toULength=1; - state=1; - break; - case 1: - /* - * Only inside this switch case can the state variable - * temporarily take two additional values: - * 6: BOM error, continue with BE - * 7: BOM error, continue with LE - */ - b=*source; - if(cnv->toUBytes[0]==0xfe && b==0xff) { - if(IS_UTF16LE(cnv)) { - state=7; /* illegal reverse BOM for Java "UnicodeLittle" */ - } else { - state=8; /* detect UTF-16BE */ - } - } else if(cnv->toUBytes[0]==0xff && b==0xfe) { - if(IS_UTF16BE(cnv)) { - state=6; /* illegal reverse BOM for Java "UnicodeBig" */ - } else { - state=9; /* detect UTF-16LE */ - } - } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) { - state=6; /* illegal missing BOM for Java "Unicode" */ - } - if(state>=8) { - /* BOM detected, consume it */ - ++source; - cnv->toULength=0; - offsetDelta=(int32_t)(source-pArgs->source); - } else if(state<6) { - /* ok: no BOM, and not a reverse BOM */ - if(source!=pArgs->source) { - /* reset the source for a correct first offset */ - source=pArgs->source; - cnv->toULength=0; - } - if(IS_UTF16LE(cnv)) { - /* Make Java "UnicodeLittle" default to LE. */ - state=9; - } else { - /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */ - state=8; - } - } else { - /* - * error: missing BOM, or reverse BOM - * UTF-16,version=1: Java-specific "Unicode" requires a BOM. - * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM. - * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM. - */ - /* report the non-BOM or reverse BOM as an illegal sequence */ - cnv->toUBytes[1]=b; - cnv->toULength=2; - pArgs->source=source+1; - /* continue with conversion if the callback resets the error */ - /* - * Make Java "Unicode" default to BE like standard UTF-16. - * Make Java "UnicodeBig" and "UnicodeLittle" default - * to their normal endiannesses. - */ - cnv->mode=state+2; - *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; - return; - } - /* convert the rest of the stream */ - cnv->mode=state; - continue; - case 8: - /* call UTF-16BE */ - pArgs->source=source; - _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); - source=pArgs->source; - break; - case 9: - /* call UTF-16LE */ - pArgs->source=source; - _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); - source=pArgs->source; - break; - default: - break; /* does not occur */ - } - } - - /* add BOM size to offsets - see comment at offsetDelta declaration */ - if(offsets!=NULL && offsetDelta!=0) { - int32_t *offsetsLimit=pArgs->offsets; - while(offsets<offsetsLimit) { - *offsets++ += offsetDelta; - } - } - - pArgs->source=source; - - if(source==sourceLimit && pArgs->flush) { - /* handle truncated input */ - switch(state) { - case 0: - break; /* no input at all, nothing to do */ - case 8: - _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); - break; - case 9: - _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); - break; - default: - /* 0<state<8: framework will report truncation, nothing to do here */ - break; - } - } - - cnv->mode=state; -} - -static UChar32 U_CALLCONV -_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - switch(pArgs->converter->mode) { - case 8: - return _UTF16BEGetNextUChar(pArgs, pErrorCode); - case 9: - return _UTF16LEGetNextUChar(pArgs, pErrorCode); - default: - return UCNV_GET_NEXT_UCHAR_USE_TO_U; - } -} -U_CDECL_END - -static const UConverterImpl _UTF16Impl = { - UCNV_UTF16, - - NULL, - NULL, - - _UTF16Open, - NULL, - _UTF16Reset, - - _UTF16ToUnicodeWithOffsets, - _UTF16ToUnicodeWithOffsets, - _UTF16PEFromUnicodeWithOffsets, - _UTF16PEFromUnicodeWithOffsets, - _UTF16GetNextUChar, - - NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ - _UTF16GetName, - NULL, - NULL, - ucnv_getNonSurrogateUnicodeSet, - - NULL, - NULL -}; - -static const UConverterStaticData _UTF16StaticData = { - sizeof(UConverterStaticData), - "UTF-16", - 1204, /* CCSID for BOM sensitive UTF-16 */ - UCNV_IBM, UCNV_UTF16, 2, 2, -#if U_IS_BIG_ENDIAN - { 0xff, 0xfd, 0, 0 }, 2, -#else - { 0xfd, 0xff, 0, 0 }, 2, -#endif - FALSE, FALSE, - 0, - 0, - { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ -}; - -const UConverterSharedData _UTF16Data = - UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl); - -static const UConverterImpl _UTF16v2Impl = { - UCNV_UTF16, - - NULL, - NULL, - - _UTF16Open, - NULL, - _UTF16Reset, - - _UTF16ToUnicodeWithOffsets, - _UTF16ToUnicodeWithOffsets, - _UTF16BEFromUnicodeWithOffsets, - _UTF16BEFromUnicodeWithOffsets, - _UTF16GetNextUChar, - - NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ - _UTF16GetName, - NULL, - NULL, - ucnv_getNonSurrogateUnicodeSet, - - NULL, - NULL -}; - -static const UConverterStaticData _UTF16v2StaticData = { - sizeof(UConverterStaticData), - "UTF-16,version=2", - 1204, /* CCSID for BOM sensitive UTF-16 */ - UCNV_IBM, UCNV_UTF16, 2, 2, - { 0xff, 0xfd, 0, 0 }, 2, - FALSE, FALSE, - 0, - 0, - { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ -}; - -const UConverterSharedData _UTF16v2Data = - UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl); - -#endif + +static void U_CALLCONV +_UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) { + if(choice<=UCNV_RESET_TO_UNICODE) { + /* reset toUnicode state */ + if(UCNV_GET_VERSION(cnv)==0) { + cnv->mode=8; /* no BOM handling */ + } else { + cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */ + } + } + if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { + /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */ + cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; + } +} + +static void U_CALLCONV +_UTF16LEOpen(UConverter *cnv, + UConverterLoadArgs *pArgs, + UErrorCode *pErrorCode) { + (void)pArgs; + if(UCNV_GET_VERSION(cnv)<=1) { + _UTF16LEReset(cnv, UCNV_RESET_BOTH); + } else { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + } +} + +static const char * U_CALLCONV +_UTF16LEGetName(const UConverter *cnv) { + if(UCNV_GET_VERSION(cnv)==0) { + return "UTF-16LE"; + } else { + return "UTF-16LE,version=1"; + } +} +U_CDECL_END + +static const UConverterImpl _UTF16LEImpl={ + UCNV_UTF16_LittleEndian, + + NULL, + NULL, + + _UTF16LEOpen, + NULL, + _UTF16LEReset, + + _UTF16LEToUnicodeWithOffsets, + _UTF16LEToUnicodeWithOffsets, + _UTF16LEFromUnicodeWithOffsets, + _UTF16LEFromUnicodeWithOffsets, + _UTF16LEGetNextUChar, + + NULL, + _UTF16LEGetName, + NULL, + NULL, + ucnv_getNonSurrogateUnicodeSet, + + NULL, + NULL +}; + + +static const UConverterStaticData _UTF16LEStaticData={ + sizeof(UConverterStaticData), + "UTF-16LE", + 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, + { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + + +const UConverterSharedData _UTF16LEData= + UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl); + +/* UTF-16 (Detect BOM) ------------------------------------------------------ */ + +/* + * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE + * accordingly. + * This is a simpler version of the UTF-32 converter, with + * fewer states for shorter BOMs. + * + * State values: + * 0 initial state + * 1 saw first byte + * 2..5 - + * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1 + * 8 UTF-16BE mode + * 9 UTF-16LE mode + * + * During detection: state==number of initial bytes seen so far. + * + * On output, emit U+FEFF as the first code point. + * + * Variants: + * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error. + * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and + * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error. + */ +U_CDECL_BEGIN +static void U_CALLCONV +_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { + if(choice<=UCNV_RESET_TO_UNICODE) { + /* reset toUnicode: state=0 */ + cnv->mode=0; + } + if(choice!=UCNV_RESET_TO_UNICODE) { + /* reset fromUnicode: prepare to output the UTF-16PE BOM */ + cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; + } +} +U_CDECL_END +extern const UConverterSharedData _UTF16v2Data; +U_CDECL_BEGIN +static void U_CALLCONV +_UTF16Open(UConverter *cnv, + UConverterLoadArgs *pArgs, + UErrorCode *pErrorCode) { + if(UCNV_GET_VERSION(cnv)<=2) { + if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) { + /* + * Switch implementation, and switch the staticData that's different + * and was copied into the UConverter. + * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.) + * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream. + */ + cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data; + uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN); + } + _UTF16Reset(cnv, UCNV_RESET_BOTH); + } else { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + } +} + +static const char * U_CALLCONV +_UTF16GetName(const UConverter *cnv) { + if(UCNV_GET_VERSION(cnv)==0) { + return "UTF-16"; + } else if(UCNV_GET_VERSION(cnv)==1) { + return "UTF-16,version=1"; + } else { + return "UTF-16,version=2"; + } +} +U_CDECL_END +extern const UConverterSharedData _UTF16Data; + +static inline bool IS_UTF16BE(const UConverter *cnv) { + return ((cnv)->sharedData == &_UTF16BEData); +} + +static inline bool IS_UTF16LE(const UConverter *cnv) { + return ((cnv)->sharedData == &_UTF16LEData); +} + +static inline bool IS_UTF16(const UConverter *cnv) { + return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data); +} + +U_CDECL_BEGIN +static void U_CALLCONV +_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv=pArgs->converter; + const char *source=pArgs->source; + const char *sourceLimit=pArgs->sourceLimit; + int32_t *offsets=pArgs->offsets; + + int32_t state, offsetDelta; + uint8_t b; + + state=cnv->mode; + + /* + * If we detect a BOM in this buffer, then we must add the BOM size to the + * offsets because the actual converter function will not see and count the BOM. + * offsetDelta will have the number of the BOM bytes that are in the current buffer. + */ + offsetDelta=0; + + while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { + switch(state) { + case 0: + cnv->toUBytes[0]=(uint8_t)*source++; + cnv->toULength=1; + state=1; + break; + case 1: + /* + * Only inside this switch case can the state variable + * temporarily take two additional values: + * 6: BOM error, continue with BE + * 7: BOM error, continue with LE + */ + b=*source; + if(cnv->toUBytes[0]==0xfe && b==0xff) { + if(IS_UTF16LE(cnv)) { + state=7; /* illegal reverse BOM for Java "UnicodeLittle" */ + } else { + state=8; /* detect UTF-16BE */ + } + } else if(cnv->toUBytes[0]==0xff && b==0xfe) { + if(IS_UTF16BE(cnv)) { + state=6; /* illegal reverse BOM for Java "UnicodeBig" */ + } else { + state=9; /* detect UTF-16LE */ + } + } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) { + state=6; /* illegal missing BOM for Java "Unicode" */ + } + if(state>=8) { + /* BOM detected, consume it */ + ++source; + cnv->toULength=0; + offsetDelta=(int32_t)(source-pArgs->source); + } else if(state<6) { + /* ok: no BOM, and not a reverse BOM */ + if(source!=pArgs->source) { + /* reset the source for a correct first offset */ + source=pArgs->source; + cnv->toULength=0; + } + if(IS_UTF16LE(cnv)) { + /* Make Java "UnicodeLittle" default to LE. */ + state=9; + } else { + /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */ + state=8; + } + } else { + /* + * error: missing BOM, or reverse BOM + * UTF-16,version=1: Java-specific "Unicode" requires a BOM. + * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM. + * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM. + */ + /* report the non-BOM or reverse BOM as an illegal sequence */ + cnv->toUBytes[1]=b; + cnv->toULength=2; + pArgs->source=source+1; + /* continue with conversion if the callback resets the error */ + /* + * Make Java "Unicode" default to BE like standard UTF-16. + * Make Java "UnicodeBig" and "UnicodeLittle" default + * to their normal endiannesses. + */ + cnv->mode=state+2; + *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; + return; + } + /* convert the rest of the stream */ + cnv->mode=state; + continue; + case 8: + /* call UTF-16BE */ + pArgs->source=source; + _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); + source=pArgs->source; + break; + case 9: + /* call UTF-16LE */ + pArgs->source=source; + _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); + source=pArgs->source; + break; + default: + break; /* does not occur */ + } + } + + /* add BOM size to offsets - see comment at offsetDelta declaration */ + if(offsets!=NULL && offsetDelta!=0) { + int32_t *offsetsLimit=pArgs->offsets; + while(offsets<offsetsLimit) { + *offsets++ += offsetDelta; + } + } + + pArgs->source=source; + + if(source==sourceLimit && pArgs->flush) { + /* handle truncated input */ + switch(state) { + case 0: + break; /* no input at all, nothing to do */ + case 8: + _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); + break; + case 9: + _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); + break; + default: + /* 0<state<8: framework will report truncation, nothing to do here */ + break; + } + } + + cnv->mode=state; +} + +static UChar32 U_CALLCONV +_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + switch(pArgs->converter->mode) { + case 8: + return _UTF16BEGetNextUChar(pArgs, pErrorCode); + case 9: + return _UTF16LEGetNextUChar(pArgs, pErrorCode); + default: + return UCNV_GET_NEXT_UCHAR_USE_TO_U; + } +} +U_CDECL_END + +static const UConverterImpl _UTF16Impl = { + UCNV_UTF16, + + NULL, + NULL, + + _UTF16Open, + NULL, + _UTF16Reset, + + _UTF16ToUnicodeWithOffsets, + _UTF16ToUnicodeWithOffsets, + _UTF16PEFromUnicodeWithOffsets, + _UTF16PEFromUnicodeWithOffsets, + _UTF16GetNextUChar, + + NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ + _UTF16GetName, + NULL, + NULL, + ucnv_getNonSurrogateUnicodeSet, + + NULL, + NULL +}; + +static const UConverterStaticData _UTF16StaticData = { + sizeof(UConverterStaticData), + "UTF-16", + 1204, /* CCSID for BOM sensitive UTF-16 */ + UCNV_IBM, UCNV_UTF16, 2, 2, +#if U_IS_BIG_ENDIAN + { 0xff, 0xfd, 0, 0 }, 2, +#else + { 0xfd, 0xff, 0, 0 }, 2, +#endif + FALSE, FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +const UConverterSharedData _UTF16Data = + UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl); + +static const UConverterImpl _UTF16v2Impl = { + UCNV_UTF16, + + NULL, + NULL, + + _UTF16Open, + NULL, + _UTF16Reset, + + _UTF16ToUnicodeWithOffsets, + _UTF16ToUnicodeWithOffsets, + _UTF16BEFromUnicodeWithOffsets, + _UTF16BEFromUnicodeWithOffsets, + _UTF16GetNextUChar, + + NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ + _UTF16GetName, + NULL, + NULL, + ucnv_getNonSurrogateUnicodeSet, + + NULL, + NULL +}; + +static const UConverterStaticData _UTF16v2StaticData = { + sizeof(UConverterStaticData), + "UTF-16,version=2", + 1204, /* CCSID for BOM sensitive UTF-16 */ + UCNV_IBM, UCNV_UTF16, 2, 2, + { 0xff, 0xfd, 0, 0 }, 2, + FALSE, FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +const UConverterSharedData _UTF16v2Data = + UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl); + +#endif |