diff options
author | neksard <[email protected]> | 2022-02-10 16:45:33 +0300 |
---|---|---|
committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:45:33 +0300 |
commit | 1d9c550e7c38e051d7961f576013a482003a70d9 (patch) | |
tree | b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/common/ucnvbocu.cpp | |
parent | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff) |
Restoring authorship annotation for <[email protected]>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/common/ucnvbocu.cpp')
-rw-r--r-- | contrib/libs/icu/common/ucnvbocu.cpp | 2818 |
1 files changed, 1409 insertions, 1409 deletions
diff --git a/contrib/libs/icu/common/ucnvbocu.cpp b/contrib/libs/icu/common/ucnvbocu.cpp index ee115e0ebe7..7c2aab56558 100644 --- a/contrib/libs/icu/common/ucnvbocu.cpp +++ b/contrib/libs/icu/common/ucnvbocu.cpp @@ -1,1413 +1,1413 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 2002-2016, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: ucnvbocu.cpp +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 2002-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: ucnvbocu.cpp * encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2002mar27 -* created by: Markus W. Scherer -* -* This is an implementation of the Binary Ordered Compression for Unicode, -* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION - -#include "unicode/ucnv.h" -#include "unicode/ucnv_cb.h" -#include "unicode/utf16.h" -#include "putilimp.h" -#include "ucnv_bld.h" -#include "ucnv_cnv.h" -#include "uassert.h" - -/* BOCU-1 constants and macros ---------------------------------------------- */ - -/* - * BOCU-1 encodes the code points of a Unicode string as - * a sequence of byte-encoded differences (slope detection), - * preserving lexical order. - * - * Optimize the difference-taking for runs of Unicode text within - * small scripts: - * - * Most small scripts are allocated within aligned 128-blocks of Unicode - * code points. Lexical order is preserved if the "previous code point" state - * is always moved into the middle of such a block. - * - * Additionally, "prev" is moved from anywhere in the Unihan and Hangul - * areas into the middle of those areas. - * - * C0 control codes and space are encoded with their US-ASCII bytes. - * "prev" is reset for C0 controls but not for space. - */ - -/* initial value for "prev": middle of the ASCII range */ -#define BOCU1_ASCII_PREV 0x40 - -/* bounding byte values for differences */ -#define BOCU1_MIN 0x21 -#define BOCU1_MIDDLE 0x90 -#define BOCU1_MAX_LEAD 0xfe -#define BOCU1_MAX_TRAIL 0xff -#define BOCU1_RESET 0xff - -/* number of lead bytes */ -#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) - -/* adjust trail byte counts for the use of some C0 control byte values */ -#define BOCU1_TRAIL_CONTROLS_COUNT 20 -#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) - -/* number of trail bytes */ -#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) - -/* - * number of positive and negative single-byte codes - * (counting 0==BOCU1_MIDDLE among the positive ones) - */ -#define BOCU1_SINGLE 64 - -/* number of lead bytes for positive and negative 2/3/4-byte sequences */ -#define BOCU1_LEAD_2 43 -#define BOCU1_LEAD_3 3 -#define BOCU1_LEAD_4 1 - -/* The difference value range for single-byters. */ -#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) -#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) - -/* The difference value range for double-byters. */ -#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) -#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) - -/* The difference value range for 3-byters. */ -#define BOCU1_REACH_POS_3 \ - (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) - -#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) - -/* The lead byte start values. */ -#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) -#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) -#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) - /* ==BOCU1_MAX_LEAD */ - -#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) -#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) -#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) - /* ==BOCU1_MIN+1 */ - -/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ -#define BOCU1_LENGTH_FROM_LEAD(lead) \ - ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \ - (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \ - (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4) - -/* The length of a byte sequence, according to its packed form. */ -#define BOCU1_LENGTH_FROM_PACKED(packed) \ - ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4) - -/* - * 12 commonly used C0 control codes (and space) are only used to encode - * themselves directly, - * which makes BOCU-1 MIME-usable and reasonably safe for - * ASCII-oriented software. - * - * These controls are - * 0 NUL - * - * 7 BEL - * 8 BS - * - * 9 TAB - * a LF - * b VT - * c FF - * d CR - * - * e SO - * f SI - * - * 1a SUB - * 1b ESC - * - * The other 20 C0 controls are also encoded directly (to preserve order) - * but are also used as trail bytes in difference encoding - * (for better compression). - */ -#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) - -/* - * Byte value map for control codes, - * from external byte values 0x00..0x20 - * to trail byte values 0..19 (0..0x13) as used in the difference calculation. - * External byte values that are illegal as trail bytes are mapped to -1. - */ -static const int8_t -bocu1ByteToTrail[BOCU1_MIN]={ -/* 0 1 2 3 4 5 6 7 */ - -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, - -/* 8 9 a b c d e f */ - -1, -1, -1, -1, -1, -1, -1, -1, - -/* 10 11 12 13 14 15 16 17 */ - 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, - -/* 18 19 1a 1b 1c 1d 1e 1f */ - 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, - -/* 20 */ - -1 -}; - -/* - * Byte value map for control codes, - * from trail byte values 0..19 (0..0x13) as used in the difference calculation - * to external byte values 0x00..0x20. - */ -static const int8_t -bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ -/* 0 1 2 3 4 5 6 7 */ - 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, - -/* 8 9 a b c d e f */ - 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, - -/* 10 11 12 13 */ - 0x1c, 0x1d, 0x1e, 0x1f -}; - -/** - * Integer division and modulo with negative numerators - * yields negative modulo results and quotients that are one more than - * what we need here. - * This macro adjust the results so that the modulo-value m is always >=0. - * - * For positive n, the if() condition is always FALSE. - * - * @param n Number to be split into quotient and rest. - * Will be modified to contain the quotient. - * @param d Divisor. - * @param m Output variable for the rest (modulo result). - */ +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002mar27 +* created by: Markus W. Scherer +* +* This is an implementation of the Binary Ordered Compression for Unicode, +* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION + +#include "unicode/ucnv.h" +#include "unicode/ucnv_cb.h" +#include "unicode/utf16.h" +#include "putilimp.h" +#include "ucnv_bld.h" +#include "ucnv_cnv.h" +#include "uassert.h" + +/* BOCU-1 constants and macros ---------------------------------------------- */ + +/* + * BOCU-1 encodes the code points of a Unicode string as + * a sequence of byte-encoded differences (slope detection), + * preserving lexical order. + * + * Optimize the difference-taking for runs of Unicode text within + * small scripts: + * + * Most small scripts are allocated within aligned 128-blocks of Unicode + * code points. Lexical order is preserved if the "previous code point" state + * is always moved into the middle of such a block. + * + * Additionally, "prev" is moved from anywhere in the Unihan and Hangul + * areas into the middle of those areas. + * + * C0 control codes and space are encoded with their US-ASCII bytes. + * "prev" is reset for C0 controls but not for space. + */ + +/* initial value for "prev": middle of the ASCII range */ +#define BOCU1_ASCII_PREV 0x40 + +/* bounding byte values for differences */ +#define BOCU1_MIN 0x21 +#define BOCU1_MIDDLE 0x90 +#define BOCU1_MAX_LEAD 0xfe +#define BOCU1_MAX_TRAIL 0xff +#define BOCU1_RESET 0xff + +/* number of lead bytes */ +#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) + +/* adjust trail byte counts for the use of some C0 control byte values */ +#define BOCU1_TRAIL_CONTROLS_COUNT 20 +#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) + +/* number of trail bytes */ +#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) + +/* + * number of positive and negative single-byte codes + * (counting 0==BOCU1_MIDDLE among the positive ones) + */ +#define BOCU1_SINGLE 64 + +/* number of lead bytes for positive and negative 2/3/4-byte sequences */ +#define BOCU1_LEAD_2 43 +#define BOCU1_LEAD_3 3 +#define BOCU1_LEAD_4 1 + +/* The difference value range for single-byters. */ +#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) +#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) + +/* The difference value range for double-byters. */ +#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) +#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) + +/* The difference value range for 3-byters. */ +#define BOCU1_REACH_POS_3 \ + (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) + +#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) + +/* The lead byte start values. */ +#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) +#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) +#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) + /* ==BOCU1_MAX_LEAD */ + +#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) +#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) +#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) + /* ==BOCU1_MIN+1 */ + +/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ +#define BOCU1_LENGTH_FROM_LEAD(lead) \ + ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \ + (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \ + (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4) + +/* The length of a byte sequence, according to its packed form. */ +#define BOCU1_LENGTH_FROM_PACKED(packed) \ + ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4) + +/* + * 12 commonly used C0 control codes (and space) are only used to encode + * themselves directly, + * which makes BOCU-1 MIME-usable and reasonably safe for + * ASCII-oriented software. + * + * These controls are + * 0 NUL + * + * 7 BEL + * 8 BS + * + * 9 TAB + * a LF + * b VT + * c FF + * d CR + * + * e SO + * f SI + * + * 1a SUB + * 1b ESC + * + * The other 20 C0 controls are also encoded directly (to preserve order) + * but are also used as trail bytes in difference encoding + * (for better compression). + */ +#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) + +/* + * Byte value map for control codes, + * from external byte values 0x00..0x20 + * to trail byte values 0..19 (0..0x13) as used in the difference calculation. + * External byte values that are illegal as trail bytes are mapped to -1. + */ +static const int8_t +bocu1ByteToTrail[BOCU1_MIN]={ +/* 0 1 2 3 4 5 6 7 */ + -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, + +/* 8 9 a b c d e f */ + -1, -1, -1, -1, -1, -1, -1, -1, + +/* 10 11 12 13 14 15 16 17 */ + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + +/* 18 19 1a 1b 1c 1d 1e 1f */ + 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, + +/* 20 */ + -1 +}; + +/* + * Byte value map for control codes, + * from trail byte values 0..19 (0..0x13) as used in the difference calculation + * to external byte values 0x00..0x20. + */ +static const int8_t +bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ +/* 0 1 2 3 4 5 6 7 */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, + +/* 8 9 a b c d e f */ + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + +/* 10 11 12 13 */ + 0x1c, 0x1d, 0x1e, 0x1f +}; + +/** + * Integer division and modulo with negative numerators + * yields negative modulo results and quotients that are one more than + * what we need here. + * This macro adjust the results so that the modulo-value m is always >=0. + * + * For positive n, the if() condition is always FALSE. + * + * @param n Number to be split into quotient and rest. + * Will be modified to contain the quotient. + * @param d Divisor. + * @param m Output variable for the rest (modulo result). + */ #define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \ - (m)=(n)%(d); \ - (n)/=(d); \ - if((m)<0) { \ - --(n); \ - (m)+=(d); \ - } \ + (m)=(n)%(d); \ + (n)/=(d); \ + if((m)<0) { \ + --(n); \ + (m)+=(d); \ + } \ } UPRV_BLOCK_MACRO_END - -/* Faster versions of packDiff() for single-byte-encoded diff values. */ - -/** Is a diff value encodable in a single byte? */ -#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1) - -/** Encode a diff value in a single byte. */ -#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff)) - -/** Is a diff value encodable in two bytes? */ -#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2) - -/* BOCU-1 implementation functions ------------------------------------------ */ - -#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) - -/** - * Compute the next "previous" value for differencing - * from the current code point. - * - * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) - * @return "previous code point" state value - */ -static inline int32_t -bocu1Prev(int32_t c) { - /* compute new prev */ - if(/* 0x3040<=c && */ c<=0x309f) { - /* Hiragana is not 128-aligned */ - return 0x3070; - } else if(0x4e00<=c && c<=0x9fa5) { - /* CJK Unihan */ - return 0x4e00-BOCU1_REACH_NEG_2; - } else if(0xac00<=c /* && c<=0xd7a3 */) { - /* Korean Hangul */ - return (0xd7a3+0xac00)/2; - } else { - /* mostly small scripts */ - return BOCU1_SIMPLE_PREV(c); - } -} - -/** Fast version of bocu1Prev() for most scripts. */ -#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) - -/* - * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. - * The UConverter fields are used as follows: - * - * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) - * - * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) - * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) - */ - -/* BOCU-1-from-Unicode conversion functions --------------------------------- */ - -/** - * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes - * and return a packed integer with them. - * - * The encoding favors small absolute differences with short encodings - * to compress runs of same-script characters. - * - * Optimized version with unrolled loops and fewer floating-point operations - * than the standard packDiff(). - * - * @param diff difference value -0x10ffff..0x10ffff - * @return - * 0x010000zz for 1-byte sequence zz - * 0x0200yyzz for 2-byte sequence yy zz - * 0x03xxyyzz for 3-byte sequence xx yy zz - * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) - */ -static int32_t -packDiff(int32_t diff) { - int32_t result, m; - - U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ - if(diff>=BOCU1_REACH_NEG_1) { - /* mostly positive differences, and single-byte negative ones */ -#if 0 /* single-byte case handled in macros, see below */ - if(diff<=BOCU1_REACH_POS_1) { - /* single byte */ - return 0x01000000|(BOCU1_MIDDLE+diff); - } else -#endif - if(diff<=BOCU1_REACH_POS_2) { - /* two bytes */ - diff-=BOCU1_REACH_POS_1+1; - result=0x02000000; - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m); - - result|=(BOCU1_START_POS_2+diff)<<8; - } else if(diff<=BOCU1_REACH_POS_3) { - /* three bytes */ - diff-=BOCU1_REACH_POS_2+1; - result=0x03000000; - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m); - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m)<<8; - - result|=(BOCU1_START_POS_3+diff)<<16; - } else { - /* four bytes */ - diff-=BOCU1_REACH_POS_3+1; - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result=BOCU1_TRAIL_TO_BYTE(m); - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m)<<8; - - /* - * We know that / and % would deliver quotient 0 and rest=diff. - * Avoid division and modulo for performance. - */ - result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; - - result|=((uint32_t)BOCU1_START_POS_4)<<24; - } - } else { - /* two- to four-byte negative differences */ - if(diff>=BOCU1_REACH_NEG_2) { - /* two bytes */ - diff-=BOCU1_REACH_NEG_1; - result=0x02000000; - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result|=BOCU1_TRAIL_TO_BYTE(m); - - result|=(BOCU1_START_NEG_2+diff)<<8; - } else if(diff>=BOCU1_REACH_NEG_3) { - /* three bytes */ - diff-=BOCU1_REACH_NEG_2; - result=0x03000000; - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result|=BOCU1_TRAIL_TO_BYTE(m); - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result|=BOCU1_TRAIL_TO_BYTE(m)<<8; - - result|=(BOCU1_START_NEG_3+diff)<<16; - } else { - /* four bytes */ - diff-=BOCU1_REACH_NEG_3; - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result=BOCU1_TRAIL_TO_BYTE(m); - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result|=BOCU1_TRAIL_TO_BYTE(m)<<8; - - /* - * We know that NEGDIVMOD would deliver - * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. - * Avoid division and modulo for performance. - */ - m=diff+BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m)<<16; - - result|=BOCU1_MIN<<24; - } - } - return result; -} - - -static void U_CALLCONV -_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const UChar *source, *sourceLimit; - uint8_t *target; - int32_t targetCapacity; - int32_t *offsets; - - int32_t prev, c, diff; - - int32_t sourceIndex, nextSourceIndex; - - /* set up the local pointers */ - cnv=pArgs->converter; - source=pArgs->source; - sourceLimit=pArgs->sourceLimit; - target=(uint8_t *)pArgs->target; - targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); - offsets=pArgs->offsets; - - /* get the converter state from UConverter */ - c=cnv->fromUChar32; - prev=(int32_t)cnv->fromUnicodeStatus; - if(prev==0) { - prev=BOCU1_ASCII_PREV; - } - - /* sourceIndex=-1 if the current character began in the previous buffer */ - sourceIndex= c==0 ? 0 : -1; - nextSourceIndex=0; - - /* conversion loop */ - if(c!=0 && targetCapacity>0) { - goto getTrail; - } - -fastSingle: - /* fast loop for single-byte differences */ - /* use only one loop counter variable, targetCapacity, not also source */ - diff=(int32_t)(sourceLimit-source); - if(targetCapacity>diff) { - targetCapacity=diff; - } - while(targetCapacity>0 && (c=*source)<0x3000) { - if(c<=0x20) { - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(uint8_t)c; - *offsets++=nextSourceIndex++; - ++source; - --targetCapacity; - } else { - diff=c-prev; - if(DIFF_IS_SINGLE(diff)) { - prev=BOCU1_SIMPLE_PREV(c); - *target++=(uint8_t)PACK_SINGLE_DIFF(diff); - *offsets++=nextSourceIndex++; - ++source; - --targetCapacity; - } else { - break; - } - } - } - /* restore real values */ - targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); - sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ - - /* regular loop for all cases */ - while(source<sourceLimit) { - if(targetCapacity>0) { - c=*source++; - ++nextSourceIndex; - - if(c<=0x20) { - /* - * ISO C0 control & space: - * Encode directly for MIME compatibility, - * and reset state except for space, to not disrupt compression. - */ - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(uint8_t)c; - *offsets++=sourceIndex; - --targetCapacity; - - sourceIndex=nextSourceIndex; - continue; - } - - if(U16_IS_LEAD(c)) { -getTrail: - if(source<sourceLimit) { - /* test the following code unit */ - UChar trail=*source; - if(U16_IS_TRAIL(trail)) { - ++source; - ++nextSourceIndex; - c=U16_GET_SUPPLEMENTARY(c, trail); - } - } else { - /* no more input */ - c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ - break; - } - } - - /* - * all other Unicode code points c==U+0021..U+10ffff - * are encoded with the difference c-prev - * - * a new prev is computed from c, - * placed in the middle of a 0x80-block (for most small scripts) or - * in the middle of the Unihan and Hangul blocks - * to statistically minimize the following difference - */ - diff=c-prev; - prev=BOCU1_PREV(c); - if(DIFF_IS_SINGLE(diff)) { - *target++=(uint8_t)PACK_SINGLE_DIFF(diff); - *offsets++=sourceIndex; - --targetCapacity; - sourceIndex=nextSourceIndex; - if(c<0x3000) { - goto fastSingle; - } - } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { - /* optimize 2-byte case */ - int32_t m; - - if(diff>=0) { - diff-=BOCU1_REACH_POS_1+1; - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - diff+=BOCU1_START_POS_2; - } else { - diff-=BOCU1_REACH_NEG_1; - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - diff+=BOCU1_START_NEG_2; - } - *target++=(uint8_t)diff; - *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); - *offsets++=sourceIndex; - *offsets++=sourceIndex; - targetCapacity-=2; - sourceIndex=nextSourceIndex; - } else { - int32_t length; /* will be 2..4 */ - - diff=packDiff(diff); - length=BOCU1_LENGTH_FROM_PACKED(diff); - - /* write the output character bytes from diff and length */ - /* from the first if in the loop we know that targetCapacity>0 */ - if(length<=targetCapacity) { - switch(length) { - /* each branch falls through to the next one */ - case 4: - *target++=(uint8_t)(diff>>24); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 3: - *target++=(uint8_t)(diff>>16); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(diff>>8); - *offsets++=sourceIndex; - /* case 1: handled above */ - *target++=(uint8_t)diff; - *offsets++=sourceIndex; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - targetCapacity-=length; - sourceIndex=nextSourceIndex; - } else { - uint8_t *charErrorBuffer; - - /* - * We actually do this backwards here: - * In order to save an intermediate variable, we output - * first to the overflow buffer what does not fit into the - * regular target. - */ - /* we know that 1<=targetCapacity<length<=4 */ - length-=targetCapacity; - charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; - switch(length) { - /* each branch falls through to the next one */ - case 3: - *charErrorBuffer++=(uint8_t)(diff>>16); - U_FALLTHROUGH; - case 2: - *charErrorBuffer++=(uint8_t)(diff>>8); - U_FALLTHROUGH; - case 1: - *charErrorBuffer=(uint8_t)diff; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - cnv->charErrorBufferLength=(int8_t)length; - - /* now output what fits into the regular target */ - diff>>=8*length; /* length was reduced by targetCapacity */ - switch(targetCapacity) { - /* each branch falls through to the next one */ - case 3: - *target++=(uint8_t)(diff>>16); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(diff>>8); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 1: - *target++=(uint8_t)diff; - *offsets++=sourceIndex; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - - /* target overflow */ - targetCapacity=0; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - } else { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - - /* set the converter state back into UConverter */ - cnv->fromUChar32= c<0 ? -c : 0; - cnv->fromUnicodeStatus=(uint32_t)prev; - - /* write back the updated pointers */ - pArgs->source=source; - pArgs->target=(char *)target; - pArgs->offsets=offsets; -} - -/* - * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. - * If a change is made in the original function, then either - * change this function the same way or - * re-copy the original function and remove the variables - * offsets, sourceIndex, and nextSourceIndex. - */ -static void U_CALLCONV -_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const UChar *source, *sourceLimit; - uint8_t *target; - int32_t targetCapacity; - - int32_t prev, c, diff; - - /* set up the local pointers */ - cnv=pArgs->converter; - source=pArgs->source; - sourceLimit=pArgs->sourceLimit; - target=(uint8_t *)pArgs->target; - targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); - - /* get the converter state from UConverter */ - c=cnv->fromUChar32; - prev=(int32_t)cnv->fromUnicodeStatus; - if(prev==0) { - prev=BOCU1_ASCII_PREV; - } - - /* conversion loop */ - if(c!=0 && targetCapacity>0) { - goto getTrail; - } - -fastSingle: - /* fast loop for single-byte differences */ - /* use only one loop counter variable, targetCapacity, not also source */ - diff=(int32_t)(sourceLimit-source); - if(targetCapacity>diff) { - targetCapacity=diff; - } - while(targetCapacity>0 && (c=*source)<0x3000) { - if(c<=0x20) { - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(uint8_t)c; - } else { - diff=c-prev; - if(DIFF_IS_SINGLE(diff)) { - prev=BOCU1_SIMPLE_PREV(c); - *target++=(uint8_t)PACK_SINGLE_DIFF(diff); - } else { - break; - } - } - ++source; - --targetCapacity; - } - /* restore real values */ - targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); - - /* regular loop for all cases */ - while(source<sourceLimit) { - if(targetCapacity>0) { - c=*source++; - - if(c<=0x20) { - /* - * ISO C0 control & space: - * Encode directly for MIME compatibility, - * and reset state except for space, to not disrupt compression. - */ - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(uint8_t)c; - --targetCapacity; - continue; - } - - if(U16_IS_LEAD(c)) { -getTrail: - if(source<sourceLimit) { - /* test the following code unit */ - UChar trail=*source; - if(U16_IS_TRAIL(trail)) { - ++source; - c=U16_GET_SUPPLEMENTARY(c, trail); - } - } else { - /* no more input */ - c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ - break; - } - } - - /* - * all other Unicode code points c==U+0021..U+10ffff - * are encoded with the difference c-prev - * - * a new prev is computed from c, - * placed in the middle of a 0x80-block (for most small scripts) or - * in the middle of the Unihan and Hangul blocks - * to statistically minimize the following difference - */ - diff=c-prev; - prev=BOCU1_PREV(c); - if(DIFF_IS_SINGLE(diff)) { - *target++=(uint8_t)PACK_SINGLE_DIFF(diff); - --targetCapacity; - if(c<0x3000) { - goto fastSingle; - } - } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { - /* optimize 2-byte case */ - int32_t m; - - if(diff>=0) { - diff-=BOCU1_REACH_POS_1+1; - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - diff+=BOCU1_START_POS_2; - } else { - diff-=BOCU1_REACH_NEG_1; - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - diff+=BOCU1_START_NEG_2; - } - *target++=(uint8_t)diff; - *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); - targetCapacity-=2; - } else { - int32_t length; /* will be 2..4 */ - - diff=packDiff(diff); - length=BOCU1_LENGTH_FROM_PACKED(diff); - - /* write the output character bytes from diff and length */ - /* from the first if in the loop we know that targetCapacity>0 */ - if(length<=targetCapacity) { - switch(length) { - /* each branch falls through to the next one */ - case 4: - *target++=(uint8_t)(diff>>24); - U_FALLTHROUGH; - case 3: - *target++=(uint8_t)(diff>>16); - /* case 2: handled above */ - *target++=(uint8_t)(diff>>8); - /* case 1: handled above */ - *target++=(uint8_t)diff; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - targetCapacity-=length; - } else { - uint8_t *charErrorBuffer; - - /* - * We actually do this backwards here: - * In order to save an intermediate variable, we output - * first to the overflow buffer what does not fit into the - * regular target. - */ - /* we know that 1<=targetCapacity<length<=4 */ - length-=targetCapacity; - charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; - switch(length) { - /* each branch falls through to the next one */ - case 3: - *charErrorBuffer++=(uint8_t)(diff>>16); - U_FALLTHROUGH; - case 2: - *charErrorBuffer++=(uint8_t)(diff>>8); - U_FALLTHROUGH; - case 1: - *charErrorBuffer=(uint8_t)diff; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - cnv->charErrorBufferLength=(int8_t)length; - - /* now output what fits into the regular target */ - diff>>=8*length; /* length was reduced by targetCapacity */ - switch(targetCapacity) { - /* each branch falls through to the next one */ - case 3: - *target++=(uint8_t)(diff>>16); - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(diff>>8); - U_FALLTHROUGH; - case 1: - *target++=(uint8_t)diff; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - - /* target overflow */ - targetCapacity=0; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - } else { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - - /* set the converter state back into UConverter */ - cnv->fromUChar32= c<0 ? -c : 0; - cnv->fromUnicodeStatus=(uint32_t)prev; - - /* write back the updated pointers */ - pArgs->source=source; - pArgs->target=(char *)target; -} - -/* BOCU-1-to-Unicode conversion functions ----------------------------------- */ - -/** - * Function for BOCU-1 decoder; handles multi-byte lead bytes. - * - * @param b lead byte; - * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD - * @return (diff<<2)|count - */ -static inline int32_t -decodeBocu1LeadByte(int32_t b) { - int32_t diff, count; - - if(b>=BOCU1_START_NEG_2) { - /* positive difference */ - if(b<BOCU1_START_POS_3) { - /* two bytes */ - diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; - count=1; - } else if(b<BOCU1_START_POS_4) { - /* three bytes */ - diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; - count=2; - } else { - /* four bytes */ - diff=BOCU1_REACH_POS_3+1; - count=3; - } - } else { - /* negative difference */ - if(b>=BOCU1_START_NEG_3) { - /* two bytes */ - diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; - count=1; - } else if(b>BOCU1_MIN) { - /* three bytes */ - diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; - count=2; - } else { - /* four bytes */ - diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; - count=3; - } - } - - /* return the state for decoding the trail byte(s) */ - return (diff<<2)|count; -} - -/** - * Function for BOCU-1 decoder; handles multi-byte trail bytes. - * - * @param count number of remaining trail bytes including this one - * @param b trail byte - * @return new delta for diff including b - <0 indicates an error - * - * @see decodeBocu1 - */ -static inline int32_t -decodeBocu1TrailByte(int32_t count, int32_t b) { - if(b<=0x20) { - /* skip some C0 controls and make the trail byte range contiguous */ - b=bocu1ByteToTrail[b]; - /* b<0 for an illegal trail byte value will result in return<0 below */ -#if BOCU1_MAX_TRAIL<0xff - } else if(b>BOCU1_MAX_TRAIL) { - return -99; -#endif - } else { - b-=BOCU1_TRAIL_BYTE_OFFSET; - } - - /* add trail byte into difference and decrement count */ - if(count==1) { - return b; - } else if(count==2) { - return b*BOCU1_TRAIL_COUNT; - } else /* count==3 */ { - return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); - } -} - -static void U_CALLCONV -_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const uint8_t *source, *sourceLimit; - UChar *target; - const UChar *targetLimit; - int32_t *offsets; - - int32_t prev, count, diff, c; - - int8_t byteIndex; - uint8_t *bytes; - - int32_t sourceIndex, nextSourceIndex; - - /* set up the local pointers */ - cnv=pArgs->converter; - source=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - target=pArgs->target; - targetLimit=pArgs->targetLimit; - offsets=pArgs->offsets; - - /* get the converter state from UConverter */ - prev=(int32_t)cnv->toUnicodeStatus; - if(prev==0) { - prev=BOCU1_ASCII_PREV; - } - diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ - count=diff&3; - diff>>=2; - - byteIndex=cnv->toULength; - bytes=cnv->toUBytes; - - /* sourceIndex=-1 if the current character began in the previous buffer */ - sourceIndex=byteIndex==0 ? 0 : -1; - nextSourceIndex=0; - - /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ - if(count>0 && byteIndex>0 && target<targetLimit) { - goto getTrail; - } - -fastSingle: - /* fast loop for single-byte differences */ - /* use count as the only loop counter variable */ - diff=(int32_t)(sourceLimit-source); - count=(int32_t)(pArgs->targetLimit-target); - if(count>diff) { - count=diff; - } - while(count>0) { - if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { - c=prev+(c-BOCU1_MIDDLE); - if(c<0x3000) { - *target++=(UChar)c; - *offsets++=nextSourceIndex++; - prev=BOCU1_SIMPLE_PREV(c); - } else { - break; - } - } else if(c<=0x20) { - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(UChar)c; - *offsets++=nextSourceIndex++; - } else { - break; - } - ++source; - --count; - } - sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ - - /* decode a sequence of single and lead bytes */ - while(source<sourceLimit) { - if(target>=targetLimit) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - - ++nextSourceIndex; - c=*source++; - if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { - /* Write a code point directly from a single-byte difference. */ - c=prev+(c-BOCU1_MIDDLE); - if(c<0x3000) { - *target++=(UChar)c; - *offsets++=sourceIndex; - prev=BOCU1_SIMPLE_PREV(c); - sourceIndex=nextSourceIndex; - goto fastSingle; - } - } else if(c<=0x20) { - /* - * Direct-encoded C0 control code or space. - * Reset prev for C0 control codes but not for space. - */ - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(UChar)c; - *offsets++=sourceIndex; - sourceIndex=nextSourceIndex; - continue; - } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { - /* Optimize two-byte case. */ - if(c>=BOCU1_MIDDLE) { - diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; - } else { - diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; - } - - /* trail byte */ - ++nextSourceIndex; - c=decodeBocu1TrailByte(1, *source++); - if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { - bytes[0]=source[-2]; - bytes[1]=source[-1]; - byteIndex=2; - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - break; - } - } else if(c==BOCU1_RESET) { - /* only reset the state, no code point */ - prev=BOCU1_ASCII_PREV; - sourceIndex=nextSourceIndex; - continue; - } else { - /* - * For multi-byte difference lead bytes, set the decoder state - * with the partial difference value from the lead byte and - * with the number of trail bytes. - */ - bytes[0]=(uint8_t)c; - byteIndex=1; - - diff=decodeBocu1LeadByte(c); - count=diff&3; - diff>>=2; -getTrail: - for(;;) { - if(source>=sourceLimit) { - goto endloop; - } - ++nextSourceIndex; - c=bytes[byteIndex++]=*source++; - - /* trail byte in any position */ - c=decodeBocu1TrailByte(count, c); - if(c<0) { - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - - diff+=c; - if(--count==0) { - /* final trail byte, deliver a code point */ - byteIndex=0; - c=prev+diff; - if((uint32_t)c>0x10ffff) { - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - break; - } - } - } - - /* calculate the next prev and output c */ - prev=BOCU1_PREV(c); - if(c<=0xffff) { - *target++=(UChar)c; - *offsets++=sourceIndex; - } else { - /* output surrogate pair */ - *target++=U16_LEAD(c); - if(target<targetLimit) { - *target++=U16_TRAIL(c); - *offsets++=sourceIndex; - *offsets++=sourceIndex; - } else { - /* target overflow */ - *offsets++=sourceIndex; - cnv->UCharErrorBuffer[0]=U16_TRAIL(c); - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - sourceIndex=nextSourceIndex; - } -endloop: - - if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { - /* set the converter state in UConverter to deal with the next character */ - cnv->toUnicodeStatus=BOCU1_ASCII_PREV; - cnv->mode=0; - } else { - /* set the converter state back into UConverter */ - cnv->toUnicodeStatus=(uint32_t)prev; - cnv->mode=(diff<<2)|count; - } - cnv->toULength=byteIndex; - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - pArgs->offsets=offsets; - return; -} - -/* - * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. - * If a change is made in the original function, then either - * change this function the same way or - * re-copy the original function and remove the variables - * offsets, sourceIndex, and nextSourceIndex. - */ -static void U_CALLCONV -_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const uint8_t *source, *sourceLimit; - UChar *target; - const UChar *targetLimit; - - int32_t prev, count, diff, c; - - int8_t byteIndex; - uint8_t *bytes; - - /* set up the local pointers */ - cnv=pArgs->converter; - source=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - target=pArgs->target; - targetLimit=pArgs->targetLimit; - - /* get the converter state from UConverter */ - prev=(int32_t)cnv->toUnicodeStatus; - if(prev==0) { - prev=BOCU1_ASCII_PREV; - } - diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ - count=diff&3; - diff>>=2; - - byteIndex=cnv->toULength; - bytes=cnv->toUBytes; - - /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ - if(count>0 && byteIndex>0 && target<targetLimit) { - goto getTrail; - } - -fastSingle: - /* fast loop for single-byte differences */ - /* use count as the only loop counter variable */ - diff=(int32_t)(sourceLimit-source); - count=(int32_t)(pArgs->targetLimit-target); - if(count>diff) { - count=diff; - } - while(count>0) { - if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { - c=prev+(c-BOCU1_MIDDLE); - if(c<0x3000) { - *target++=(UChar)c; - prev=BOCU1_SIMPLE_PREV(c); - } else { - break; - } - } else if(c<=0x20) { - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(UChar)c; - } else { - break; - } - ++source; - --count; - } - - /* decode a sequence of single and lead bytes */ - while(source<sourceLimit) { - if(target>=targetLimit) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - - c=*source++; - if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { - /* Write a code point directly from a single-byte difference. */ - c=prev+(c-BOCU1_MIDDLE); - if(c<0x3000) { - *target++=(UChar)c; - prev=BOCU1_SIMPLE_PREV(c); - goto fastSingle; - } - } else if(c<=0x20) { - /* - * Direct-encoded C0 control code or space. - * Reset prev for C0 control codes but not for space. - */ - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(UChar)c; - continue; - } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { - /* Optimize two-byte case. */ - if(c>=BOCU1_MIDDLE) { - diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; - } else { - diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; - } - - /* trail byte */ - c=decodeBocu1TrailByte(1, *source++); - if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { - bytes[0]=source[-2]; - bytes[1]=source[-1]; - byteIndex=2; - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - break; - } - } else if(c==BOCU1_RESET) { - /* only reset the state, no code point */ - prev=BOCU1_ASCII_PREV; - continue; - } else { - /* - * For multi-byte difference lead bytes, set the decoder state - * with the partial difference value from the lead byte and - * with the number of trail bytes. - */ - bytes[0]=(uint8_t)c; - byteIndex=1; - - diff=decodeBocu1LeadByte(c); - count=diff&3; - diff>>=2; -getTrail: - for(;;) { - if(source>=sourceLimit) { - goto endloop; - } - c=bytes[byteIndex++]=*source++; - - /* trail byte in any position */ - c=decodeBocu1TrailByte(count, c); - if(c<0) { - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - - diff+=c; - if(--count==0) { - /* final trail byte, deliver a code point */ - byteIndex=0; - c=prev+diff; - if((uint32_t)c>0x10ffff) { - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - break; - } - } - } - - /* calculate the next prev and output c */ - prev=BOCU1_PREV(c); - if(c<=0xffff) { - *target++=(UChar)c; - } else { - /* output surrogate pair */ - *target++=U16_LEAD(c); - if(target<targetLimit) { - *target++=U16_TRAIL(c); - } else { - /* target overflow */ - cnv->UCharErrorBuffer[0]=U16_TRAIL(c); - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - } -endloop: - - if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { - /* set the converter state in UConverter to deal with the next character */ - cnv->toUnicodeStatus=BOCU1_ASCII_PREV; - cnv->mode=0; - } else { - /* set the converter state back into UConverter */ - cnv->toUnicodeStatus=(uint32_t)prev; - cnv->mode=(diff<<2)|count; - } - cnv->toULength=byteIndex; - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - return; -} - -/* miscellaneous ------------------------------------------------------------ */ - -static const UConverterImpl _Bocu1Impl={ - UCNV_BOCU1, - - NULL, - NULL, - - NULL, - NULL, - NULL, - - _Bocu1ToUnicode, - _Bocu1ToUnicodeWithOffsets, - _Bocu1FromUnicode, - _Bocu1FromUnicodeWithOffsets, - NULL, - - NULL, - NULL, - NULL, - NULL, - ucnv_getCompleteUnicodeSet, - - NULL, - NULL -}; - -static const UConverterStaticData _Bocu1StaticData={ - sizeof(UConverterStaticData), - "BOCU-1", - 1214, /* CCSID for BOCU-1 */ - UCNV_IBM, UCNV_BOCU1, - 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ - { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ - FALSE, FALSE, - 0, - 0, - { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ -}; - -const UConverterSharedData _Bocu1Data= - UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl); - -#endif + +/* Faster versions of packDiff() for single-byte-encoded diff values. */ + +/** Is a diff value encodable in a single byte? */ +#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1) + +/** Encode a diff value in a single byte. */ +#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff)) + +/** Is a diff value encodable in two bytes? */ +#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2) + +/* BOCU-1 implementation functions ------------------------------------------ */ + +#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) + +/** + * Compute the next "previous" value for differencing + * from the current code point. + * + * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) + * @return "previous code point" state value + */ +static inline int32_t +bocu1Prev(int32_t c) { + /* compute new prev */ + if(/* 0x3040<=c && */ c<=0x309f) { + /* Hiragana is not 128-aligned */ + return 0x3070; + } else if(0x4e00<=c && c<=0x9fa5) { + /* CJK Unihan */ + return 0x4e00-BOCU1_REACH_NEG_2; + } else if(0xac00<=c /* && c<=0xd7a3 */) { + /* Korean Hangul */ + return (0xd7a3+0xac00)/2; + } else { + /* mostly small scripts */ + return BOCU1_SIMPLE_PREV(c); + } +} + +/** Fast version of bocu1Prev() for most scripts. */ +#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) + +/* + * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. + * The UConverter fields are used as follows: + * + * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) + * + * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) + * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) + */ + +/* BOCU-1-from-Unicode conversion functions --------------------------------- */ + +/** + * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes + * and return a packed integer with them. + * + * The encoding favors small absolute differences with short encodings + * to compress runs of same-script characters. + * + * Optimized version with unrolled loops and fewer floating-point operations + * than the standard packDiff(). + * + * @param diff difference value -0x10ffff..0x10ffff + * @return + * 0x010000zz for 1-byte sequence zz + * 0x0200yyzz for 2-byte sequence yy zz + * 0x03xxyyzz for 3-byte sequence xx yy zz + * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) + */ +static int32_t +packDiff(int32_t diff) { + int32_t result, m; + + U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ + if(diff>=BOCU1_REACH_NEG_1) { + /* mostly positive differences, and single-byte negative ones */ +#if 0 /* single-byte case handled in macros, see below */ + if(diff<=BOCU1_REACH_POS_1) { + /* single byte */ + return 0x01000000|(BOCU1_MIDDLE+diff); + } else +#endif + if(diff<=BOCU1_REACH_POS_2) { + /* two bytes */ + diff-=BOCU1_REACH_POS_1+1; + result=0x02000000; + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m); + + result|=(BOCU1_START_POS_2+diff)<<8; + } else if(diff<=BOCU1_REACH_POS_3) { + /* three bytes */ + diff-=BOCU1_REACH_POS_2+1; + result=0x03000000; + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m); + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + result|=(BOCU1_START_POS_3+diff)<<16; + } else { + /* four bytes */ + diff-=BOCU1_REACH_POS_3+1; + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result=BOCU1_TRAIL_TO_BYTE(m); + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + /* + * We know that / and % would deliver quotient 0 and rest=diff. + * Avoid division and modulo for performance. + */ + result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; + + result|=((uint32_t)BOCU1_START_POS_4)<<24; + } + } else { + /* two- to four-byte negative differences */ + if(diff>=BOCU1_REACH_NEG_2) { + /* two bytes */ + diff-=BOCU1_REACH_NEG_1; + result=0x02000000; + + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m); + + result|=(BOCU1_START_NEG_2+diff)<<8; + } else if(diff>=BOCU1_REACH_NEG_3) { + /* three bytes */ + diff-=BOCU1_REACH_NEG_2; + result=0x03000000; + + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m); + + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + result|=(BOCU1_START_NEG_3+diff)<<16; + } else { + /* four bytes */ + diff-=BOCU1_REACH_NEG_3; + + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result=BOCU1_TRAIL_TO_BYTE(m); + + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + /* + * We know that NEGDIVMOD would deliver + * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. + * Avoid division and modulo for performance. + */ + m=diff+BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m)<<16; + + result|=BOCU1_MIN<<24; + } + } + return result; +} + + +static void U_CALLCONV +_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const UChar *source, *sourceLimit; + uint8_t *target; + int32_t targetCapacity; + int32_t *offsets; + + int32_t prev, c, diff; + + int32_t sourceIndex, nextSourceIndex; + + /* set up the local pointers */ + cnv=pArgs->converter; + source=pArgs->source; + sourceLimit=pArgs->sourceLimit; + target=(uint8_t *)pArgs->target; + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); + offsets=pArgs->offsets; + + /* get the converter state from UConverter */ + c=cnv->fromUChar32; + prev=(int32_t)cnv->fromUnicodeStatus; + if(prev==0) { + prev=BOCU1_ASCII_PREV; + } + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex= c==0 ? 0 : -1; + nextSourceIndex=0; + + /* conversion loop */ + if(c!=0 && targetCapacity>0) { + goto getTrail; + } + +fastSingle: + /* fast loop for single-byte differences */ + /* use only one loop counter variable, targetCapacity, not also source */ + diff=(int32_t)(sourceLimit-source); + if(targetCapacity>diff) { + targetCapacity=diff; + } + while(targetCapacity>0 && (c=*source)<0x3000) { + if(c<=0x20) { + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + *target++=(uint8_t)c; + *offsets++=nextSourceIndex++; + ++source; + --targetCapacity; + } else { + diff=c-prev; + if(DIFF_IS_SINGLE(diff)) { + prev=BOCU1_SIMPLE_PREV(c); + *target++=(uint8_t)PACK_SINGLE_DIFF(diff); + *offsets++=nextSourceIndex++; + ++source; + --targetCapacity; + } else { + break; + } + } + } + /* restore real values */ + targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); + sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ + + /* regular loop for all cases */ + while(source<sourceLimit) { + if(targetCapacity>0) { + c=*source++; + ++nextSourceIndex; + + if(c<=0x20) { + /* + * ISO C0 control & space: + * Encode directly for MIME compatibility, + * and reset state except for space, to not disrupt compression. + */ + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + *target++=(uint8_t)c; + *offsets++=sourceIndex; + --targetCapacity; + + sourceIndex=nextSourceIndex; + continue; + } + + if(U16_IS_LEAD(c)) { +getTrail: + if(source<sourceLimit) { + /* test the following code unit */ + UChar trail=*source; + if(U16_IS_TRAIL(trail)) { + ++source; + ++nextSourceIndex; + c=U16_GET_SUPPLEMENTARY(c, trail); + } + } else { + /* no more input */ + c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ + break; + } + } + + /* + * all other Unicode code points c==U+0021..U+10ffff + * are encoded with the difference c-prev + * + * a new prev is computed from c, + * placed in the middle of a 0x80-block (for most small scripts) or + * in the middle of the Unihan and Hangul blocks + * to statistically minimize the following difference + */ + diff=c-prev; + prev=BOCU1_PREV(c); + if(DIFF_IS_SINGLE(diff)) { + *target++=(uint8_t)PACK_SINGLE_DIFF(diff); + *offsets++=sourceIndex; + --targetCapacity; + sourceIndex=nextSourceIndex; + if(c<0x3000) { + goto fastSingle; + } + } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { + /* optimize 2-byte case */ + int32_t m; + + if(diff>=0) { + diff-=BOCU1_REACH_POS_1+1; + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + diff+=BOCU1_START_POS_2; + } else { + diff-=BOCU1_REACH_NEG_1; + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + diff+=BOCU1_START_NEG_2; + } + *target++=(uint8_t)diff; + *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); + *offsets++=sourceIndex; + *offsets++=sourceIndex; + targetCapacity-=2; + sourceIndex=nextSourceIndex; + } else { + int32_t length; /* will be 2..4 */ + + diff=packDiff(diff); + length=BOCU1_LENGTH_FROM_PACKED(diff); + + /* write the output character bytes from diff and length */ + /* from the first if in the loop we know that targetCapacity>0 */ + if(length<=targetCapacity) { + switch(length) { + /* each branch falls through to the next one */ + case 4: + *target++=(uint8_t)(diff>>24); + *offsets++=sourceIndex; + U_FALLTHROUGH; + case 3: + *target++=(uint8_t)(diff>>16); + *offsets++=sourceIndex; + U_FALLTHROUGH; + case 2: + *target++=(uint8_t)(diff>>8); + *offsets++=sourceIndex; + /* case 1: handled above */ + *target++=(uint8_t)diff; + *offsets++=sourceIndex; + U_FALLTHROUGH; + default: + /* will never occur */ + break; + } + targetCapacity-=length; + sourceIndex=nextSourceIndex; + } else { + uint8_t *charErrorBuffer; + + /* + * We actually do this backwards here: + * In order to save an intermediate variable, we output + * first to the overflow buffer what does not fit into the + * regular target. + */ + /* we know that 1<=targetCapacity<length<=4 */ + length-=targetCapacity; + charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; + switch(length) { + /* each branch falls through to the next one */ + case 3: + *charErrorBuffer++=(uint8_t)(diff>>16); + U_FALLTHROUGH; + case 2: + *charErrorBuffer++=(uint8_t)(diff>>8); + U_FALLTHROUGH; + case 1: + *charErrorBuffer=(uint8_t)diff; + U_FALLTHROUGH; + default: + /* will never occur */ + break; + } + cnv->charErrorBufferLength=(int8_t)length; + + /* now output what fits into the regular target */ + diff>>=8*length; /* length was reduced by targetCapacity */ + switch(targetCapacity) { + /* each branch falls through to the next one */ + case 3: + *target++=(uint8_t)(diff>>16); + *offsets++=sourceIndex; + U_FALLTHROUGH; + case 2: + *target++=(uint8_t)(diff>>8); + *offsets++=sourceIndex; + U_FALLTHROUGH; + case 1: + *target++=(uint8_t)diff; + *offsets++=sourceIndex; + U_FALLTHROUGH; + default: + /* will never occur */ + break; + } + + /* target overflow */ + targetCapacity=0; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } else { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } + + /* set the converter state back into UConverter */ + cnv->fromUChar32= c<0 ? -c : 0; + cnv->fromUnicodeStatus=(uint32_t)prev; + + /* write back the updated pointers */ + pArgs->source=source; + pArgs->target=(char *)target; + pArgs->offsets=offsets; +} + +/* + * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. + * If a change is made in the original function, then either + * change this function the same way or + * re-copy the original function and remove the variables + * offsets, sourceIndex, and nextSourceIndex. + */ +static void U_CALLCONV +_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const UChar *source, *sourceLimit; + uint8_t *target; + int32_t targetCapacity; + + int32_t prev, c, diff; + + /* set up the local pointers */ + cnv=pArgs->converter; + source=pArgs->source; + sourceLimit=pArgs->sourceLimit; + target=(uint8_t *)pArgs->target; + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); + + /* get the converter state from UConverter */ + c=cnv->fromUChar32; + prev=(int32_t)cnv->fromUnicodeStatus; + if(prev==0) { + prev=BOCU1_ASCII_PREV; + } + + /* conversion loop */ + if(c!=0 && targetCapacity>0) { + goto getTrail; + } + +fastSingle: + /* fast loop for single-byte differences */ + /* use only one loop counter variable, targetCapacity, not also source */ + diff=(int32_t)(sourceLimit-source); + if(targetCapacity>diff) { + targetCapacity=diff; + } + while(targetCapacity>0 && (c=*source)<0x3000) { + if(c<=0x20) { + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + *target++=(uint8_t)c; + } else { + diff=c-prev; + if(DIFF_IS_SINGLE(diff)) { + prev=BOCU1_SIMPLE_PREV(c); + *target++=(uint8_t)PACK_SINGLE_DIFF(diff); + } else { + break; + } + } + ++source; + --targetCapacity; + } + /* restore real values */ + targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); + + /* regular loop for all cases */ + while(source<sourceLimit) { + if(targetCapacity>0) { + c=*source++; + + if(c<=0x20) { + /* + * ISO C0 control & space: + * Encode directly for MIME compatibility, + * and reset state except for space, to not disrupt compression. + */ + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + *target++=(uint8_t)c; + --targetCapacity; + continue; + } + + if(U16_IS_LEAD(c)) { +getTrail: + if(source<sourceLimit) { + /* test the following code unit */ + UChar trail=*source; + if(U16_IS_TRAIL(trail)) { + ++source; + c=U16_GET_SUPPLEMENTARY(c, trail); + } + } else { + /* no more input */ + c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ + break; + } + } + + /* + * all other Unicode code points c==U+0021..U+10ffff + * are encoded with the difference c-prev + * + * a new prev is computed from c, + * placed in the middle of a 0x80-block (for most small scripts) or + * in the middle of the Unihan and Hangul blocks + * to statistically minimize the following difference + */ + diff=c-prev; + prev=BOCU1_PREV(c); + if(DIFF_IS_SINGLE(diff)) { + *target++=(uint8_t)PACK_SINGLE_DIFF(diff); + --targetCapacity; + if(c<0x3000) { + goto fastSingle; + } + } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { + /* optimize 2-byte case */ + int32_t m; + + if(diff>=0) { + diff-=BOCU1_REACH_POS_1+1; + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + diff+=BOCU1_START_POS_2; + } else { + diff-=BOCU1_REACH_NEG_1; + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + diff+=BOCU1_START_NEG_2; + } + *target++=(uint8_t)diff; + *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); + targetCapacity-=2; + } else { + int32_t length; /* will be 2..4 */ + + diff=packDiff(diff); + length=BOCU1_LENGTH_FROM_PACKED(diff); + + /* write the output character bytes from diff and length */ + /* from the first if in the loop we know that targetCapacity>0 */ + if(length<=targetCapacity) { + switch(length) { + /* each branch falls through to the next one */ + case 4: + *target++=(uint8_t)(diff>>24); + U_FALLTHROUGH; + case 3: + *target++=(uint8_t)(diff>>16); + /* case 2: handled above */ + *target++=(uint8_t)(diff>>8); + /* case 1: handled above */ + *target++=(uint8_t)diff; + U_FALLTHROUGH; + default: + /* will never occur */ + break; + } + targetCapacity-=length; + } else { + uint8_t *charErrorBuffer; + + /* + * We actually do this backwards here: + * In order to save an intermediate variable, we output + * first to the overflow buffer what does not fit into the + * regular target. + */ + /* we know that 1<=targetCapacity<length<=4 */ + length-=targetCapacity; + charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; + switch(length) { + /* each branch falls through to the next one */ + case 3: + *charErrorBuffer++=(uint8_t)(diff>>16); + U_FALLTHROUGH; + case 2: + *charErrorBuffer++=(uint8_t)(diff>>8); + U_FALLTHROUGH; + case 1: + *charErrorBuffer=(uint8_t)diff; + U_FALLTHROUGH; + default: + /* will never occur */ + break; + } + cnv->charErrorBufferLength=(int8_t)length; + + /* now output what fits into the regular target */ + diff>>=8*length; /* length was reduced by targetCapacity */ + switch(targetCapacity) { + /* each branch falls through to the next one */ + case 3: + *target++=(uint8_t)(diff>>16); + U_FALLTHROUGH; + case 2: + *target++=(uint8_t)(diff>>8); + U_FALLTHROUGH; + case 1: + *target++=(uint8_t)diff; + U_FALLTHROUGH; + default: + /* will never occur */ + break; + } + + /* target overflow */ + targetCapacity=0; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } else { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } + + /* set the converter state back into UConverter */ + cnv->fromUChar32= c<0 ? -c : 0; + cnv->fromUnicodeStatus=(uint32_t)prev; + + /* write back the updated pointers */ + pArgs->source=source; + pArgs->target=(char *)target; +} + +/* BOCU-1-to-Unicode conversion functions ----------------------------------- */ + +/** + * Function for BOCU-1 decoder; handles multi-byte lead bytes. + * + * @param b lead byte; + * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD + * @return (diff<<2)|count + */ +static inline int32_t +decodeBocu1LeadByte(int32_t b) { + int32_t diff, count; + + if(b>=BOCU1_START_NEG_2) { + /* positive difference */ + if(b<BOCU1_START_POS_3) { + /* two bytes */ + diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; + count=1; + } else if(b<BOCU1_START_POS_4) { + /* three bytes */ + diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; + count=2; + } else { + /* four bytes */ + diff=BOCU1_REACH_POS_3+1; + count=3; + } + } else { + /* negative difference */ + if(b>=BOCU1_START_NEG_3) { + /* two bytes */ + diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; + count=1; + } else if(b>BOCU1_MIN) { + /* three bytes */ + diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; + count=2; + } else { + /* four bytes */ + diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; + count=3; + } + } + + /* return the state for decoding the trail byte(s) */ + return (diff<<2)|count; +} + +/** + * Function for BOCU-1 decoder; handles multi-byte trail bytes. + * + * @param count number of remaining trail bytes including this one + * @param b trail byte + * @return new delta for diff including b - <0 indicates an error + * + * @see decodeBocu1 + */ +static inline int32_t +decodeBocu1TrailByte(int32_t count, int32_t b) { + if(b<=0x20) { + /* skip some C0 controls and make the trail byte range contiguous */ + b=bocu1ByteToTrail[b]; + /* b<0 for an illegal trail byte value will result in return<0 below */ +#if BOCU1_MAX_TRAIL<0xff + } else if(b>BOCU1_MAX_TRAIL) { + return -99; +#endif + } else { + b-=BOCU1_TRAIL_BYTE_OFFSET; + } + + /* add trail byte into difference and decrement count */ + if(count==1) { + return b; + } else if(count==2) { + return b*BOCU1_TRAIL_COUNT; + } else /* count==3 */ { + return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); + } +} + +static void U_CALLCONV +_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const uint8_t *source, *sourceLimit; + UChar *target; + const UChar *targetLimit; + int32_t *offsets; + + int32_t prev, count, diff, c; + + int8_t byteIndex; + uint8_t *bytes; + + int32_t sourceIndex, nextSourceIndex; + + /* set up the local pointers */ + cnv=pArgs->converter; + source=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + target=pArgs->target; + targetLimit=pArgs->targetLimit; + offsets=pArgs->offsets; + + /* get the converter state from UConverter */ + prev=(int32_t)cnv->toUnicodeStatus; + if(prev==0) { + prev=BOCU1_ASCII_PREV; + } + diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ + count=diff&3; + diff>>=2; + + byteIndex=cnv->toULength; + bytes=cnv->toUBytes; + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex=byteIndex==0 ? 0 : -1; + nextSourceIndex=0; + + /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ + if(count>0 && byteIndex>0 && target<targetLimit) { + goto getTrail; + } + +fastSingle: + /* fast loop for single-byte differences */ + /* use count as the only loop counter variable */ + diff=(int32_t)(sourceLimit-source); + count=(int32_t)(pArgs->targetLimit-target); + if(count>diff) { + count=diff; + } + while(count>0) { + if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { + c=prev+(c-BOCU1_MIDDLE); + if(c<0x3000) { + *target++=(UChar)c; + *offsets++=nextSourceIndex++; + prev=BOCU1_SIMPLE_PREV(c); + } else { + break; + } + } else if(c<=0x20) { + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + *target++=(UChar)c; + *offsets++=nextSourceIndex++; + } else { + break; + } + ++source; + --count; + } + sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ + + /* decode a sequence of single and lead bytes */ + while(source<sourceLimit) { + if(target>=targetLimit) { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + + ++nextSourceIndex; + c=*source++; + if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { + /* Write a code point directly from a single-byte difference. */ + c=prev+(c-BOCU1_MIDDLE); + if(c<0x3000) { + *target++=(UChar)c; + *offsets++=sourceIndex; + prev=BOCU1_SIMPLE_PREV(c); + sourceIndex=nextSourceIndex; + goto fastSingle; + } + } else if(c<=0x20) { + /* + * Direct-encoded C0 control code or space. + * Reset prev for C0 control codes but not for space. + */ + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + *target++=(UChar)c; + *offsets++=sourceIndex; + sourceIndex=nextSourceIndex; + continue; + } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { + /* Optimize two-byte case. */ + if(c>=BOCU1_MIDDLE) { + diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; + } else { + diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; + } + + /* trail byte */ + ++nextSourceIndex; + c=decodeBocu1TrailByte(1, *source++); + if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { + bytes[0]=source[-2]; + bytes[1]=source[-1]; + byteIndex=2; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; + } + } else if(c==BOCU1_RESET) { + /* only reset the state, no code point */ + prev=BOCU1_ASCII_PREV; + sourceIndex=nextSourceIndex; + continue; + } else { + /* + * For multi-byte difference lead bytes, set the decoder state + * with the partial difference value from the lead byte and + * with the number of trail bytes. + */ + bytes[0]=(uint8_t)c; + byteIndex=1; + + diff=decodeBocu1LeadByte(c); + count=diff&3; + diff>>=2; +getTrail: + for(;;) { + if(source>=sourceLimit) { + goto endloop; + } + ++nextSourceIndex; + c=bytes[byteIndex++]=*source++; + + /* trail byte in any position */ + c=decodeBocu1TrailByte(count, c); + if(c<0) { + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + goto endloop; + } + + diff+=c; + if(--count==0) { + /* final trail byte, deliver a code point */ + byteIndex=0; + c=prev+diff; + if((uint32_t)c>0x10ffff) { + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + goto endloop; + } + break; + } + } + } + + /* calculate the next prev and output c */ + prev=BOCU1_PREV(c); + if(c<=0xffff) { + *target++=(UChar)c; + *offsets++=sourceIndex; + } else { + /* output surrogate pair */ + *target++=U16_LEAD(c); + if(target<targetLimit) { + *target++=U16_TRAIL(c); + *offsets++=sourceIndex; + *offsets++=sourceIndex; + } else { + /* target overflow */ + *offsets++=sourceIndex; + cnv->UCharErrorBuffer[0]=U16_TRAIL(c); + cnv->UCharErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } + sourceIndex=nextSourceIndex; + } +endloop: + + if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { + /* set the converter state in UConverter to deal with the next character */ + cnv->toUnicodeStatus=BOCU1_ASCII_PREV; + cnv->mode=0; + } else { + /* set the converter state back into UConverter */ + cnv->toUnicodeStatus=(uint32_t)prev; + cnv->mode=(diff<<2)|count; + } + cnv->toULength=byteIndex; + + /* write back the updated pointers */ + pArgs->source=(const char *)source; + pArgs->target=target; + pArgs->offsets=offsets; + return; +} + +/* + * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. + * If a change is made in the original function, then either + * change this function the same way or + * re-copy the original function and remove the variables + * offsets, sourceIndex, and nextSourceIndex. + */ +static void U_CALLCONV +_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const uint8_t *source, *sourceLimit; + UChar *target; + const UChar *targetLimit; + + int32_t prev, count, diff, c; + + int8_t byteIndex; + uint8_t *bytes; + + /* set up the local pointers */ + cnv=pArgs->converter; + source=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + target=pArgs->target; + targetLimit=pArgs->targetLimit; + + /* get the converter state from UConverter */ + prev=(int32_t)cnv->toUnicodeStatus; + if(prev==0) { + prev=BOCU1_ASCII_PREV; + } + diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ + count=diff&3; + diff>>=2; + + byteIndex=cnv->toULength; + bytes=cnv->toUBytes; + + /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ + if(count>0 && byteIndex>0 && target<targetLimit) { + goto getTrail; + } + +fastSingle: + /* fast loop for single-byte differences */ + /* use count as the only loop counter variable */ + diff=(int32_t)(sourceLimit-source); + count=(int32_t)(pArgs->targetLimit-target); + if(count>diff) { + count=diff; + } + while(count>0) { + if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { + c=prev+(c-BOCU1_MIDDLE); + if(c<0x3000) { + *target++=(UChar)c; + prev=BOCU1_SIMPLE_PREV(c); + } else { + break; + } + } else if(c<=0x20) { + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + *target++=(UChar)c; + } else { + break; + } + ++source; + --count; + } + + /* decode a sequence of single and lead bytes */ + while(source<sourceLimit) { + if(target>=targetLimit) { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + + c=*source++; + if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { + /* Write a code point directly from a single-byte difference. */ + c=prev+(c-BOCU1_MIDDLE); + if(c<0x3000) { + *target++=(UChar)c; + prev=BOCU1_SIMPLE_PREV(c); + goto fastSingle; + } + } else if(c<=0x20) { + /* + * Direct-encoded C0 control code or space. + * Reset prev for C0 control codes but not for space. + */ + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + *target++=(UChar)c; + continue; + } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { + /* Optimize two-byte case. */ + if(c>=BOCU1_MIDDLE) { + diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; + } else { + diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; + } + + /* trail byte */ + c=decodeBocu1TrailByte(1, *source++); + if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { + bytes[0]=source[-2]; + bytes[1]=source[-1]; + byteIndex=2; + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + break; + } + } else if(c==BOCU1_RESET) { + /* only reset the state, no code point */ + prev=BOCU1_ASCII_PREV; + continue; + } else { + /* + * For multi-byte difference lead bytes, set the decoder state + * with the partial difference value from the lead byte and + * with the number of trail bytes. + */ + bytes[0]=(uint8_t)c; + byteIndex=1; + + diff=decodeBocu1LeadByte(c); + count=diff&3; + diff>>=2; +getTrail: + for(;;) { + if(source>=sourceLimit) { + goto endloop; + } + c=bytes[byteIndex++]=*source++; + + /* trail byte in any position */ + c=decodeBocu1TrailByte(count, c); + if(c<0) { + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + goto endloop; + } + + diff+=c; + if(--count==0) { + /* final trail byte, deliver a code point */ + byteIndex=0; + c=prev+diff; + if((uint32_t)c>0x10ffff) { + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + goto endloop; + } + break; + } + } + } + + /* calculate the next prev and output c */ + prev=BOCU1_PREV(c); + if(c<=0xffff) { + *target++=(UChar)c; + } else { + /* output surrogate pair */ + *target++=U16_LEAD(c); + if(target<targetLimit) { + *target++=U16_TRAIL(c); + } else { + /* target overflow */ + cnv->UCharErrorBuffer[0]=U16_TRAIL(c); + cnv->UCharErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } +endloop: + + if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { + /* set the converter state in UConverter to deal with the next character */ + cnv->toUnicodeStatus=BOCU1_ASCII_PREV; + cnv->mode=0; + } else { + /* set the converter state back into UConverter */ + cnv->toUnicodeStatus=(uint32_t)prev; + cnv->mode=(diff<<2)|count; + } + cnv->toULength=byteIndex; + + /* write back the updated pointers */ + pArgs->source=(const char *)source; + pArgs->target=target; + return; +} + +/* miscellaneous ------------------------------------------------------------ */ + +static const UConverterImpl _Bocu1Impl={ + UCNV_BOCU1, + + NULL, + NULL, + + NULL, + NULL, + NULL, + + _Bocu1ToUnicode, + _Bocu1ToUnicodeWithOffsets, + _Bocu1FromUnicode, + _Bocu1FromUnicodeWithOffsets, + NULL, + + NULL, + NULL, + NULL, + NULL, + ucnv_getCompleteUnicodeSet, + + NULL, + NULL +}; + +static const UConverterStaticData _Bocu1StaticData={ + sizeof(UConverterStaticData), + "BOCU-1", + 1214, /* CCSID for BOCU-1 */ + UCNV_IBM, UCNV_BOCU1, + 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ + { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ + FALSE, FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +const UConverterSharedData _Bocu1Data= + UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl); + +#endif |