aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/icu/common/bmpset.cpp
diff options
context:
space:
mode:
authorneksard <neksard@yandex-team.ru>2022-02-10 16:45:23 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:45:23 +0300
commit8f7cf138264e0caa318144bf8a2c950e0b0a8593 (patch)
tree83bf5c8c8047c42d8475e6095df90ccdc3d1b57f /contrib/libs/icu/common/bmpset.cpp
parentd3a398281c6fd1d3672036cb2d63f842d2cb28c5 (diff)
downloadydb-8f7cf138264e0caa318144bf8a2c950e0b0a8593.tar.gz
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/common/bmpset.cpp')
-rw-r--r--contrib/libs/icu/common/bmpset.cpp1348
1 files changed, 674 insertions, 674 deletions
diff --git a/contrib/libs/icu/common/bmpset.cpp b/contrib/libs/icu/common/bmpset.cpp
index bc79f5e5a6..2cb4f56cbe 100644
--- a/contrib/libs/icu/common/bmpset.cpp
+++ b/contrib/libs/icu/common/bmpset.cpp
@@ -1,143 +1,143 @@
// © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-******************************************************************************
-*
-* Copyright (C) 2007-2012, International Business Machines
-* Corporation and others. All Rights Reserved.
-*
-******************************************************************************
-* file name: bmpset.cpp
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+******************************************************************************
+*
+* Copyright (C) 2007-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: bmpset.cpp
* encoding: UTF-8
-* tab size: 8 (not used)
-* indentation:4
-*
-* created on: 2007jan29
-* created by: Markus W. Scherer
-*/
-
-#include "unicode/utypes.h"
-#include "unicode/uniset.h"
-#include "unicode/utf8.h"
-#include "unicode/utf16.h"
-#include "cmemory.h"
-#include "bmpset.h"
-#include "uassert.h"
-
-U_NAMESPACE_BEGIN
-
-BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
- list(parentList), listLength(parentListLength) {
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2007jan29
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/uniset.h"
+#include "unicode/utf8.h"
+#include "unicode/utf16.h"
+#include "cmemory.h"
+#include "bmpset.h"
+#include "uassert.h"
+
+U_NAMESPACE_BEGIN
+
+BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
+ list(parentList), listLength(parentListLength) {
uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
- uprv_memset(table7FF, 0, sizeof(table7FF));
- uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
-
- /*
- * Set the list indexes for binary searches for
- * U+0800, U+1000, U+2000, .., U+F000, U+10000.
- * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
- * looked up in the bit tables.
- * The last pair of indexes is for finding supplementary code points.
- */
- list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
- int32_t i;
- for(i=1; i<=0x10; ++i) {
- list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
- }
- list4kStarts[0x11]=listLength-1;
+ uprv_memset(table7FF, 0, sizeof(table7FF));
+ uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
+
+ /*
+ * Set the list indexes for binary searches for
+ * U+0800, U+1000, U+2000, .., U+F000, U+10000.
+ * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
+ * looked up in the bit tables.
+ * The last pair of indexes is for finding supplementary code points.
+ */
+ list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
+ int32_t i;
+ for(i=1; i<=0x10; ++i) {
+ list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
+ }
+ list4kStarts[0x11]=listLength-1;
containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
-
- initBits();
- overrideIllegal();
-}
-
-BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
+
+ initBits();
+ overrideIllegal();
+}
+
+BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
containsFFFD(otherBMPSet.containsFFFD),
- list(newParentList), listLength(newParentListLength) {
+ list(newParentList), listLength(newParentListLength) {
uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
- uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
- uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
- uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
-}
-
-BMPSet::~BMPSet() {
-}
-
-/*
- * Set bits in a bit rectangle in "vertical" bit organization.
- * start<limit<=0x800
- */
-static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
- U_ASSERT(start<limit);
- U_ASSERT(limit<=0x800);
-
- int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
- int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
-
- // Set one bit indicating an all-one block.
- uint32_t bits=(uint32_t)1<<lead;
- if((start+1)==limit) { // Single-character shortcut.
- table[trail]|=bits;
- return;
- }
-
- int32_t limitLead=limit>>6;
- int32_t limitTrail=limit&0x3f;
-
- if(lead==limitLead) {
- // Partial vertical bit column.
- while(trail<limitTrail) {
- table[trail++]|=bits;
- }
- } else {
- // Partial vertical bit column,
- // followed by a bit rectangle,
- // followed by another partial vertical bit column.
- if(trail>0) {
- do {
- table[trail++]|=bits;
- } while(trail<64);
- ++lead;
- }
- if(lead<limitLead) {
+ uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
+ uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
+ uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
+}
+
+BMPSet::~BMPSet() {
+}
+
+/*
+ * Set bits in a bit rectangle in "vertical" bit organization.
+ * start<limit<=0x800
+ */
+static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
+ U_ASSERT(start<limit);
+ U_ASSERT(limit<=0x800);
+
+ int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
+ int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
+
+ // Set one bit indicating an all-one block.
+ uint32_t bits=(uint32_t)1<<lead;
+ if((start+1)==limit) { // Single-character shortcut.
+ table[trail]|=bits;
+ return;
+ }
+
+ int32_t limitLead=limit>>6;
+ int32_t limitTrail=limit&0x3f;
+
+ if(lead==limitLead) {
+ // Partial vertical bit column.
+ while(trail<limitTrail) {
+ table[trail++]|=bits;
+ }
+ } else {
+ // Partial vertical bit column,
+ // followed by a bit rectangle,
+ // followed by another partial vertical bit column.
+ if(trail>0) {
+ do {
+ table[trail++]|=bits;
+ } while(trail<64);
+ ++lead;
+ }
+ if(lead<limitLead) {
bits=~(((unsigned)1<<lead)-1);
- if(limitLead<0x20) {
+ if(limitLead<0x20) {
bits&=((unsigned)1<<limitLead)-1;
- }
- for(trail=0; trail<64; ++trail) {
- table[trail]|=bits;
- }
- }
- // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
- // In that case, bits=1<<limitLead is undefined but the bits value
- // is not used because trail<limitTrail is already false.
- bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
- for(trail=0; trail<limitTrail; ++trail) {
- table[trail]|=bits;
- }
- }
-}
-
-void BMPSet::initBits() {
- UChar32 start, limit;
- int32_t listIndex=0;
-
+ }
+ for(trail=0; trail<64; ++trail) {
+ table[trail]|=bits;
+ }
+ }
+ // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
+ // In that case, bits=1<<limitLead is undefined but the bits value
+ // is not used because trail<limitTrail is already false.
+ bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
+ for(trail=0; trail<limitTrail; ++trail) {
+ table[trail]|=bits;
+ }
+ }
+}
+
+void BMPSet::initBits() {
+ UChar32 start, limit;
+ int32_t listIndex=0;
+
// Set latin1Contains[].
- do {
- start=list[listIndex++];
- if(listIndex<listLength) {
- limit=list[listIndex++];
- } else {
- limit=0x110000;
- }
+ do {
+ start=list[listIndex++];
+ if(listIndex<listLength) {
+ limit=list[listIndex++];
+ } else {
+ limit=0x110000;
+ }
if(start>=0x100) {
- break;
- }
- do {
+ break;
+ }
+ do {
latin1Contains[start++]=1;
} while(start<limit && start<0x100);
} while(limit<=0x100);
-
+
// Find the first range overlapping with (or after) 80..FF again,
// to include them in table7FF as well.
for(listIndex=0;;) {
@@ -155,587 +155,587 @@ void BMPSet::initBits() {
}
}
- // Set table7FF[].
- while(start<0x800) {
- set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
- if(limit>0x800) {
- start=0x800;
- break;
- }
-
- start=list[listIndex++];
- if(listIndex<listLength) {
- limit=list[listIndex++];
- } else {
- limit=0x110000;
- }
- }
-
- // Set bmpBlockBits[].
- int32_t minStart=0x800;
- while(start<0x10000) {
- if(limit>0x10000) {
- limit=0x10000;
- }
-
- if(start<minStart) {
- start=minStart;
- }
- if(start<limit) { // Else: Another range entirely in a known mixed-value block.
- if(start&0x3f) {
- // Mixed-value block of 64 code points.
- start>>=6;
- bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
- start=(start+1)<<6; // Round up to the next block boundary.
- minStart=start; // Ignore further ranges in this block.
- }
- if(start<limit) {
- if(start<(limit&~0x3f)) {
- // Multiple all-ones blocks of 64 code points each.
- set32x64Bits(bmpBlockBits, start>>6, limit>>6);
- }
-
- if(limit&0x3f) {
- // Mixed-value block of 64 code points.
- limit>>=6;
- bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
- limit=(limit+1)<<6; // Round up to the next block boundary.
- minStart=limit; // Ignore further ranges in this block.
- }
- }
- }
-
- if(limit==0x10000) {
- break;
- }
-
- start=list[listIndex++];
- if(listIndex<listLength) {
- limit=list[listIndex++];
- } else {
- limit=0x110000;
- }
- }
-}
-
-/*
- * Override some bits and bytes to the result of contains(FFFD)
- * for faster validity checking at runtime.
- * No need to set 0 values where they were reset to 0 in the constructor
- * and not modified by initBits().
+ // Set table7FF[].
+ while(start<0x800) {
+ set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
+ if(limit>0x800) {
+ start=0x800;
+ break;
+ }
+
+ start=list[listIndex++];
+ if(listIndex<listLength) {
+ limit=list[listIndex++];
+ } else {
+ limit=0x110000;
+ }
+ }
+
+ // Set bmpBlockBits[].
+ int32_t minStart=0x800;
+ while(start<0x10000) {
+ if(limit>0x10000) {
+ limit=0x10000;
+ }
+
+ if(start<minStart) {
+ start=minStart;
+ }
+ if(start<limit) { // Else: Another range entirely in a known mixed-value block.
+ if(start&0x3f) {
+ // Mixed-value block of 64 code points.
+ start>>=6;
+ bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
+ start=(start+1)<<6; // Round up to the next block boundary.
+ minStart=start; // Ignore further ranges in this block.
+ }
+ if(start<limit) {
+ if(start<(limit&~0x3f)) {
+ // Multiple all-ones blocks of 64 code points each.
+ set32x64Bits(bmpBlockBits, start>>6, limit>>6);
+ }
+
+ if(limit&0x3f) {
+ // Mixed-value block of 64 code points.
+ limit>>=6;
+ bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
+ limit=(limit+1)<<6; // Round up to the next block boundary.
+ minStart=limit; // Ignore further ranges in this block.
+ }
+ }
+ }
+
+ if(limit==0x10000) {
+ break;
+ }
+
+ start=list[listIndex++];
+ if(listIndex<listLength) {
+ limit=list[listIndex++];
+ } else {
+ limit=0x110000;
+ }
+ }
+}
+
+/*
+ * Override some bits and bytes to the result of contains(FFFD)
+ * for faster validity checking at runtime.
+ * No need to set 0 values where they were reset to 0 in the constructor
+ * and not modified by initBits().
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
- * Need to set 0 values for surrogates D800..DFFF.
- */
-void BMPSet::overrideIllegal() {
- uint32_t bits, mask;
- int32_t i;
-
+ * Need to set 0 values for surrogates D800..DFFF.
+ */
+void BMPSet::overrideIllegal() {
+ uint32_t bits, mask;
+ int32_t i;
+
if(containsFFFD) {
- bits=3; // Lead bytes 0xC0 and 0xC1.
- for(i=0; i<64; ++i) {
- table7FF[i]|=bits;
- }
-
- bits=1; // Lead byte 0xE0.
- for(i=0; i<32; ++i) { // First half of 4k block.
- bmpBlockBits[i]|=bits;
- }
-
+ bits=3; // Lead bytes 0xC0 and 0xC1.
+ for(i=0; i<64; ++i) {
+ table7FF[i]|=bits;
+ }
+
+ bits=1; // Lead byte 0xE0.
+ for(i=0; i<32; ++i) { // First half of 4k block.
+ bmpBlockBits[i]|=bits;
+ }
+
mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
- bits=1<<0xd;
- for(i=32; i<64; ++i) { // Second half of 4k block.
- bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
- }
- } else {
+ bits=1<<0xd;
+ for(i=32; i<64; ++i) { // Second half of 4k block.
+ bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
+ }
+ } else {
mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
- for(i=32; i<64; ++i) { // Second half of 4k block.
- bmpBlockBits[i]&=mask;
- }
- }
-}
-
-int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
- /* Examples:
- findCodePoint(c)
- set list[] c=0 1 3 4 7 8
- === ============== ===========
- [] [110000] 0 0 0 0 0 0
- [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
- [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
- [:Any:] [0, 110000] 1 1 1 1 1 1
- */
-
- // Return the smallest i such that c < list[i]. Assume
- // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
- if (c < list[lo])
- return lo;
- // High runner test. c is often after the last range, so an
- // initial check for this condition pays off.
- if (lo >= hi || c >= list[hi-1])
- return hi;
- // invariant: c >= list[lo]
- // invariant: c < list[hi]
- for (;;) {
- int32_t i = (lo + hi) >> 1;
- if (i == lo) {
- break; // Found!
- } else if (c < list[i]) {
- hi = i;
- } else {
- lo = i;
- }
- }
- return hi;
-}
-
-UBool
-BMPSet::contains(UChar32 c) const {
+ for(i=32; i<64; ++i) { // Second half of 4k block.
+ bmpBlockBits[i]&=mask;
+ }
+ }
+}
+
+int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
+ /* Examples:
+ findCodePoint(c)
+ set list[] c=0 1 3 4 7 8
+ === ============== ===========
+ [] [110000] 0 0 0 0 0 0
+ [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
+ [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
+ [:Any:] [0, 110000] 1 1 1 1 1 1
+ */
+
+ // Return the smallest i such that c < list[i]. Assume
+ // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
+ if (c < list[lo])
+ return lo;
+ // High runner test. c is often after the last range, so an
+ // initial check for this condition pays off.
+ if (lo >= hi || c >= list[hi-1])
+ return hi;
+ // invariant: c >= list[lo]
+ // invariant: c < list[hi]
+ for (;;) {
+ int32_t i = (lo + hi) >> 1;
+ if (i == lo) {
+ break; // Found!
+ } else if (c < list[i]) {
+ hi = i;
+ } else {
+ lo = i;
+ }
+ }
+ return hi;
+}
+
+UBool
+BMPSet::contains(UChar32 c) const {
if((uint32_t)c<=0xff) {
return (UBool)latin1Contains[c];
- } else if((uint32_t)c<=0x7ff) {
- return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
- } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
- int lead=c>>12;
- uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
- if(twoBits<=1) {
- // All 64 code points with the same bits 15..6
- // are either in the set or not.
- return (UBool)twoBits;
- } else {
- // Look up the code point in its 4k block of code points.
- return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
- }
- } else if((uint32_t)c<=0x10ffff) {
- // surrogate or supplementary code point
- return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
- } else {
- // Out-of-range code points get FALSE, consistent with long-standing
- // behavior of UnicodeSet::contains(c).
- return FALSE;
- }
-}
-
-/*
- * Check for sufficient length for trail unit for each surrogate pair.
- * Handle single surrogates as surrogate code points as usual in ICU.
- */
-const UChar *
-BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
- UChar c, c2;
-
- if(spanCondition) {
- // span
- do {
- c=*s;
+ } else if((uint32_t)c<=0x7ff) {
+ return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
+ } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
+ int lead=c>>12;
+ uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
+ if(twoBits<=1) {
+ // All 64 code points with the same bits 15..6
+ // are either in the set or not.
+ return (UBool)twoBits;
+ } else {
+ // Look up the code point in its 4k block of code points.
+ return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
+ }
+ } else if((uint32_t)c<=0x10ffff) {
+ // surrogate or supplementary code point
+ return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
+ } else {
+ // Out-of-range code points get FALSE, consistent with long-standing
+ // behavior of UnicodeSet::contains(c).
+ return FALSE;
+ }
+}
+
+/*
+ * Check for sufficient length for trail unit for each surrogate pair.
+ * Handle single surrogates as surrogate code points as usual in ICU.
+ */
+const UChar *
+BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
+ UChar c, c2;
+
+ if(spanCondition) {
+ // span
+ do {
+ c=*s;
if(c<=0xff) {
if(!latin1Contains[c]) {
- break;
- }
- } else if(c<=0x7ff) {
- if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
- break;
- }
- } else if(c<0xd800 || c>=0xe000) {
- int lead=c>>12;
- uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
- if(twoBits<=1) {
- // All 64 code points with the same bits 15..6
- // are either in the set or not.
- if(twoBits==0) {
- break;
- }
- } else {
- // Look up the code point in its 4k block of code points.
- if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
- break;
- }
- }
- } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
- // surrogate code point
- if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
- break;
- }
- } else {
- // surrogate pair
- if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
- break;
- }
- ++s;
- }
- } while(++s<limit);
- } else {
- // span not
- do {
- c=*s;
+ break;
+ }
+ } else if(c<=0x7ff) {
+ if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
+ break;
+ }
+ } else if(c<0xd800 || c>=0xe000) {
+ int lead=c>>12;
+ uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
+ if(twoBits<=1) {
+ // All 64 code points with the same bits 15..6
+ // are either in the set or not.
+ if(twoBits==0) {
+ break;
+ }
+ } else {
+ // Look up the code point in its 4k block of code points.
+ if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
+ break;
+ }
+ }
+ } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
+ // surrogate code point
+ if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
+ break;
+ }
+ } else {
+ // surrogate pair
+ if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
+ break;
+ }
+ ++s;
+ }
+ } while(++s<limit);
+ } else {
+ // span not
+ do {
+ c=*s;
if(c<=0xff) {
if(latin1Contains[c]) {
- break;
- }
- } else if(c<=0x7ff) {
- if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
- break;
- }
- } else if(c<0xd800 || c>=0xe000) {
- int lead=c>>12;
- uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
- if(twoBits<=1) {
- // All 64 code points with the same bits 15..6
- // are either in the set or not.
- if(twoBits!=0) {
- break;
- }
- } else {
- // Look up the code point in its 4k block of code points.
- if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
- break;
- }
- }
- } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
- // surrogate code point
- if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
- break;
- }
- } else {
- // surrogate pair
- if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
- break;
- }
- ++s;
- }
- } while(++s<limit);
- }
- return s;
-}
-
-/* Symmetrical with span(). */
-const UChar *
-BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
- UChar c, c2;
-
- if(spanCondition) {
- // span
- for(;;) {
- c=*(--limit);
+ break;
+ }
+ } else if(c<=0x7ff) {
+ if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
+ break;
+ }
+ } else if(c<0xd800 || c>=0xe000) {
+ int lead=c>>12;
+ uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
+ if(twoBits<=1) {
+ // All 64 code points with the same bits 15..6
+ // are either in the set or not.
+ if(twoBits!=0) {
+ break;
+ }
+ } else {
+ // Look up the code point in its 4k block of code points.
+ if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
+ break;
+ }
+ }
+ } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
+ // surrogate code point
+ if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
+ break;
+ }
+ } else {
+ // surrogate pair
+ if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
+ break;
+ }
+ ++s;
+ }
+ } while(++s<limit);
+ }
+ return s;
+}
+
+/* Symmetrical with span(). */
+const UChar *
+BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
+ UChar c, c2;
+
+ if(spanCondition) {
+ // span
+ for(;;) {
+ c=*(--limit);
if(c<=0xff) {
if(!latin1Contains[c]) {
- break;
- }
- } else if(c<=0x7ff) {
- if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
- break;
- }
- } else if(c<0xd800 || c>=0xe000) {
- int lead=c>>12;
- uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
- if(twoBits<=1) {
- // All 64 code points with the same bits 15..6
- // are either in the set or not.
- if(twoBits==0) {
- break;
- }
- } else {
- // Look up the code point in its 4k block of code points.
- if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
- break;
- }
- }
- } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
- // surrogate code point
- if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
- break;
- }
- } else {
- // surrogate pair
- if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
- break;
- }
- --limit;
- }
- if(s==limit) {
- return s;
- }
- }
- } else {
- // span not
- for(;;) {
- c=*(--limit);
+ break;
+ }
+ } else if(c<=0x7ff) {
+ if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
+ break;
+ }
+ } else if(c<0xd800 || c>=0xe000) {
+ int lead=c>>12;
+ uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
+ if(twoBits<=1) {
+ // All 64 code points with the same bits 15..6
+ // are either in the set or not.
+ if(twoBits==0) {
+ break;
+ }
+ } else {
+ // Look up the code point in its 4k block of code points.
+ if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
+ break;
+ }
+ }
+ } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
+ // surrogate code point
+ if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
+ break;
+ }
+ } else {
+ // surrogate pair
+ if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
+ break;
+ }
+ --limit;
+ }
+ if(s==limit) {
+ return s;
+ }
+ }
+ } else {
+ // span not
+ for(;;) {
+ c=*(--limit);
if(c<=0xff) {
if(latin1Contains[c]) {
- break;
- }
- } else if(c<=0x7ff) {
- if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
- break;
- }
- } else if(c<0xd800 || c>=0xe000) {
- int lead=c>>12;
- uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
- if(twoBits<=1) {
- // All 64 code points with the same bits 15..6
- // are either in the set or not.
- if(twoBits!=0) {
- break;
- }
- } else {
- // Look up the code point in its 4k block of code points.
- if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
- break;
- }
- }
- } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
- // surrogate code point
- if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
- break;
- }
- } else {
- // surrogate pair
- if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
- break;
- }
- --limit;
- }
- if(s==limit) {
- return s;
- }
- }
- }
- return limit+1;
-}
-
-/*
- * Precheck for sufficient trail bytes at end of string only once per span.
- * Check validity.
- */
-const uint8_t *
-BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
- const uint8_t *limit=s+length;
- uint8_t b=*s;
+ break;
+ }
+ } else if(c<=0x7ff) {
+ if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
+ break;
+ }
+ } else if(c<0xd800 || c>=0xe000) {
+ int lead=c>>12;
+ uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
+ if(twoBits<=1) {
+ // All 64 code points with the same bits 15..6
+ // are either in the set or not.
+ if(twoBits!=0) {
+ break;
+ }
+ } else {
+ // Look up the code point in its 4k block of code points.
+ if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
+ break;
+ }
+ }
+ } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
+ // surrogate code point
+ if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
+ break;
+ }
+ } else {
+ // surrogate pair
+ if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
+ break;
+ }
+ --limit;
+ }
+ if(s==limit) {
+ return s;
+ }
+ }
+ }
+ return limit+1;
+}
+
+/*
+ * Precheck for sufficient trail bytes at end of string only once per span.
+ * Check validity.
+ */
+const uint8_t *
+BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
+ const uint8_t *limit=s+length;
+ uint8_t b=*s;
if(U8_IS_SINGLE(b)) {
- // Initial all-ASCII span.
- if(spanCondition) {
- do {
+ // Initial all-ASCII span.
+ if(spanCondition) {
+ do {
if(!latin1Contains[b] || ++s==limit) {
- return s;
- }
- b=*s;
+ return s;
+ }
+ b=*s;
} while(U8_IS_SINGLE(b));
- } else {
- do {
+ } else {
+ do {
if(latin1Contains[b] || ++s==limit) {
- return s;
- }
- b=*s;
+ return s;
+ }
+ b=*s;
} while(U8_IS_SINGLE(b));
- }
- length=(int32_t)(limit-s);
- }
-
- if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
- spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
- }
-
- const uint8_t *limit0=limit;
-
- /*
- * Make sure that the last 1/2/3/4-byte sequence before limit is complete
- * or runs into a lead byte.
- * In the span loop compare s with limit only once
- * per multi-byte character.
- *
- * Give a trailing illegal sequence the same value as the result of contains(FFFD),
- * including it if that is part of the span, otherwise set limit0 to before
- * the truncated sequence.
- */
- b=*(limit-1);
- if((int8_t)b<0) {
- // b>=0x80: lead or trail byte
- if(b<0xc0) {
- // single trail byte, check for preceding 3- or 4-byte lead byte
- if(length>=2 && (b=*(limit-2))>=0xe0) {
- limit-=2;
+ }
+ length=(int32_t)(limit-s);
+ }
+
+ if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
+ spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
+ }
+
+ const uint8_t *limit0=limit;
+
+ /*
+ * Make sure that the last 1/2/3/4-byte sequence before limit is complete
+ * or runs into a lead byte.
+ * In the span loop compare s with limit only once
+ * per multi-byte character.
+ *
+ * Give a trailing illegal sequence the same value as the result of contains(FFFD),
+ * including it if that is part of the span, otherwise set limit0 to before
+ * the truncated sequence.
+ */
+ b=*(limit-1);
+ if((int8_t)b<0) {
+ // b>=0x80: lead or trail byte
+ if(b<0xc0) {
+ // single trail byte, check for preceding 3- or 4-byte lead byte
+ if(length>=2 && (b=*(limit-2))>=0xe0) {
+ limit-=2;
if(containsFFFD!=spanCondition) {
- limit0=limit;
- }
- } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
- // 4-byte lead byte with only two trail bytes
- limit-=3;
+ limit0=limit;
+ }
+ } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
+ // 4-byte lead byte with only two trail bytes
+ limit-=3;
if(containsFFFD!=spanCondition) {
- limit0=limit;
- }
- }
- } else {
- // lead byte with no trail bytes
- --limit;
+ limit0=limit;
+ }
+ }
+ } else {
+ // lead byte with no trail bytes
+ --limit;
if(containsFFFD!=spanCondition) {
- limit0=limit;
- }
- }
- }
-
- uint8_t t1, t2, t3;
-
- while(s<limit) {
- b=*s;
+ limit0=limit;
+ }
+ }
+ }
+
+ uint8_t t1, t2, t3;
+
+ while(s<limit) {
+ b=*s;
if(U8_IS_SINGLE(b)) {
// ASCII
- if(spanCondition) {
- do {
+ if(spanCondition) {
+ do {
if(!latin1Contains[b]) {
- return s;
- } else if(++s==limit) {
- return limit0;
- }
- b=*s;
+ return s;
+ } else if(++s==limit) {
+ return limit0;
+ }
+ b=*s;
} while(U8_IS_SINGLE(b));
- } else {
- do {
+ } else {
+ do {
if(latin1Contains[b]) {
- return s;
- } else if(++s==limit) {
- return limit0;
- }
- b=*s;
+ return s;
+ } else if(++s==limit) {
+ return limit0;
+ }
+ b=*s;
} while(U8_IS_SINGLE(b));
- }
- }
- ++s; // Advance past the lead byte.
- if(b>=0xe0) {
- if(b<0xf0) {
- if( /* handle U+0000..U+FFFF inline */
- (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
- (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
- ) {
- b&=0xf;
- uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
- if(twoBits<=1) {
- // All 64 code points with this lead byte and middle trail byte
- // are either in the set or not.
- if(twoBits!=(uint32_t)spanCondition) {
- return s-1;
- }
- } else {
- // Look up the code point in its 4k block of code points.
- UChar32 c=(b<<12)|(t1<<6)|t2;
- if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
- return s-1;
- }
- }
- s+=2;
- continue;
- }
- } else if( /* handle U+10000..U+10FFFF inline */
- (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
- (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
- (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
- ) {
- // Give an illegal sequence the same value as the result of contains(FFFD).
- UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
- if( ( (0x10000<=c && c<=0x10ffff) ?
- containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
+ }
+ }
+ ++s; // Advance past the lead byte.
+ if(b>=0xe0) {
+ if(b<0xf0) {
+ if( /* handle U+0000..U+FFFF inline */
+ (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
+ (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
+ ) {
+ b&=0xf;
+ uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
+ if(twoBits<=1) {
+ // All 64 code points with this lead byte and middle trail byte
+ // are either in the set or not.
+ if(twoBits!=(uint32_t)spanCondition) {
+ return s-1;
+ }
+ } else {
+ // Look up the code point in its 4k block of code points.
+ UChar32 c=(b<<12)|(t1<<6)|t2;
+ if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
+ return s-1;
+ }
+ }
+ s+=2;
+ continue;
+ }
+ } else if( /* handle U+10000..U+10FFFF inline */
+ (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
+ (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
+ (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
+ ) {
+ // Give an illegal sequence the same value as the result of contains(FFFD).
+ UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
+ if( ( (0x10000<=c && c<=0x10ffff) ?
+ containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
containsFFFD
- ) != spanCondition
- ) {
- return s-1;
- }
- s+=3;
- continue;
- }
+ ) != spanCondition
+ ) {
+ return s-1;
+ }
+ s+=3;
+ continue;
+ }
} else {
- if( /* handle U+0000..U+07FF inline */
+ if( /* handle U+0000..U+07FF inline */
b>=0xc0 &&
- (t1=(uint8_t)(*s-0x80)) <= 0x3f
- ) {
- if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
- return s-1;
- }
- ++s;
- continue;
- }
- }
-
- // Give an illegal sequence the same value as the result of contains(FFFD).
- // Handle each byte of an illegal sequence separately to simplify the code;
- // no need to optimize error handling.
+ (t1=(uint8_t)(*s-0x80)) <= 0x3f
+ ) {
+ if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
+ return s-1;
+ }
+ ++s;
+ continue;
+ }
+ }
+
+ // Give an illegal sequence the same value as the result of contains(FFFD).
+ // Handle each byte of an illegal sequence separately to simplify the code;
+ // no need to optimize error handling.
if(containsFFFD!=spanCondition) {
- return s-1;
- }
- }
-
- return limit0;
-}
-
-/*
- * While going backwards through UTF-8 optimize only for ASCII.
- * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
- * possible to tell from the last byte in a multi-byte sequence how many
- * preceding bytes there should be. Therefore, going backwards through UTF-8
- * is much harder than going forward.
- */
-int32_t
-BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
- if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
- spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
- }
-
- uint8_t b;
-
- do {
- b=s[--length];
+ return s-1;
+ }
+ }
+
+ return limit0;
+}
+
+/*
+ * While going backwards through UTF-8 optimize only for ASCII.
+ * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
+ * possible to tell from the last byte in a multi-byte sequence how many
+ * preceding bytes there should be. Therefore, going backwards through UTF-8
+ * is much harder than going forward.
+ */
+int32_t
+BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
+ if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
+ spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
+ }
+
+ uint8_t b;
+
+ do {
+ b=s[--length];
if(U8_IS_SINGLE(b)) {
- // ASCII sub-span
- if(spanCondition) {
- do {
+ // ASCII sub-span
+ if(spanCondition) {
+ do {
if(!latin1Contains[b]) {
- return length+1;
- } else if(length==0) {
- return 0;
- }
- b=s[--length];
+ return length+1;
+ } else if(length==0) {
+ return 0;
+ }
+ b=s[--length];
} while(U8_IS_SINGLE(b));
- } else {
- do {
+ } else {
+ do {
if(latin1Contains[b]) {
- return length+1;
- } else if(length==0) {
- return 0;
- }
- b=s[--length];
+ return length+1;
+ } else if(length==0) {
+ return 0;
+ }
+ b=s[--length];
} while(U8_IS_SINGLE(b));
- }
- }
-
- int32_t prev=length;
- UChar32 c;
- // trail byte: collect a multi-byte character
- // (or lead byte in last-trail position)
- c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
- // c is a valid code point, not ASCII, not a surrogate
- if(c<=0x7ff) {
- if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
- return prev+1;
- }
- } else if(c<=0xffff) {
- int lead=c>>12;
- uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
- if(twoBits<=1) {
- // All 64 code points with the same bits 15..6
- // are either in the set or not.
- if(twoBits!=(uint32_t)spanCondition) {
- return prev+1;
- }
- } else {
- // Look up the code point in its 4k block of code points.
- if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
- return prev+1;
- }
- }
- } else {
- if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
- return prev+1;
- }
- }
- } while(length>0);
- return 0;
-}
-
-U_NAMESPACE_END
+ }
+ }
+
+ int32_t prev=length;
+ UChar32 c;
+ // trail byte: collect a multi-byte character
+ // (or lead byte in last-trail position)
+ c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
+ // c is a valid code point, not ASCII, not a surrogate
+ if(c<=0x7ff) {
+ if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
+ return prev+1;
+ }
+ } else if(c<=0xffff) {
+ int lead=c>>12;
+ uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
+ if(twoBits<=1) {
+ // All 64 code points with the same bits 15..6
+ // are either in the set or not.
+ if(twoBits!=(uint32_t)spanCondition) {
+ return prev+1;
+ }
+ } else {
+ // Look up the code point in its 4k block of code points.
+ if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
+ return prev+1;
+ }
+ }
+ } else {
+ if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
+ return prev+1;
+ }
+ }
+ } while(length>0);
+ return 0;
+}
+
+U_NAMESPACE_END