summaryrefslogtreecommitdiffstats
path: root/contrib/libs/icu/common/unisetspan.cpp
diff options
context:
space:
mode:
authorneksard <[email protected]>2022-02-10 16:45:33 +0300
committerDaniil Cherednik <[email protected]>2022-02-10 16:45:33 +0300
commit1d9c550e7c38e051d7961f576013a482003a70d9 (patch)
treeb2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/common/unisetspan.cpp
parent8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff)
Restoring authorship annotation for <[email protected]>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/common/unisetspan.cpp')
-rw-r--r--contrib/libs/icu/common/unisetspan.cpp2992
1 files changed, 1496 insertions, 1496 deletions
diff --git a/contrib/libs/icu/common/unisetspan.cpp b/contrib/libs/icu/common/unisetspan.cpp
index 1490a435a1a..68e44d91ee7 100644
--- a/contrib/libs/icu/common/unisetspan.cpp
+++ b/contrib/libs/icu/common/unisetspan.cpp
@@ -1,1509 +1,1509 @@
// © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-******************************************************************************
-*
-* Copyright (C) 2007-2012, International Business Machines
-* Corporation and others. All Rights Reserved.
-*
-******************************************************************************
-* file name: unisetspan.cpp
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+******************************************************************************
+*
+* Copyright (C) 2007-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: unisetspan.cpp
* encoding: UTF-8
-* tab size: 8 (not used)
-* indentation:4
-*
-* created on: 2007mar01
-* created by: Markus W. Scherer
-*/
-
-#include "unicode/utypes.h"
-#include "unicode/uniset.h"
-#include "unicode/ustring.h"
-#include "unicode/utf8.h"
-#include "unicode/utf16.h"
-#include "cmemory.h"
-#include "uvector.h"
-#include "unisetspan.h"
-
-U_NAMESPACE_BEGIN
-
-/*
- * List of offsets from the current position from where to try matching
- * a code point or a string.
- * Store offsets rather than indexes to simplify the code and use the same list
- * for both increments (in span()) and decrements (in spanBack()).
- *
- * Assumption: The maximum offset is limited, and the offsets that are stored
- * at any one time are relatively dense, that is, there are normally no gaps of
- * hundreds or thousands of offset values.
- *
- * The implementation uses a circular buffer of byte flags,
- * each indicating whether the corresponding offset is in the list.
- * This avoids inserting into a sorted list of offsets (or absolute indexes) and
- * physically moving part of the list.
- *
- * Note: In principle, the caller should setMaxLength() to the maximum of the
- * max string length and U16_LENGTH/U8_LENGTH to account for
- * "long" single code points.
- * However, this implementation uses at least a staticList with more than
- * U8_LENGTH entries anyway.
- *
- * Note: If maxLength were guaranteed to be no more than 32 or 64,
- * the list could be stored as bit flags in a single integer.
- * Rather than handling a circular buffer with a start list index,
- * the integer would simply be shifted when lower offsets are removed.
- * UnicodeSet does not have a limit on the lengths of strings.
- */
-class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
-public:
- OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
-
- ~OffsetList() {
- if(list!=staticList) {
- uprv_free(list);
- }
- }
-
- // Call exactly once if the list is to be used.
- void setMaxLength(int32_t maxLength) {
- if(maxLength<=(int32_t)sizeof(staticList)) {
- capacity=(int32_t)sizeof(staticList);
- } else {
- UBool *l=(UBool *)uprv_malloc(maxLength);
- if(l!=NULL) {
- list=l;
- capacity=maxLength;
- }
- }
- uprv_memset(list, 0, capacity);
- }
-
- void clear() {
- uprv_memset(list, 0, capacity);
- start=length=0;
- }
-
- UBool isEmpty() const {
- return (UBool)(length==0);
- }
-
- // Reduce all stored offsets by delta, used when the current position
- // moves by delta.
- // There must not be any offsets lower than delta.
- // If there is an offset equal to delta, it is removed.
- // delta=[1..maxLength]
- void shift(int32_t delta) {
- int32_t i=start+delta;
- if(i>=capacity) {
- i-=capacity;
- }
- if(list[i]) {
- list[i]=FALSE;
- --length;
- }
- start=i;
- }
-
- // Add an offset. The list must not contain it yet.
- // offset=[1..maxLength]
- void addOffset(int32_t offset) {
- int32_t i=start+offset;
- if(i>=capacity) {
- i-=capacity;
- }
- list[i]=TRUE;
- ++length;
- }
-
- // offset=[1..maxLength]
- UBool containsOffset(int32_t offset) const {
- int32_t i=start+offset;
- if(i>=capacity) {
- i-=capacity;
- }
- return list[i];
- }
-
- // Find the lowest stored offset from a non-empty list, remove it,
- // and reduce all other offsets by this minimum.
- // Returns [1..maxLength].
- int32_t popMinimum() {
- // Look for the next offset in list[start+1..capacity-1].
- int32_t i=start, result;
- while(++i<capacity) {
- if(list[i]) {
- list[i]=FALSE;
- --length;
- result=i-start;
- start=i;
- return result;
- }
- }
- // i==capacity
-
- // Wrap around and look for the next offset in list[0..start].
- // Since the list is not empty, there will be one.
- result=capacity-start;
- i=0;
- while(!list[i]) {
- ++i;
- }
- list[i]=FALSE;
- --length;
- start=i;
- return result+=i;
- }
-
-private:
- UBool *list;
- int32_t capacity;
- int32_t length;
- int32_t start;
-
- UBool staticList[16];
-};
-
-// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
-static int32_t
-getUTF8Length(const UChar *s, int32_t length) {
- UErrorCode errorCode=U_ZERO_ERROR;
- int32_t length8=0;
- u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
- if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
- return length8;
- } else {
- // The string contains an unpaired surrogate.
- // Ignore this string.
- return 0;
- }
-}
-
-// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
-static int32_t
-appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
- UErrorCode errorCode=U_ZERO_ERROR;
- int32_t length8=0;
- u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
- if(U_SUCCESS(errorCode)) {
- return length8;
- } else {
- // The string contains an unpaired surrogate.
- // Ignore this string.
- return 0;
- }
-}
-
-static inline uint8_t
-makeSpanLengthByte(int32_t spanLength) {
- // 0xfe==UnicodeSetStringSpan::LONG_SPAN
- return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
-}
-
-// Construct for all variants of span(), or only for any one variant.
-// Initialize as little as possible, for single use.
-UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
- const UVector &setStrings,
- uint32_t which)
- : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
- utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
- utf8Length(0),
- maxLength16(0), maxLength8(0),
- all((UBool)(which==ALL)) {
- spanSet.retainAll(set);
- if(which&NOT_CONTAINED) {
- // Default to the same sets.
- // addToSpanNotSet() will create a separate set if necessary.
- pSpanNotSet=&spanSet;
- }
-
- // Determine if the strings even need to be taken into account at all for span() etc.
- // If any string is relevant, then all strings need to be used for
- // span(longest match) but only the relevant ones for span(while contained).
- // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
- // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
- // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
- // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
- int32_t stringsLength=strings.size();
-
- int32_t i, spanLength;
- UBool someRelevant=FALSE;
- for(i=0; i<stringsLength; ++i) {
- const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
- const UChar *s16=string.getBuffer();
- int32_t length16=string.length();
- UBool thisRelevant;
- spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
- if(spanLength<length16) { // Relevant string.
- someRelevant=thisRelevant=TRUE;
- } else {
- thisRelevant=FALSE;
- }
- if((which&UTF16) && length16>maxLength16) {
- maxLength16=length16;
- }
- if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
- int32_t length8=getUTF8Length(s16, length16);
- utf8Length+=length8;
- if(length8>maxLength8) {
- maxLength8=length8;
- }
- }
- }
- if(!someRelevant) {
- maxLength16=maxLength8=0;
- return;
- }
-
- // Freeze after checking for the need to use strings at all because freezing
- // a set takes some time and memory which are wasted if there are no relevant strings.
- if(all) {
- spanSet.freeze();
- }
-
- uint8_t *spanBackLengths;
- uint8_t *spanUTF8Lengths;
- uint8_t *spanBackUTF8Lengths;
-
- // Allocate a block of meta data.
- int32_t allocSize;
- if(all) {
- // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
- allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
- } else {
- allocSize=stringsLength; // One set of span lengths.
- if(which&UTF8) {
- // UTF-8 lengths and UTF-8 strings.
- allocSize+=stringsLength*4+utf8Length;
- }
- }
- if(allocSize<=(int32_t)sizeof(staticLengths)) {
- utf8Lengths=staticLengths;
- } else {
- utf8Lengths=(int32_t *)uprv_malloc(allocSize);
- if(utf8Lengths==NULL) {
- maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
- return; // Out of memory.
- }
- }
-
- if(all) {
- // Store span lengths for all span() variants.
- spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
- spanBackLengths=spanLengths+stringsLength;
- spanUTF8Lengths=spanBackLengths+stringsLength;
- spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
- utf8=spanBackUTF8Lengths+stringsLength;
- } else {
- // Store span lengths for only one span() variant.
- if(which&UTF8) {
- spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
- utf8=spanLengths+stringsLength;
- } else {
- spanLengths=(uint8_t *)utf8Lengths;
- }
- spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
- }
-
- // Set the meta data and pSpanNotSet and write the UTF-8 strings.
- int32_t utf8Count=0; // Count UTF-8 bytes written so far.
-
- for(i=0; i<stringsLength; ++i) {
- const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
- const UChar *s16=string.getBuffer();
- int32_t length16=string.length();
- spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
- if(spanLength<length16) { // Relevant string.
- if(which&UTF16) {
- if(which&CONTAINED) {
- if(which&FWD) {
- spanLengths[i]=makeSpanLengthByte(spanLength);
- }
- if(which&BACK) {
- spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
- spanBackLengths[i]=makeSpanLengthByte(spanLength);
- }
- } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
- spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
- }
- }
- if(which&UTF8) {
- uint8_t *s8=utf8+utf8Count;
- int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
- utf8Count+=utf8Lengths[i]=length8;
- if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
- spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
- } else { // Relevant for UTF-8.
- if(which&CONTAINED) {
- if(which&FWD) {
- spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
- spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
- }
- if(which&BACK) {
- spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
- spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
- }
- } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
- spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
- }
- }
- }
- if(which&NOT_CONTAINED) {
- // Add string start and end code points to the spanNotSet so that
- // a span(while not contained) stops before any string.
- UChar32 c;
- if(which&FWD) {
- int32_t len=0;
- U16_NEXT(s16, len, length16, c);
- addToSpanNotSet(c);
- }
- if(which&BACK) {
- int32_t len=length16;
- U16_PREV(s16, 0, len, c);
- addToSpanNotSet(c);
- }
- }
- } else { // Irrelevant string.
- if(which&UTF8) {
- if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
- uint8_t *s8=utf8+utf8Count;
- int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
- utf8Count+=utf8Lengths[i]=length8;
- } else {
- utf8Lengths[i]=0;
- }
- }
- if(all) {
- spanLengths[i]=spanBackLengths[i]=
- spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
- (uint8_t)ALL_CP_CONTAINED;
- } else {
- // All spanXYZLengths pointers contain the same address.
- spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
- }
- }
- }
-
- // Finish.
- if(all) {
- pSpanNotSet->freeze();
- }
-}
-
-// Copy constructor. Assumes which==ALL for a frozen set.
-UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
- const UVector &newParentSetStrings)
- : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
- utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
- utf8Length(otherStringSpan.utf8Length),
- maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
- all(TRUE) {
- if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
- pSpanNotSet=&spanSet;
- } else {
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2007mar01
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/uniset.h"
+#include "unicode/ustring.h"
+#include "unicode/utf8.h"
+#include "unicode/utf16.h"
+#include "cmemory.h"
+#include "uvector.h"
+#include "unisetspan.h"
+
+U_NAMESPACE_BEGIN
+
+/*
+ * List of offsets from the current position from where to try matching
+ * a code point or a string.
+ * Store offsets rather than indexes to simplify the code and use the same list
+ * for both increments (in span()) and decrements (in spanBack()).
+ *
+ * Assumption: The maximum offset is limited, and the offsets that are stored
+ * at any one time are relatively dense, that is, there are normally no gaps of
+ * hundreds or thousands of offset values.
+ *
+ * The implementation uses a circular buffer of byte flags,
+ * each indicating whether the corresponding offset is in the list.
+ * This avoids inserting into a sorted list of offsets (or absolute indexes) and
+ * physically moving part of the list.
+ *
+ * Note: In principle, the caller should setMaxLength() to the maximum of the
+ * max string length and U16_LENGTH/U8_LENGTH to account for
+ * "long" single code points.
+ * However, this implementation uses at least a staticList with more than
+ * U8_LENGTH entries anyway.
+ *
+ * Note: If maxLength were guaranteed to be no more than 32 or 64,
+ * the list could be stored as bit flags in a single integer.
+ * Rather than handling a circular buffer with a start list index,
+ * the integer would simply be shifted when lower offsets are removed.
+ * UnicodeSet does not have a limit on the lengths of strings.
+ */
+class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
+public:
+ OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
+
+ ~OffsetList() {
+ if(list!=staticList) {
+ uprv_free(list);
+ }
+ }
+
+ // Call exactly once if the list is to be used.
+ void setMaxLength(int32_t maxLength) {
+ if(maxLength<=(int32_t)sizeof(staticList)) {
+ capacity=(int32_t)sizeof(staticList);
+ } else {
+ UBool *l=(UBool *)uprv_malloc(maxLength);
+ if(l!=NULL) {
+ list=l;
+ capacity=maxLength;
+ }
+ }
+ uprv_memset(list, 0, capacity);
+ }
+
+ void clear() {
+ uprv_memset(list, 0, capacity);
+ start=length=0;
+ }
+
+ UBool isEmpty() const {
+ return (UBool)(length==0);
+ }
+
+ // Reduce all stored offsets by delta, used when the current position
+ // moves by delta.
+ // There must not be any offsets lower than delta.
+ // If there is an offset equal to delta, it is removed.
+ // delta=[1..maxLength]
+ void shift(int32_t delta) {
+ int32_t i=start+delta;
+ if(i>=capacity) {
+ i-=capacity;
+ }
+ if(list[i]) {
+ list[i]=FALSE;
+ --length;
+ }
+ start=i;
+ }
+
+ // Add an offset. The list must not contain it yet.
+ // offset=[1..maxLength]
+ void addOffset(int32_t offset) {
+ int32_t i=start+offset;
+ if(i>=capacity) {
+ i-=capacity;
+ }
+ list[i]=TRUE;
+ ++length;
+ }
+
+ // offset=[1..maxLength]
+ UBool containsOffset(int32_t offset) const {
+ int32_t i=start+offset;
+ if(i>=capacity) {
+ i-=capacity;
+ }
+ return list[i];
+ }
+
+ // Find the lowest stored offset from a non-empty list, remove it,
+ // and reduce all other offsets by this minimum.
+ // Returns [1..maxLength].
+ int32_t popMinimum() {
+ // Look for the next offset in list[start+1..capacity-1].
+ int32_t i=start, result;
+ while(++i<capacity) {
+ if(list[i]) {
+ list[i]=FALSE;
+ --length;
+ result=i-start;
+ start=i;
+ return result;
+ }
+ }
+ // i==capacity
+
+ // Wrap around and look for the next offset in list[0..start].
+ // Since the list is not empty, there will be one.
+ result=capacity-start;
+ i=0;
+ while(!list[i]) {
+ ++i;
+ }
+ list[i]=FALSE;
+ --length;
+ start=i;
+ return result+=i;
+ }
+
+private:
+ UBool *list;
+ int32_t capacity;
+ int32_t length;
+ int32_t start;
+
+ UBool staticList[16];
+};
+
+// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
+static int32_t
+getUTF8Length(const UChar *s, int32_t length) {
+ UErrorCode errorCode=U_ZERO_ERROR;
+ int32_t length8=0;
+ u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
+ if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
+ return length8;
+ } else {
+ // The string contains an unpaired surrogate.
+ // Ignore this string.
+ return 0;
+ }
+}
+
+// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
+static int32_t
+appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
+ UErrorCode errorCode=U_ZERO_ERROR;
+ int32_t length8=0;
+ u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
+ if(U_SUCCESS(errorCode)) {
+ return length8;
+ } else {
+ // The string contains an unpaired surrogate.
+ // Ignore this string.
+ return 0;
+ }
+}
+
+static inline uint8_t
+makeSpanLengthByte(int32_t spanLength) {
+ // 0xfe==UnicodeSetStringSpan::LONG_SPAN
+ return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
+}
+
+// Construct for all variants of span(), or only for any one variant.
+// Initialize as little as possible, for single use.
+UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
+ const UVector &setStrings,
+ uint32_t which)
+ : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
+ utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
+ utf8Length(0),
+ maxLength16(0), maxLength8(0),
+ all((UBool)(which==ALL)) {
+ spanSet.retainAll(set);
+ if(which&NOT_CONTAINED) {
+ // Default to the same sets.
+ // addToSpanNotSet() will create a separate set if necessary.
+ pSpanNotSet=&spanSet;
+ }
+
+ // Determine if the strings even need to be taken into account at all for span() etc.
+ // If any string is relevant, then all strings need to be used for
+ // span(longest match) but only the relevant ones for span(while contained).
+ // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
+ // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
+ // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
+ // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
+ int32_t stringsLength=strings.size();
+
+ int32_t i, spanLength;
+ UBool someRelevant=FALSE;
+ for(i=0; i<stringsLength; ++i) {
+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+ const UChar *s16=string.getBuffer();
+ int32_t length16=string.length();
+ UBool thisRelevant;
+ spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
+ if(spanLength<length16) { // Relevant string.
+ someRelevant=thisRelevant=TRUE;
+ } else {
+ thisRelevant=FALSE;
+ }
+ if((which&UTF16) && length16>maxLength16) {
+ maxLength16=length16;
+ }
+ if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
+ int32_t length8=getUTF8Length(s16, length16);
+ utf8Length+=length8;
+ if(length8>maxLength8) {
+ maxLength8=length8;
+ }
+ }
+ }
+ if(!someRelevant) {
+ maxLength16=maxLength8=0;
+ return;
+ }
+
+ // Freeze after checking for the need to use strings at all because freezing
+ // a set takes some time and memory which are wasted if there are no relevant strings.
+ if(all) {
+ spanSet.freeze();
+ }
+
+ uint8_t *spanBackLengths;
+ uint8_t *spanUTF8Lengths;
+ uint8_t *spanBackUTF8Lengths;
+
+ // Allocate a block of meta data.
+ int32_t allocSize;
+ if(all) {
+ // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
+ allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
+ } else {
+ allocSize=stringsLength; // One set of span lengths.
+ if(which&UTF8) {
+ // UTF-8 lengths and UTF-8 strings.
+ allocSize+=stringsLength*4+utf8Length;
+ }
+ }
+ if(allocSize<=(int32_t)sizeof(staticLengths)) {
+ utf8Lengths=staticLengths;
+ } else {
+ utf8Lengths=(int32_t *)uprv_malloc(allocSize);
+ if(utf8Lengths==NULL) {
+ maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
+ return; // Out of memory.
+ }
+ }
+
+ if(all) {
+ // Store span lengths for all span() variants.
+ spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
+ spanBackLengths=spanLengths+stringsLength;
+ spanUTF8Lengths=spanBackLengths+stringsLength;
+ spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
+ utf8=spanBackUTF8Lengths+stringsLength;
+ } else {
+ // Store span lengths for only one span() variant.
+ if(which&UTF8) {
+ spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
+ utf8=spanLengths+stringsLength;
+ } else {
+ spanLengths=(uint8_t *)utf8Lengths;
+ }
+ spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
+ }
+
+ // Set the meta data and pSpanNotSet and write the UTF-8 strings.
+ int32_t utf8Count=0; // Count UTF-8 bytes written so far.
+
+ for(i=0; i<stringsLength; ++i) {
+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+ const UChar *s16=string.getBuffer();
+ int32_t length16=string.length();
+ spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
+ if(spanLength<length16) { // Relevant string.
+ if(which&UTF16) {
+ if(which&CONTAINED) {
+ if(which&FWD) {
+ spanLengths[i]=makeSpanLengthByte(spanLength);
+ }
+ if(which&BACK) {
+ spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
+ spanBackLengths[i]=makeSpanLengthByte(spanLength);
+ }
+ } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
+ spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
+ }
+ }
+ if(which&UTF8) {
+ uint8_t *s8=utf8+utf8Count;
+ int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
+ utf8Count+=utf8Lengths[i]=length8;
+ if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
+ spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
+ } else { // Relevant for UTF-8.
+ if(which&CONTAINED) {
+ if(which&FWD) {
+ spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
+ spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
+ }
+ if(which&BACK) {
+ spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
+ spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
+ }
+ } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
+ spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
+ }
+ }
+ }
+ if(which&NOT_CONTAINED) {
+ // Add string start and end code points to the spanNotSet so that
+ // a span(while not contained) stops before any string.
+ UChar32 c;
+ if(which&FWD) {
+ int32_t len=0;
+ U16_NEXT(s16, len, length16, c);
+ addToSpanNotSet(c);
+ }
+ if(which&BACK) {
+ int32_t len=length16;
+ U16_PREV(s16, 0, len, c);
+ addToSpanNotSet(c);
+ }
+ }
+ } else { // Irrelevant string.
+ if(which&UTF8) {
+ if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
+ uint8_t *s8=utf8+utf8Count;
+ int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
+ utf8Count+=utf8Lengths[i]=length8;
+ } else {
+ utf8Lengths[i]=0;
+ }
+ }
+ if(all) {
+ spanLengths[i]=spanBackLengths[i]=
+ spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
+ (uint8_t)ALL_CP_CONTAINED;
+ } else {
+ // All spanXYZLengths pointers contain the same address.
+ spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
+ }
+ }
+ }
+
+ // Finish.
+ if(all) {
+ pSpanNotSet->freeze();
+ }
+}
+
+// Copy constructor. Assumes which==ALL for a frozen set.
+UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
+ const UVector &newParentSetStrings)
+ : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
+ utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
+ utf8Length(otherStringSpan.utf8Length),
+ maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
+ all(TRUE) {
+ if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
+ pSpanNotSet=&spanSet;
+ } else {
pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
- }
-
- // Allocate a block of meta data.
- // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
- int32_t stringsLength=strings.size();
- int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
- if(allocSize<=(int32_t)sizeof(staticLengths)) {
- utf8Lengths=staticLengths;
- } else {
- utf8Lengths=(int32_t *)uprv_malloc(allocSize);
- if(utf8Lengths==NULL) {
- maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
- return; // Out of memory.
- }
- }
-
- spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
- utf8=spanLengths+stringsLength*4;
- uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
-}
-
-UnicodeSetStringSpan::~UnicodeSetStringSpan() {
- if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
- delete pSpanNotSet;
- }
- if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
- uprv_free(utf8Lengths);
- }
-}
-
-void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
- if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
- if(spanSet.contains(c)) {
- return; // Nothing to do.
- }
+ }
+
+ // Allocate a block of meta data.
+ // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
+ int32_t stringsLength=strings.size();
+ int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
+ if(allocSize<=(int32_t)sizeof(staticLengths)) {
+ utf8Lengths=staticLengths;
+ } else {
+ utf8Lengths=(int32_t *)uprv_malloc(allocSize);
+ if(utf8Lengths==NULL) {
+ maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
+ return; // Out of memory.
+ }
+ }
+
+ spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
+ utf8=spanLengths+stringsLength*4;
+ uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
+}
+
+UnicodeSetStringSpan::~UnicodeSetStringSpan() {
+ if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
+ delete pSpanNotSet;
+ }
+ if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
+ uprv_free(utf8Lengths);
+ }
+}
+
+void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
+ if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
+ if(spanSet.contains(c)) {
+ return; // Nothing to do.
+ }
UnicodeSet *newSet=spanSet.cloneAsThawed();
- if(newSet==NULL) {
- return; // Out of memory.
- } else {
- pSpanNotSet=newSet;
- }
- }
- pSpanNotSet->add(c);
-}
-
-// Compare strings without any argument checks. Requires length>0.
-static inline UBool
-matches16(const UChar *s, const UChar *t, int32_t length) {
- do {
- if(*s++!=*t++) {
- return FALSE;
- }
- } while(--length>0);
- return TRUE;
-}
-
-static inline UBool
-matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
- do {
- if(*s++!=*t++) {
- return FALSE;
- }
- } while(--length>0);
- return TRUE;
-}
-
-// Compare 16-bit Unicode strings (which may be malformed UTF-16)
-// at code point boundaries.
-// That is, each edge of a match must not be in the middle of a surrogate pair.
-static inline UBool
-matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
- s+=start;
- limit-=start;
- return matches16(s, t, length) &&
- !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
- !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
-}
-
-// Does the set contain the next code point?
-// If so, return its length; otherwise return its negative length.
-static inline int32_t
-spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
- UChar c=*s, c2;
- if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
- return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
- }
- return set.contains(c) ? 1 : -1;
-}
-
-static inline int32_t
-spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
- UChar c=s[length-1], c2;
- if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
- return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
- }
- return set.contains(c) ? 1 : -1;
-}
-
-static inline int32_t
-spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
- UChar32 c=*s;
+ if(newSet==NULL) {
+ return; // Out of memory.
+ } else {
+ pSpanNotSet=newSet;
+ }
+ }
+ pSpanNotSet->add(c);
+}
+
+// Compare strings without any argument checks. Requires length>0.
+static inline UBool
+matches16(const UChar *s, const UChar *t, int32_t length) {
+ do {
+ if(*s++!=*t++) {
+ return FALSE;
+ }
+ } while(--length>0);
+ return TRUE;
+}
+
+static inline UBool
+matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
+ do {
+ if(*s++!=*t++) {
+ return FALSE;
+ }
+ } while(--length>0);
+ return TRUE;
+}
+
+// Compare 16-bit Unicode strings (which may be malformed UTF-16)
+// at code point boundaries.
+// That is, each edge of a match must not be in the middle of a surrogate pair.
+static inline UBool
+matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
+ s+=start;
+ limit-=start;
+ return matches16(s, t, length) &&
+ !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
+ !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
+}
+
+// Does the set contain the next code point?
+// If so, return its length; otherwise return its negative length.
+static inline int32_t
+spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
+ UChar c=*s, c2;
+ if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
+ return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
+ }
+ return set.contains(c) ? 1 : -1;
+}
+
+static inline int32_t
+spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
+ UChar c=s[length-1], c2;
+ if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
+ return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
+ }
+ return set.contains(c) ? 1 : -1;
+}
+
+static inline int32_t
+spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
+ UChar32 c=*s;
if(U8_IS_SINGLE(c)) {
- return set.contains(c) ? 1 : -1;
- }
- // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
- int32_t i=0;
- U8_NEXT_OR_FFFD(s, i, length, c);
- return set.contains(c) ? i : -i;
-}
-
-static inline int32_t
-spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
- UChar32 c=s[length-1];
+ return set.contains(c) ? 1 : -1;
+ }
+ // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
+ int32_t i=0;
+ U8_NEXT_OR_FFFD(s, i, length, c);
+ return set.contains(c) ? i : -i;
+}
+
+static inline int32_t
+spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
+ UChar32 c=s[length-1];
if(U8_IS_SINGLE(c)) {
- return set.contains(c) ? 1 : -1;
- }
- int32_t i=length-1;
- c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
- length-=i;
- return set.contains(c) ? length : -length;
-}
-
-/*
- * Note: In span() when spanLength==0 (after a string match, or at the beginning
- * after an empty code point span) and in spanNot() and spanNotUTF8(),
- * string matching could use a binary search
- * because all string matches are done from the same start index.
- *
- * For UTF-8, this would require a comparison function that returns UTF-16 order.
- *
- * This optimization should not be necessary for normal UnicodeSets because
- * most sets have no strings, and most sets with strings have
- * very few very short strings.
- * For cases with many strings, it might be better to use a different API
- * and implementation with a DFA (state machine).
- */
-
-/*
- * Algorithm for span(USET_SPAN_CONTAINED)
- *
- * Theoretical algorithm:
- * - Iterate through the string, and at each code point boundary:
- * + If the code point there is in the set, then remember to continue after it.
- * + If a set string matches at the current position, then remember to continue after it.
- * + Either recursively span for each code point or string match,
- * or recursively span for all but the shortest one and
- * iteratively continue the span with the shortest local match.
- * + Remember the longest recursive span (the farthest end point).
- * + If there is no match at the current position, neither for the code point there
- * nor for any set string, then stop and return the longest recursive span length.
- *
- * Optimized implementation:
- *
- * (We assume that most sets will have very few very short strings.
- * A span using a string-less set is extremely fast.)
- *
- * Create and cache a spanSet which contains all of the single code points
- * of the original set but none of its strings.
- *
- * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
- * - Loop:
- * + Try to match each set string at the end of the spanLength.
- * ~ Set strings that start with set-contained code points must be matched
- * with a partial overlap because the recursive algorithm would have tried
- * to match them at every position.
- * ~ Set strings that entirely consist of set-contained code points
- * are irrelevant for span(USET_SPAN_CONTAINED) because the
- * recursive algorithm would continue after them anyway
- * and find the longest recursive match from their end.
- * ~ Rather than recursing, note each end point of a set string match.
- * + If no set string matched after spanSet.span(), then return
- * with where the spanSet.span() ended.
- * + If at least one set string matched after spanSet.span(), then
- * pop the shortest string match end point and continue
- * the loop, trying to match all set strings from there.
- * + If at least one more set string matched after a previous string match,
- * then test if the code point after the previous string match is also
- * contained in the set.
- * Continue the loop with the shortest end point of either this code point
- * or a matching set string.
- * + If no more set string matched after a previous string match,
- * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
- * Stop if spanLength==0, otherwise continue the loop.
- *
- * By noting each end point of a set string match,
- * the function visits each string position at most once and finishes
- * in linear time.
- *
- * The recursive algorithm may visit the same string position many times
- * if multiple paths lead to it and finishes in exponential time.
- */
-
-/*
- * Algorithm for span(USET_SPAN_SIMPLE)
- *
- * Theoretical algorithm:
- * - Iterate through the string, and at each code point boundary:
- * + If the code point there is in the set, then remember to continue after it.
- * + If a set string matches at the current position, then remember to continue after it.
- * + Continue from the farthest match position and ignore all others.
- * + If there is no match at the current position,
- * then stop and return the current position.
- *
- * Optimized implementation:
- *
- * (Same assumption and spanSet as above.)
- *
- * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
- * - Loop:
- * + Try to match each set string at the end of the spanLength.
- * ~ Set strings that start with set-contained code points must be matched
- * with a partial overlap because the standard algorithm would have tried
- * to match them earlier.
- * ~ Set strings that entirely consist of set-contained code points
- * must be matched with a full overlap because the longest-match algorithm
- * would hide set string matches that end earlier.
- * Such set strings need not be matched earlier inside the code point span
- * because the standard algorithm would then have continued after
- * the set string match anyway.
- * ~ Remember the longest set string match (farthest end point) from the earliest
- * starting point.
- * + If no set string matched after spanSet.span(), then return
- * with where the spanSet.span() ended.
- * + If at least one set string matched, then continue the loop after the
- * longest match from the earliest position.
- * + If no more set string matched after a previous string match,
- * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
- * Stop if spanLength==0, otherwise continue the loop.
- */
-
-int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
- if(spanCondition==USET_SPAN_NOT_CONTAINED) {
- return spanNot(s, length);
- }
- int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
- if(spanLength==length) {
- return length;
- }
-
- // Consider strings; they may overlap with the span.
- OffsetList offsets;
- if(spanCondition==USET_SPAN_CONTAINED) {
- // Use offset list to try all possibilities.
- offsets.setMaxLength(maxLength16);
- }
- int32_t pos=spanLength, rest=length-pos;
- int32_t i, stringsLength=strings.size();
- for(;;) {
- if(spanCondition==USET_SPAN_CONTAINED) {
- for(i=0; i<stringsLength; ++i) {
- int32_t overlap=spanLengths[i];
- if(overlap==ALL_CP_CONTAINED) {
- continue; // Irrelevant string.
- }
- const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
- const UChar *s16=string.getBuffer();
- int32_t length16=string.length();
-
- // Try to match this string at pos-overlap..pos.
- if(overlap>=LONG_SPAN) {
- overlap=length16;
- // While contained: No point matching fully inside the code point span.
- U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
- }
- if(overlap>spanLength) {
- overlap=spanLength;
- }
- int32_t inc=length16-overlap; // Keep overlap+inc==length16.
- for(;;) {
- if(inc>rest) {
- break;
- }
- // Try to match if the increment is not listed already.
- if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
- if(inc==rest) {
- return length; // Reached the end of the string.
- }
- offsets.addOffset(inc);
- }
- if(overlap==0) {
- break;
- }
- --overlap;
- ++inc;
- }
- }
- } else /* USET_SPAN_SIMPLE */ {
- int32_t maxInc=0, maxOverlap=0;
- for(i=0; i<stringsLength; ++i) {
- int32_t overlap=spanLengths[i];
- // For longest match, we do need to try to match even an all-contained string
- // to find the match from the earliest start.
-
- const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
- const UChar *s16=string.getBuffer();
- int32_t length16=string.length();
-
- // Try to match this string at pos-overlap..pos.
- if(overlap>=LONG_SPAN) {
- overlap=length16;
- // Longest match: Need to match fully inside the code point span
- // to find the match from the earliest start.
- }
- if(overlap>spanLength) {
- overlap=spanLength;
- }
- int32_t inc=length16-overlap; // Keep overlap+inc==length16.
- for(;;) {
- if(inc>rest || overlap<maxOverlap) {
- break;
- }
- // Try to match if the string is longer or starts earlier.
- if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
- matches16CPB(s, pos-overlap, length, s16, length16)
- ) {
- maxInc=inc; // Longest match from earliest start.
- maxOverlap=overlap;
- break;
- }
- --overlap;
- ++inc;
- }
- }
-
- if(maxInc!=0 || maxOverlap!=0) {
- // Longest-match algorithm, and there was a string match.
- // Simply continue after it.
- pos+=maxInc;
- rest-=maxInc;
- if(rest==0) {
- return length; // Reached the end of the string.
- }
- spanLength=0; // Match strings from after a string match.
- continue;
- }
- }
- // Finished trying to match all strings at pos.
-
- if(spanLength!=0 || pos==0) {
- // The position is after an unlimited code point span (spanLength!=0),
- // not after a string match.
- // The only position where spanLength==0 after a span is pos==0.
- // Otherwise, an unlimited code point span is only tried again when no
- // strings match, and if such a non-initial span fails we stop.
- if(offsets.isEmpty()) {
- return pos; // No strings matched after a span.
- }
- // Match strings from after the next string match.
- } else {
- // The position is after a string match (or a single code point).
- if(offsets.isEmpty()) {
- // No more strings matched after a previous string match.
- // Try another code point span from after the last string match.
- spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
- if( spanLength==rest || // Reached the end of the string, or
- spanLength==0 // neither strings nor span progressed.
- ) {
- return pos+spanLength;
- }
- pos+=spanLength;
- rest-=spanLength;
- continue; // spanLength>0: Match strings from after a span.
- } else {
- // Try to match only one code point from after a string match if some
- // string matched beyond it, so that we try all possible positions
- // and don't overshoot.
- spanLength=spanOne(spanSet, s+pos, rest);
- if(spanLength>0) {
- if(spanLength==rest) {
- return length; // Reached the end of the string.
- }
- // Match strings after this code point.
- // There cannot be any increments below it because UnicodeSet strings
- // contain multiple code points.
- pos+=spanLength;
- rest-=spanLength;
- offsets.shift(spanLength);
- spanLength=0;
- continue; // Match strings from after a single code point.
- }
- // Match strings from after the next string match.
- }
- }
- int32_t minOffset=offsets.popMinimum();
- pos+=minOffset;
- rest-=minOffset;
- spanLength=0; // Match strings from after a string match.
- }
-}
-
-int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
- if(spanCondition==USET_SPAN_NOT_CONTAINED) {
- return spanNotBack(s, length);
- }
- int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
- if(pos==0) {
- return 0;
- }
- int32_t spanLength=length-pos;
-
- // Consider strings; they may overlap with the span.
- OffsetList offsets;
- if(spanCondition==USET_SPAN_CONTAINED) {
- // Use offset list to try all possibilities.
- offsets.setMaxLength(maxLength16);
- }
- int32_t i, stringsLength=strings.size();
- uint8_t *spanBackLengths=spanLengths;
- if(all) {
- spanBackLengths+=stringsLength;
- }
- for(;;) {
- if(spanCondition==USET_SPAN_CONTAINED) {
- for(i=0; i<stringsLength; ++i) {
- int32_t overlap=spanBackLengths[i];
- if(overlap==ALL_CP_CONTAINED) {
- continue; // Irrelevant string.
- }
- const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
- const UChar *s16=string.getBuffer();
- int32_t length16=string.length();
-
- // Try to match this string at pos-(length16-overlap)..pos-length16.
- if(overlap>=LONG_SPAN) {
- overlap=length16;
- // While contained: No point matching fully inside the code point span.
- int32_t len1=0;
- U16_FWD_1(s16, len1, overlap);
- overlap-=len1; // Length of the string minus the first code point.
- }
- if(overlap>spanLength) {
- overlap=spanLength;
- }
- int32_t dec=length16-overlap; // Keep dec+overlap==length16.
- for(;;) {
- if(dec>pos) {
- break;
- }
- // Try to match if the decrement is not listed already.
- if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
- if(dec==pos) {
- return 0; // Reached the start of the string.
- }
- offsets.addOffset(dec);
- }
- if(overlap==0) {
- break;
- }
- --overlap;
- ++dec;
- }
- }
- } else /* USET_SPAN_SIMPLE */ {
- int32_t maxDec=0, maxOverlap=0;
- for(i=0; i<stringsLength; ++i) {
- int32_t overlap=spanBackLengths[i];
- // For longest match, we do need to try to match even an all-contained string
- // to find the match from the latest end.
-
- const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
- const UChar *s16=string.getBuffer();
- int32_t length16=string.length();
-
- // Try to match this string at pos-(length16-overlap)..pos-length16.
- if(overlap>=LONG_SPAN) {
- overlap=length16;
- // Longest match: Need to match fully inside the code point span
- // to find the match from the latest end.
- }
- if(overlap>spanLength) {
- overlap=spanLength;
- }
- int32_t dec=length16-overlap; // Keep dec+overlap==length16.
- for(;;) {
- if(dec>pos || overlap<maxOverlap) {
- break;
- }
- // Try to match if the string is longer or ends later.
- if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
- matches16CPB(s, pos-dec, length, s16, length16)
- ) {
- maxDec=dec; // Longest match from latest end.
- maxOverlap=overlap;
- break;
- }
- --overlap;
- ++dec;
- }
- }
-
- if(maxDec!=0 || maxOverlap!=0) {
- // Longest-match algorithm, and there was a string match.
- // Simply continue before it.
- pos-=maxDec;
- if(pos==0) {
- return 0; // Reached the start of the string.
- }
- spanLength=0; // Match strings from before a string match.
- continue;
- }
- }
- // Finished trying to match all strings at pos.
-
- if(spanLength!=0 || pos==length) {
- // The position is before an unlimited code point span (spanLength!=0),
- // not before a string match.
- // The only position where spanLength==0 before a span is pos==length.
- // Otherwise, an unlimited code point span is only tried again when no
- // strings match, and if such a non-initial span fails we stop.
- if(offsets.isEmpty()) {
- return pos; // No strings matched before a span.
- }
- // Match strings from before the next string match.
- } else {
- // The position is before a string match (or a single code point).
- if(offsets.isEmpty()) {
- // No more strings matched before a previous string match.
- // Try another code point span from before the last string match.
- int32_t oldPos=pos;
- pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
- spanLength=oldPos-pos;
- if( pos==0 || // Reached the start of the string, or
- spanLength==0 // neither strings nor span progressed.
- ) {
- return pos;
- }
- continue; // spanLength>0: Match strings from before a span.
- } else {
- // Try to match only one code point from before a string match if some
- // string matched beyond it, so that we try all possible positions
- // and don't overshoot.
- spanLength=spanOneBack(spanSet, s, pos);
- if(spanLength>0) {
- if(spanLength==pos) {
- return 0; // Reached the start of the string.
- }
- // Match strings before this code point.
- // There cannot be any decrements below it because UnicodeSet strings
- // contain multiple code points.
- pos-=spanLength;
- offsets.shift(spanLength);
- spanLength=0;
- continue; // Match strings from before a single code point.
- }
- // Match strings from before the next string match.
- }
- }
- pos-=offsets.popMinimum();
- spanLength=0; // Match strings from before a string match.
- }
-}
-
-int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
- if(spanCondition==USET_SPAN_NOT_CONTAINED) {
- return spanNotUTF8(s, length);
- }
- int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
- if(spanLength==length) {
- return length;
- }
-
- // Consider strings; they may overlap with the span.
- OffsetList offsets;
- if(spanCondition==USET_SPAN_CONTAINED) {
- // Use offset list to try all possibilities.
- offsets.setMaxLength(maxLength8);
- }
- int32_t pos=spanLength, rest=length-pos;
- int32_t i, stringsLength=strings.size();
- uint8_t *spanUTF8Lengths=spanLengths;
- if(all) {
- spanUTF8Lengths+=2*stringsLength;
- }
- for(;;) {
- const uint8_t *s8=utf8;
- int32_t length8;
- if(spanCondition==USET_SPAN_CONTAINED) {
- for(i=0; i<stringsLength; ++i) {
- length8=utf8Lengths[i];
- if(length8==0) {
- continue; // String not representable in UTF-8.
- }
- int32_t overlap=spanUTF8Lengths[i];
- if(overlap==ALL_CP_CONTAINED) {
- s8+=length8;
- continue; // Irrelevant string.
- }
-
- // Try to match this string at pos-overlap..pos.
- if(overlap>=LONG_SPAN) {
- overlap=length8;
- // While contained: No point matching fully inside the code point span.
- U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
- }
- if(overlap>spanLength) {
- overlap=spanLength;
- }
- int32_t inc=length8-overlap; // Keep overlap+inc==length8.
- for(;;) {
- if(inc>rest) {
- break;
- }
- // Try to match if the increment is not listed already.
- // Match at code point boundaries. (The UTF-8 strings were converted
- // from UTF-16 and are guaranteed to be well-formed.)
+ return set.contains(c) ? 1 : -1;
+ }
+ int32_t i=length-1;
+ c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
+ length-=i;
+ return set.contains(c) ? length : -length;
+}
+
+/*
+ * Note: In span() when spanLength==0 (after a string match, or at the beginning
+ * after an empty code point span) and in spanNot() and spanNotUTF8(),
+ * string matching could use a binary search
+ * because all string matches are done from the same start index.
+ *
+ * For UTF-8, this would require a comparison function that returns UTF-16 order.
+ *
+ * This optimization should not be necessary for normal UnicodeSets because
+ * most sets have no strings, and most sets with strings have
+ * very few very short strings.
+ * For cases with many strings, it might be better to use a different API
+ * and implementation with a DFA (state machine).
+ */
+
+/*
+ * Algorithm for span(USET_SPAN_CONTAINED)
+ *
+ * Theoretical algorithm:
+ * - Iterate through the string, and at each code point boundary:
+ * + If the code point there is in the set, then remember to continue after it.
+ * + If a set string matches at the current position, then remember to continue after it.
+ * + Either recursively span for each code point or string match,
+ * or recursively span for all but the shortest one and
+ * iteratively continue the span with the shortest local match.
+ * + Remember the longest recursive span (the farthest end point).
+ * + If there is no match at the current position, neither for the code point there
+ * nor for any set string, then stop and return the longest recursive span length.
+ *
+ * Optimized implementation:
+ *
+ * (We assume that most sets will have very few very short strings.
+ * A span using a string-less set is extremely fast.)
+ *
+ * Create and cache a spanSet which contains all of the single code points
+ * of the original set but none of its strings.
+ *
+ * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
+ * - Loop:
+ * + Try to match each set string at the end of the spanLength.
+ * ~ Set strings that start with set-contained code points must be matched
+ * with a partial overlap because the recursive algorithm would have tried
+ * to match them at every position.
+ * ~ Set strings that entirely consist of set-contained code points
+ * are irrelevant for span(USET_SPAN_CONTAINED) because the
+ * recursive algorithm would continue after them anyway
+ * and find the longest recursive match from their end.
+ * ~ Rather than recursing, note each end point of a set string match.
+ * + If no set string matched after spanSet.span(), then return
+ * with where the spanSet.span() ended.
+ * + If at least one set string matched after spanSet.span(), then
+ * pop the shortest string match end point and continue
+ * the loop, trying to match all set strings from there.
+ * + If at least one more set string matched after a previous string match,
+ * then test if the code point after the previous string match is also
+ * contained in the set.
+ * Continue the loop with the shortest end point of either this code point
+ * or a matching set string.
+ * + If no more set string matched after a previous string match,
+ * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
+ * Stop if spanLength==0, otherwise continue the loop.
+ *
+ * By noting each end point of a set string match,
+ * the function visits each string position at most once and finishes
+ * in linear time.
+ *
+ * The recursive algorithm may visit the same string position many times
+ * if multiple paths lead to it and finishes in exponential time.
+ */
+
+/*
+ * Algorithm for span(USET_SPAN_SIMPLE)
+ *
+ * Theoretical algorithm:
+ * - Iterate through the string, and at each code point boundary:
+ * + If the code point there is in the set, then remember to continue after it.
+ * + If a set string matches at the current position, then remember to continue after it.
+ * + Continue from the farthest match position and ignore all others.
+ * + If there is no match at the current position,
+ * then stop and return the current position.
+ *
+ * Optimized implementation:
+ *
+ * (Same assumption and spanSet as above.)
+ *
+ * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
+ * - Loop:
+ * + Try to match each set string at the end of the spanLength.
+ * ~ Set strings that start with set-contained code points must be matched
+ * with a partial overlap because the standard algorithm would have tried
+ * to match them earlier.
+ * ~ Set strings that entirely consist of set-contained code points
+ * must be matched with a full overlap because the longest-match algorithm
+ * would hide set string matches that end earlier.
+ * Such set strings need not be matched earlier inside the code point span
+ * because the standard algorithm would then have continued after
+ * the set string match anyway.
+ * ~ Remember the longest set string match (farthest end point) from the earliest
+ * starting point.
+ * + If no set string matched after spanSet.span(), then return
+ * with where the spanSet.span() ended.
+ * + If at least one set string matched, then continue the loop after the
+ * longest match from the earliest position.
+ * + If no more set string matched after a previous string match,
+ * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
+ * Stop if spanLength==0, otherwise continue the loop.
+ */
+
+int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
+ if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+ return spanNot(s, length);
+ }
+ int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
+ if(spanLength==length) {
+ return length;
+ }
+
+ // Consider strings; they may overlap with the span.
+ OffsetList offsets;
+ if(spanCondition==USET_SPAN_CONTAINED) {
+ // Use offset list to try all possibilities.
+ offsets.setMaxLength(maxLength16);
+ }
+ int32_t pos=spanLength, rest=length-pos;
+ int32_t i, stringsLength=strings.size();
+ for(;;) {
+ if(spanCondition==USET_SPAN_CONTAINED) {
+ for(i=0; i<stringsLength; ++i) {
+ int32_t overlap=spanLengths[i];
+ if(overlap==ALL_CP_CONTAINED) {
+ continue; // Irrelevant string.
+ }
+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+ const UChar *s16=string.getBuffer();
+ int32_t length16=string.length();
+
+ // Try to match this string at pos-overlap..pos.
+ if(overlap>=LONG_SPAN) {
+ overlap=length16;
+ // While contained: No point matching fully inside the code point span.
+ U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
+ }
+ if(overlap>spanLength) {
+ overlap=spanLength;
+ }
+ int32_t inc=length16-overlap; // Keep overlap+inc==length16.
+ for(;;) {
+ if(inc>rest) {
+ break;
+ }
+ // Try to match if the increment is not listed already.
+ if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
+ if(inc==rest) {
+ return length; // Reached the end of the string.
+ }
+ offsets.addOffset(inc);
+ }
+ if(overlap==0) {
+ break;
+ }
+ --overlap;
+ ++inc;
+ }
+ }
+ } else /* USET_SPAN_SIMPLE */ {
+ int32_t maxInc=0, maxOverlap=0;
+ for(i=0; i<stringsLength; ++i) {
+ int32_t overlap=spanLengths[i];
+ // For longest match, we do need to try to match even an all-contained string
+ // to find the match from the earliest start.
+
+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+ const UChar *s16=string.getBuffer();
+ int32_t length16=string.length();
+
+ // Try to match this string at pos-overlap..pos.
+ if(overlap>=LONG_SPAN) {
+ overlap=length16;
+ // Longest match: Need to match fully inside the code point span
+ // to find the match from the earliest start.
+ }
+ if(overlap>spanLength) {
+ overlap=spanLength;
+ }
+ int32_t inc=length16-overlap; // Keep overlap+inc==length16.
+ for(;;) {
+ if(inc>rest || overlap<maxOverlap) {
+ break;
+ }
+ // Try to match if the string is longer or starts earlier.
+ if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
+ matches16CPB(s, pos-overlap, length, s16, length16)
+ ) {
+ maxInc=inc; // Longest match from earliest start.
+ maxOverlap=overlap;
+ break;
+ }
+ --overlap;
+ ++inc;
+ }
+ }
+
+ if(maxInc!=0 || maxOverlap!=0) {
+ // Longest-match algorithm, and there was a string match.
+ // Simply continue after it.
+ pos+=maxInc;
+ rest-=maxInc;
+ if(rest==0) {
+ return length; // Reached the end of the string.
+ }
+ spanLength=0; // Match strings from after a string match.
+ continue;
+ }
+ }
+ // Finished trying to match all strings at pos.
+
+ if(spanLength!=0 || pos==0) {
+ // The position is after an unlimited code point span (spanLength!=0),
+ // not after a string match.
+ // The only position where spanLength==0 after a span is pos==0.
+ // Otherwise, an unlimited code point span is only tried again when no
+ // strings match, and if such a non-initial span fails we stop.
+ if(offsets.isEmpty()) {
+ return pos; // No strings matched after a span.
+ }
+ // Match strings from after the next string match.
+ } else {
+ // The position is after a string match (or a single code point).
+ if(offsets.isEmpty()) {
+ // No more strings matched after a previous string match.
+ // Try another code point span from after the last string match.
+ spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
+ if( spanLength==rest || // Reached the end of the string, or
+ spanLength==0 // neither strings nor span progressed.
+ ) {
+ return pos+spanLength;
+ }
+ pos+=spanLength;
+ rest-=spanLength;
+ continue; // spanLength>0: Match strings from after a span.
+ } else {
+ // Try to match only one code point from after a string match if some
+ // string matched beyond it, so that we try all possible positions
+ // and don't overshoot.
+ spanLength=spanOne(spanSet, s+pos, rest);
+ if(spanLength>0) {
+ if(spanLength==rest) {
+ return length; // Reached the end of the string.
+ }
+ // Match strings after this code point.
+ // There cannot be any increments below it because UnicodeSet strings
+ // contain multiple code points.
+ pos+=spanLength;
+ rest-=spanLength;
+ offsets.shift(spanLength);
+ spanLength=0;
+ continue; // Match strings from after a single code point.
+ }
+ // Match strings from after the next string match.
+ }
+ }
+ int32_t minOffset=offsets.popMinimum();
+ pos+=minOffset;
+ rest-=minOffset;
+ spanLength=0; // Match strings from after a string match.
+ }
+}
+
+int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
+ if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+ return spanNotBack(s, length);
+ }
+ int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
+ if(pos==0) {
+ return 0;
+ }
+ int32_t spanLength=length-pos;
+
+ // Consider strings; they may overlap with the span.
+ OffsetList offsets;
+ if(spanCondition==USET_SPAN_CONTAINED) {
+ // Use offset list to try all possibilities.
+ offsets.setMaxLength(maxLength16);
+ }
+ int32_t i, stringsLength=strings.size();
+ uint8_t *spanBackLengths=spanLengths;
+ if(all) {
+ spanBackLengths+=stringsLength;
+ }
+ for(;;) {
+ if(spanCondition==USET_SPAN_CONTAINED) {
+ for(i=0; i<stringsLength; ++i) {
+ int32_t overlap=spanBackLengths[i];
+ if(overlap==ALL_CP_CONTAINED) {
+ continue; // Irrelevant string.
+ }
+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+ const UChar *s16=string.getBuffer();
+ int32_t length16=string.length();
+
+ // Try to match this string at pos-(length16-overlap)..pos-length16.
+ if(overlap>=LONG_SPAN) {
+ overlap=length16;
+ // While contained: No point matching fully inside the code point span.
+ int32_t len1=0;
+ U16_FWD_1(s16, len1, overlap);
+ overlap-=len1; // Length of the string minus the first code point.
+ }
+ if(overlap>spanLength) {
+ overlap=spanLength;
+ }
+ int32_t dec=length16-overlap; // Keep dec+overlap==length16.
+ for(;;) {
+ if(dec>pos) {
+ break;
+ }
+ // Try to match if the decrement is not listed already.
+ if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
+ if(dec==pos) {
+ return 0; // Reached the start of the string.
+ }
+ offsets.addOffset(dec);
+ }
+ if(overlap==0) {
+ break;
+ }
+ --overlap;
+ ++dec;
+ }
+ }
+ } else /* USET_SPAN_SIMPLE */ {
+ int32_t maxDec=0, maxOverlap=0;
+ for(i=0; i<stringsLength; ++i) {
+ int32_t overlap=spanBackLengths[i];
+ // For longest match, we do need to try to match even an all-contained string
+ // to find the match from the latest end.
+
+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+ const UChar *s16=string.getBuffer();
+ int32_t length16=string.length();
+
+ // Try to match this string at pos-(length16-overlap)..pos-length16.
+ if(overlap>=LONG_SPAN) {
+ overlap=length16;
+ // Longest match: Need to match fully inside the code point span
+ // to find the match from the latest end.
+ }
+ if(overlap>spanLength) {
+ overlap=spanLength;
+ }
+ int32_t dec=length16-overlap; // Keep dec+overlap==length16.
+ for(;;) {
+ if(dec>pos || overlap<maxOverlap) {
+ break;
+ }
+ // Try to match if the string is longer or ends later.
+ if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
+ matches16CPB(s, pos-dec, length, s16, length16)
+ ) {
+ maxDec=dec; // Longest match from latest end.
+ maxOverlap=overlap;
+ break;
+ }
+ --overlap;
+ ++dec;
+ }
+ }
+
+ if(maxDec!=0 || maxOverlap!=0) {
+ // Longest-match algorithm, and there was a string match.
+ // Simply continue before it.
+ pos-=maxDec;
+ if(pos==0) {
+ return 0; // Reached the start of the string.
+ }
+ spanLength=0; // Match strings from before a string match.
+ continue;
+ }
+ }
+ // Finished trying to match all strings at pos.
+
+ if(spanLength!=0 || pos==length) {
+ // The position is before an unlimited code point span (spanLength!=0),
+ // not before a string match.
+ // The only position where spanLength==0 before a span is pos==length.
+ // Otherwise, an unlimited code point span is only tried again when no
+ // strings match, and if such a non-initial span fails we stop.
+ if(offsets.isEmpty()) {
+ return pos; // No strings matched before a span.
+ }
+ // Match strings from before the next string match.
+ } else {
+ // The position is before a string match (or a single code point).
+ if(offsets.isEmpty()) {
+ // No more strings matched before a previous string match.
+ // Try another code point span from before the last string match.
+ int32_t oldPos=pos;
+ pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
+ spanLength=oldPos-pos;
+ if( pos==0 || // Reached the start of the string, or
+ spanLength==0 // neither strings nor span progressed.
+ ) {
+ return pos;
+ }
+ continue; // spanLength>0: Match strings from before a span.
+ } else {
+ // Try to match only one code point from before a string match if some
+ // string matched beyond it, so that we try all possible positions
+ // and don't overshoot.
+ spanLength=spanOneBack(spanSet, s, pos);
+ if(spanLength>0) {
+ if(spanLength==pos) {
+ return 0; // Reached the start of the string.
+ }
+ // Match strings before this code point.
+ // There cannot be any decrements below it because UnicodeSet strings
+ // contain multiple code points.
+ pos-=spanLength;
+ offsets.shift(spanLength);
+ spanLength=0;
+ continue; // Match strings from before a single code point.
+ }
+ // Match strings from before the next string match.
+ }
+ }
+ pos-=offsets.popMinimum();
+ spanLength=0; // Match strings from before a string match.
+ }
+}
+
+int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
+ if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+ return spanNotUTF8(s, length);
+ }
+ int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
+ if(spanLength==length) {
+ return length;
+ }
+
+ // Consider strings; they may overlap with the span.
+ OffsetList offsets;
+ if(spanCondition==USET_SPAN_CONTAINED) {
+ // Use offset list to try all possibilities.
+ offsets.setMaxLength(maxLength8);
+ }
+ int32_t pos=spanLength, rest=length-pos;
+ int32_t i, stringsLength=strings.size();
+ uint8_t *spanUTF8Lengths=spanLengths;
+ if(all) {
+ spanUTF8Lengths+=2*stringsLength;
+ }
+ for(;;) {
+ const uint8_t *s8=utf8;
+ int32_t length8;
+ if(spanCondition==USET_SPAN_CONTAINED) {
+ for(i=0; i<stringsLength; ++i) {
+ length8=utf8Lengths[i];
+ if(length8==0) {
+ continue; // String not representable in UTF-8.
+ }
+ int32_t overlap=spanUTF8Lengths[i];
+ if(overlap==ALL_CP_CONTAINED) {
+ s8+=length8;
+ continue; // Irrelevant string.
+ }
+
+ // Try to match this string at pos-overlap..pos.
+ if(overlap>=LONG_SPAN) {
+ overlap=length8;
+ // While contained: No point matching fully inside the code point span.
+ U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
+ }
+ if(overlap>spanLength) {
+ overlap=spanLength;
+ }
+ int32_t inc=length8-overlap; // Keep overlap+inc==length8.
+ for(;;) {
+ if(inc>rest) {
+ break;
+ }
+ // Try to match if the increment is not listed already.
+ // Match at code point boundaries. (The UTF-8 strings were converted
+ // from UTF-16 and are guaranteed to be well-formed.)
if(!U8_IS_TRAIL(s[pos-overlap]) &&
!offsets.containsOffset(inc) &&
matches8(s+pos-overlap, s8, length8)) {
- if(inc==rest) {
- return length; // Reached the end of the string.
- }
- offsets.addOffset(inc);
- }
- if(overlap==0) {
- break;
- }
- --overlap;
- ++inc;
- }
- s8+=length8;
- }
- } else /* USET_SPAN_SIMPLE */ {
- int32_t maxInc=0, maxOverlap=0;
- for(i=0; i<stringsLength; ++i) {
- length8=utf8Lengths[i];
- if(length8==0) {
- continue; // String not representable in UTF-8.
- }
- int32_t overlap=spanUTF8Lengths[i];
- // For longest match, we do need to try to match even an all-contained string
- // to find the match from the earliest start.
-
- // Try to match this string at pos-overlap..pos.
- if(overlap>=LONG_SPAN) {
- overlap=length8;
- // Longest match: Need to match fully inside the code point span
- // to find the match from the earliest start.
- }
- if(overlap>spanLength) {
- overlap=spanLength;
- }
- int32_t inc=length8-overlap; // Keep overlap+inc==length8.
- for(;;) {
- if(inc>rest || overlap<maxOverlap) {
- break;
- }
- // Try to match if the string is longer or starts earlier.
- // Match at code point boundaries. (The UTF-8 strings were converted
- // from UTF-16 and are guaranteed to be well-formed.)
+ if(inc==rest) {
+ return length; // Reached the end of the string.
+ }
+ offsets.addOffset(inc);
+ }
+ if(overlap==0) {
+ break;
+ }
+ --overlap;
+ ++inc;
+ }
+ s8+=length8;
+ }
+ } else /* USET_SPAN_SIMPLE */ {
+ int32_t maxInc=0, maxOverlap=0;
+ for(i=0; i<stringsLength; ++i) {
+ length8=utf8Lengths[i];
+ if(length8==0) {
+ continue; // String not representable in UTF-8.
+ }
+ int32_t overlap=spanUTF8Lengths[i];
+ // For longest match, we do need to try to match even an all-contained string
+ // to find the match from the earliest start.
+
+ // Try to match this string at pos-overlap..pos.
+ if(overlap>=LONG_SPAN) {
+ overlap=length8;
+ // Longest match: Need to match fully inside the code point span
+ // to find the match from the earliest start.
+ }
+ if(overlap>spanLength) {
+ overlap=spanLength;
+ }
+ int32_t inc=length8-overlap; // Keep overlap+inc==length8.
+ for(;;) {
+ if(inc>rest || overlap<maxOverlap) {
+ break;
+ }
+ // Try to match if the string is longer or starts earlier.
+ // Match at code point boundaries. (The UTF-8 strings were converted
+ // from UTF-16 and are guaranteed to be well-formed.)
if(!U8_IS_TRAIL(s[pos-overlap]) &&
(overlap>maxOverlap ||
/* redundant overlap==maxOverlap && */ inc>maxInc) &&
matches8(s+pos-overlap, s8, length8)) {
- maxInc=inc; // Longest match from earliest start.
- maxOverlap=overlap;
- break;
- }
- --overlap;
- ++inc;
- }
- s8+=length8;
- }
-
- if(maxInc!=0 || maxOverlap!=0) {
- // Longest-match algorithm, and there was a string match.
- // Simply continue after it.
- pos+=maxInc;
- rest-=maxInc;
- if(rest==0) {
- return length; // Reached the end of the string.
- }
- spanLength=0; // Match strings from after a string match.
- continue;
- }
- }
- // Finished trying to match all strings at pos.
-
- if(spanLength!=0 || pos==0) {
- // The position is after an unlimited code point span (spanLength!=0),
- // not after a string match.
- // The only position where spanLength==0 after a span is pos==0.
- // Otherwise, an unlimited code point span is only tried again when no
- // strings match, and if such a non-initial span fails we stop.
- if(offsets.isEmpty()) {
- return pos; // No strings matched after a span.
- }
- // Match strings from after the next string match.
- } else {
- // The position is after a string match (or a single code point).
- if(offsets.isEmpty()) {
- // No more strings matched after a previous string match.
- // Try another code point span from after the last string match.
- spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
- if( spanLength==rest || // Reached the end of the string, or
- spanLength==0 // neither strings nor span progressed.
- ) {
- return pos+spanLength;
- }
- pos+=spanLength;
- rest-=spanLength;
- continue; // spanLength>0: Match strings from after a span.
- } else {
- // Try to match only one code point from after a string match if some
- // string matched beyond it, so that we try all possible positions
- // and don't overshoot.
- spanLength=spanOneUTF8(spanSet, s+pos, rest);
- if(spanLength>0) {
- if(spanLength==rest) {
- return length; // Reached the end of the string.
- }
- // Match strings after this code point.
- // There cannot be any increments below it because UnicodeSet strings
- // contain multiple code points.
- pos+=spanLength;
- rest-=spanLength;
- offsets.shift(spanLength);
- spanLength=0;
- continue; // Match strings from after a single code point.
- }
- // Match strings from after the next string match.
- }
- }
- int32_t minOffset=offsets.popMinimum();
- pos+=minOffset;
- rest-=minOffset;
- spanLength=0; // Match strings from after a string match.
- }
-}
-
-int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
- if(spanCondition==USET_SPAN_NOT_CONTAINED) {
- return spanNotBackUTF8(s, length);
- }
- int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
- if(pos==0) {
- return 0;
- }
- int32_t spanLength=length-pos;
-
- // Consider strings; they may overlap with the span.
- OffsetList offsets;
- if(spanCondition==USET_SPAN_CONTAINED) {
- // Use offset list to try all possibilities.
- offsets.setMaxLength(maxLength8);
- }
- int32_t i, stringsLength=strings.size();
- uint8_t *spanBackUTF8Lengths=spanLengths;
- if(all) {
- spanBackUTF8Lengths+=3*stringsLength;
- }
- for(;;) {
- const uint8_t *s8=utf8;
- int32_t length8;
- if(spanCondition==USET_SPAN_CONTAINED) {
- for(i=0; i<stringsLength; ++i) {
- length8=utf8Lengths[i];
- if(length8==0) {
- continue; // String not representable in UTF-8.
- }
- int32_t overlap=spanBackUTF8Lengths[i];
- if(overlap==ALL_CP_CONTAINED) {
- s8+=length8;
- continue; // Irrelevant string.
- }
-
- // Try to match this string at pos-(length8-overlap)..pos-length8.
- if(overlap>=LONG_SPAN) {
- overlap=length8;
- // While contained: No point matching fully inside the code point span.
- int32_t len1=0;
- U8_FWD_1(s8, len1, overlap);
- overlap-=len1; // Length of the string minus the first code point.
- }
- if(overlap>spanLength) {
- overlap=spanLength;
- }
- int32_t dec=length8-overlap; // Keep dec+overlap==length8.
- for(;;) {
- if(dec>pos) {
- break;
- }
- // Try to match if the decrement is not listed already.
- // Match at code point boundaries. (The UTF-8 strings were converted
- // from UTF-16 and are guaranteed to be well-formed.)
- if( !U8_IS_TRAIL(s[pos-dec]) &&
- !offsets.containsOffset(dec) &&
- matches8(s+pos-dec, s8, length8)
- ) {
- if(dec==pos) {
- return 0; // Reached the start of the string.
- }
- offsets.addOffset(dec);
- }
- if(overlap==0) {
- break;
- }
- --overlap;
- ++dec;
- }
- s8+=length8;
- }
- } else /* USET_SPAN_SIMPLE */ {
- int32_t maxDec=0, maxOverlap=0;
- for(i=0; i<stringsLength; ++i) {
- length8=utf8Lengths[i];
- if(length8==0) {
- continue; // String not representable in UTF-8.
- }
- int32_t overlap=spanBackUTF8Lengths[i];
- // For longest match, we do need to try to match even an all-contained string
- // to find the match from the latest end.
-
- // Try to match this string at pos-(length8-overlap)..pos-length8.
- if(overlap>=LONG_SPAN) {
- overlap=length8;
- // Longest match: Need to match fully inside the code point span
- // to find the match from the latest end.
- }
- if(overlap>spanLength) {
- overlap=spanLength;
- }
- int32_t dec=length8-overlap; // Keep dec+overlap==length8.
- for(;;) {
- if(dec>pos || overlap<maxOverlap) {
- break;
- }
- // Try to match if the string is longer or ends later.
- // Match at code point boundaries. (The UTF-8 strings were converted
- // from UTF-16 and are guaranteed to be well-formed.)
- if( !U8_IS_TRAIL(s[pos-dec]) &&
- (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
- matches8(s+pos-dec, s8, length8)
- ) {
- maxDec=dec; // Longest match from latest end.
- maxOverlap=overlap;
- break;
- }
- --overlap;
- ++dec;
- }
- s8+=length8;
- }
-
- if(maxDec!=0 || maxOverlap!=0) {
- // Longest-match algorithm, and there was a string match.
- // Simply continue before it.
- pos-=maxDec;
- if(pos==0) {
- return 0; // Reached the start of the string.
- }
- spanLength=0; // Match strings from before a string match.
- continue;
- }
- }
- // Finished trying to match all strings at pos.
-
- if(spanLength!=0 || pos==length) {
- // The position is before an unlimited code point span (spanLength!=0),
- // not before a string match.
- // The only position where spanLength==0 before a span is pos==length.
- // Otherwise, an unlimited code point span is only tried again when no
- // strings match, and if such a non-initial span fails we stop.
- if(offsets.isEmpty()) {
- return pos; // No strings matched before a span.
- }
- // Match strings from before the next string match.
- } else {
- // The position is before a string match (or a single code point).
- if(offsets.isEmpty()) {
- // No more strings matched before a previous string match.
- // Try another code point span from before the last string match.
- int32_t oldPos=pos;
- pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
- spanLength=oldPos-pos;
- if( pos==0 || // Reached the start of the string, or
- spanLength==0 // neither strings nor span progressed.
- ) {
- return pos;
- }
- continue; // spanLength>0: Match strings from before a span.
- } else {
- // Try to match only one code point from before a string match if some
- // string matched beyond it, so that we try all possible positions
- // and don't overshoot.
- spanLength=spanOneBackUTF8(spanSet, s, pos);
- if(spanLength>0) {
- if(spanLength==pos) {
- return 0; // Reached the start of the string.
- }
- // Match strings before this code point.
- // There cannot be any decrements below it because UnicodeSet strings
- // contain multiple code points.
- pos-=spanLength;
- offsets.shift(spanLength);
- spanLength=0;
- continue; // Match strings from before a single code point.
- }
- // Match strings from before the next string match.
- }
- }
- pos-=offsets.popMinimum();
- spanLength=0; // Match strings from before a string match.
- }
-}
-
-/*
- * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
- *
- * Theoretical algorithm:
- * - Iterate through the string, and at each code point boundary:
- * + If the code point there is in the set, then return with the current position.
- * + If a set string matches at the current position, then return with the current position.
- *
- * Optimized implementation:
- *
- * (Same assumption as for span() above.)
- *
- * Create and cache a spanNotSet which contains all of the single code points
- * of the original set but none of its strings.
- * For each set string add its initial code point to the spanNotSet.
- * (Also add its final code point for spanNotBack().)
- *
- * - Loop:
- * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
- * + If the current code point is in the original set, then
- * return the current position.
- * + If any set string matches at the current position, then
- * return the current position.
- * + If there is no match at the current position, neither for the code point there
- * nor for any set string, then skip this code point and continue the loop.
- * This happens for set-string-initial code points that were added to spanNotSet
- * when there is not actually a match for such a set string.
- */
-
-int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
- int32_t pos=0, rest=length;
- int32_t i, stringsLength=strings.size();
- do {
- // Span until we find a code point from the set,
- // or a code point that starts or ends some string.
- i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
- if(i==rest) {
- return length; // Reached the end of the string.
- }
- pos+=i;
- rest-=i;
-
- // Check whether the current code point is in the original set,
- // without the string starts and ends.
- int32_t cpLength=spanOne(spanSet, s+pos, rest);
- if(cpLength>0) {
- return pos; // There is a set element at pos.
- }
-
- // Try to match the strings at pos.
- for(i=0; i<stringsLength; ++i) {
- if(spanLengths[i]==ALL_CP_CONTAINED) {
- continue; // Irrelevant string.
- }
- const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
- const UChar *s16=string.getBuffer();
- int32_t length16=string.length();
- if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
- return pos; // There is a set element at pos.
- }
- }
-
- // The span(while not contained) ended on a string start/end which is
- // not in the original set. Skip this code point and continue.
- // cpLength<0
- pos-=cpLength;
- rest+=cpLength;
- } while(rest!=0);
- return length; // Reached the end of the string.
-}
-
-int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
- int32_t pos=length;
- int32_t i, stringsLength=strings.size();
- do {
- // Span until we find a code point from the set,
- // or a code point that starts or ends some string.
- pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
- if(pos==0) {
- return 0; // Reached the start of the string.
- }
-
- // Check whether the current code point is in the original set,
- // without the string starts and ends.
- int32_t cpLength=spanOneBack(spanSet, s, pos);
- if(cpLength>0) {
- return pos; // There is a set element at pos.
- }
-
- // Try to match the strings at pos.
- for(i=0; i<stringsLength; ++i) {
- // Use spanLengths rather than a spanBackLengths pointer because
- // it is easier and we only need to know whether the string is irrelevant
- // which is the same in either array.
- if(spanLengths[i]==ALL_CP_CONTAINED) {
- continue; // Irrelevant string.
- }
- const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
- const UChar *s16=string.getBuffer();
- int32_t length16=string.length();
- if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
- return pos; // There is a set element at pos.
- }
- }
-
- // The span(while not contained) ended on a string start/end which is
- // not in the original set. Skip this code point and continue.
- // cpLength<0
- pos+=cpLength;
- } while(pos!=0);
- return 0; // Reached the start of the string.
-}
-
-int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
- int32_t pos=0, rest=length;
- int32_t i, stringsLength=strings.size();
- uint8_t *spanUTF8Lengths=spanLengths;
- if(all) {
- spanUTF8Lengths+=2*stringsLength;
- }
- do {
- // Span until we find a code point from the set,
- // or a code point that starts or ends some string.
- i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
- if(i==rest) {
- return length; // Reached the end of the string.
- }
- pos+=i;
- rest-=i;
-
- // Check whether the current code point is in the original set,
- // without the string starts and ends.
- int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
- if(cpLength>0) {
- return pos; // There is a set element at pos.
- }
-
- // Try to match the strings at pos.
- const uint8_t *s8=utf8;
- int32_t length8;
- for(i=0; i<stringsLength; ++i) {
- length8=utf8Lengths[i];
- // ALL_CP_CONTAINED: Irrelevant string.
- if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
- return pos; // There is a set element at pos.
- }
- s8+=length8;
- }
-
- // The span(while not contained) ended on a string start/end which is
- // not in the original set. Skip this code point and continue.
- // cpLength<0
- pos-=cpLength;
- rest+=cpLength;
- } while(rest!=0);
- return length; // Reached the end of the string.
-}
-
-int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
- int32_t pos=length;
- int32_t i, stringsLength=strings.size();
- uint8_t *spanBackUTF8Lengths=spanLengths;
- if(all) {
- spanBackUTF8Lengths+=3*stringsLength;
- }
- do {
- // Span until we find a code point from the set,
- // or a code point that starts or ends some string.
- pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
- if(pos==0) {
- return 0; // Reached the start of the string.
- }
-
- // Check whether the current code point is in the original set,
- // without the string starts and ends.
- int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
- if(cpLength>0) {
- return pos; // There is a set element at pos.
- }
-
- // Try to match the strings at pos.
- const uint8_t *s8=utf8;
- int32_t length8;
- for(i=0; i<stringsLength; ++i) {
- length8=utf8Lengths[i];
- // ALL_CP_CONTAINED: Irrelevant string.
- if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
- return pos; // There is a set element at pos.
- }
- s8+=length8;
- }
-
- // The span(while not contained) ended on a string start/end which is
- // not in the original set. Skip this code point and continue.
- // cpLength<0
- pos+=cpLength;
- } while(pos!=0);
- return 0; // Reached the start of the string.
-}
-
-U_NAMESPACE_END
+ maxInc=inc; // Longest match from earliest start.
+ maxOverlap=overlap;
+ break;
+ }
+ --overlap;
+ ++inc;
+ }
+ s8+=length8;
+ }
+
+ if(maxInc!=0 || maxOverlap!=0) {
+ // Longest-match algorithm, and there was a string match.
+ // Simply continue after it.
+ pos+=maxInc;
+ rest-=maxInc;
+ if(rest==0) {
+ return length; // Reached the end of the string.
+ }
+ spanLength=0; // Match strings from after a string match.
+ continue;
+ }
+ }
+ // Finished trying to match all strings at pos.
+
+ if(spanLength!=0 || pos==0) {
+ // The position is after an unlimited code point span (spanLength!=0),
+ // not after a string match.
+ // The only position where spanLength==0 after a span is pos==0.
+ // Otherwise, an unlimited code point span is only tried again when no
+ // strings match, and if such a non-initial span fails we stop.
+ if(offsets.isEmpty()) {
+ return pos; // No strings matched after a span.
+ }
+ // Match strings from after the next string match.
+ } else {
+ // The position is after a string match (or a single code point).
+ if(offsets.isEmpty()) {
+ // No more strings matched after a previous string match.
+ // Try another code point span from after the last string match.
+ spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
+ if( spanLength==rest || // Reached the end of the string, or
+ spanLength==0 // neither strings nor span progressed.
+ ) {
+ return pos+spanLength;
+ }
+ pos+=spanLength;
+ rest-=spanLength;
+ continue; // spanLength>0: Match strings from after a span.
+ } else {
+ // Try to match only one code point from after a string match if some
+ // string matched beyond it, so that we try all possible positions
+ // and don't overshoot.
+ spanLength=spanOneUTF8(spanSet, s+pos, rest);
+ if(spanLength>0) {
+ if(spanLength==rest) {
+ return length; // Reached the end of the string.
+ }
+ // Match strings after this code point.
+ // There cannot be any increments below it because UnicodeSet strings
+ // contain multiple code points.
+ pos+=spanLength;
+ rest-=spanLength;
+ offsets.shift(spanLength);
+ spanLength=0;
+ continue; // Match strings from after a single code point.
+ }
+ // Match strings from after the next string match.
+ }
+ }
+ int32_t minOffset=offsets.popMinimum();
+ pos+=minOffset;
+ rest-=minOffset;
+ spanLength=0; // Match strings from after a string match.
+ }
+}
+
+int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
+ if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+ return spanNotBackUTF8(s, length);
+ }
+ int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
+ if(pos==0) {
+ return 0;
+ }
+ int32_t spanLength=length-pos;
+
+ // Consider strings; they may overlap with the span.
+ OffsetList offsets;
+ if(spanCondition==USET_SPAN_CONTAINED) {
+ // Use offset list to try all possibilities.
+ offsets.setMaxLength(maxLength8);
+ }
+ int32_t i, stringsLength=strings.size();
+ uint8_t *spanBackUTF8Lengths=spanLengths;
+ if(all) {
+ spanBackUTF8Lengths+=3*stringsLength;
+ }
+ for(;;) {
+ const uint8_t *s8=utf8;
+ int32_t length8;
+ if(spanCondition==USET_SPAN_CONTAINED) {
+ for(i=0; i<stringsLength; ++i) {
+ length8=utf8Lengths[i];
+ if(length8==0) {
+ continue; // String not representable in UTF-8.
+ }
+ int32_t overlap=spanBackUTF8Lengths[i];
+ if(overlap==ALL_CP_CONTAINED) {
+ s8+=length8;
+ continue; // Irrelevant string.
+ }
+
+ // Try to match this string at pos-(length8-overlap)..pos-length8.
+ if(overlap>=LONG_SPAN) {
+ overlap=length8;
+ // While contained: No point matching fully inside the code point span.
+ int32_t len1=0;
+ U8_FWD_1(s8, len1, overlap);
+ overlap-=len1; // Length of the string minus the first code point.
+ }
+ if(overlap>spanLength) {
+ overlap=spanLength;
+ }
+ int32_t dec=length8-overlap; // Keep dec+overlap==length8.
+ for(;;) {
+ if(dec>pos) {
+ break;
+ }
+ // Try to match if the decrement is not listed already.
+ // Match at code point boundaries. (The UTF-8 strings were converted
+ // from UTF-16 and are guaranteed to be well-formed.)
+ if( !U8_IS_TRAIL(s[pos-dec]) &&
+ !offsets.containsOffset(dec) &&
+ matches8(s+pos-dec, s8, length8)
+ ) {
+ if(dec==pos) {
+ return 0; // Reached the start of the string.
+ }
+ offsets.addOffset(dec);
+ }
+ if(overlap==0) {
+ break;
+ }
+ --overlap;
+ ++dec;
+ }
+ s8+=length8;
+ }
+ } else /* USET_SPAN_SIMPLE */ {
+ int32_t maxDec=0, maxOverlap=0;
+ for(i=0; i<stringsLength; ++i) {
+ length8=utf8Lengths[i];
+ if(length8==0) {
+ continue; // String not representable in UTF-8.
+ }
+ int32_t overlap=spanBackUTF8Lengths[i];
+ // For longest match, we do need to try to match even an all-contained string
+ // to find the match from the latest end.
+
+ // Try to match this string at pos-(length8-overlap)..pos-length8.
+ if(overlap>=LONG_SPAN) {
+ overlap=length8;
+ // Longest match: Need to match fully inside the code point span
+ // to find the match from the latest end.
+ }
+ if(overlap>spanLength) {
+ overlap=spanLength;
+ }
+ int32_t dec=length8-overlap; // Keep dec+overlap==length8.
+ for(;;) {
+ if(dec>pos || overlap<maxOverlap) {
+ break;
+ }
+ // Try to match if the string is longer or ends later.
+ // Match at code point boundaries. (The UTF-8 strings were converted
+ // from UTF-16 and are guaranteed to be well-formed.)
+ if( !U8_IS_TRAIL(s[pos-dec]) &&
+ (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
+ matches8(s+pos-dec, s8, length8)
+ ) {
+ maxDec=dec; // Longest match from latest end.
+ maxOverlap=overlap;
+ break;
+ }
+ --overlap;
+ ++dec;
+ }
+ s8+=length8;
+ }
+
+ if(maxDec!=0 || maxOverlap!=0) {
+ // Longest-match algorithm, and there was a string match.
+ // Simply continue before it.
+ pos-=maxDec;
+ if(pos==0) {
+ return 0; // Reached the start of the string.
+ }
+ spanLength=0; // Match strings from before a string match.
+ continue;
+ }
+ }
+ // Finished trying to match all strings at pos.
+
+ if(spanLength!=0 || pos==length) {
+ // The position is before an unlimited code point span (spanLength!=0),
+ // not before a string match.
+ // The only position where spanLength==0 before a span is pos==length.
+ // Otherwise, an unlimited code point span is only tried again when no
+ // strings match, and if such a non-initial span fails we stop.
+ if(offsets.isEmpty()) {
+ return pos; // No strings matched before a span.
+ }
+ // Match strings from before the next string match.
+ } else {
+ // The position is before a string match (or a single code point).
+ if(offsets.isEmpty()) {
+ // No more strings matched before a previous string match.
+ // Try another code point span from before the last string match.
+ int32_t oldPos=pos;
+ pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
+ spanLength=oldPos-pos;
+ if( pos==0 || // Reached the start of the string, or
+ spanLength==0 // neither strings nor span progressed.
+ ) {
+ return pos;
+ }
+ continue; // spanLength>0: Match strings from before a span.
+ } else {
+ // Try to match only one code point from before a string match if some
+ // string matched beyond it, so that we try all possible positions
+ // and don't overshoot.
+ spanLength=spanOneBackUTF8(spanSet, s, pos);
+ if(spanLength>0) {
+ if(spanLength==pos) {
+ return 0; // Reached the start of the string.
+ }
+ // Match strings before this code point.
+ // There cannot be any decrements below it because UnicodeSet strings
+ // contain multiple code points.
+ pos-=spanLength;
+ offsets.shift(spanLength);
+ spanLength=0;
+ continue; // Match strings from before a single code point.
+ }
+ // Match strings from before the next string match.
+ }
+ }
+ pos-=offsets.popMinimum();
+ spanLength=0; // Match strings from before a string match.
+ }
+}
+
+/*
+ * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
+ *
+ * Theoretical algorithm:
+ * - Iterate through the string, and at each code point boundary:
+ * + If the code point there is in the set, then return with the current position.
+ * + If a set string matches at the current position, then return with the current position.
+ *
+ * Optimized implementation:
+ *
+ * (Same assumption as for span() above.)
+ *
+ * Create and cache a spanNotSet which contains all of the single code points
+ * of the original set but none of its strings.
+ * For each set string add its initial code point to the spanNotSet.
+ * (Also add its final code point for spanNotBack().)
+ *
+ * - Loop:
+ * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
+ * + If the current code point is in the original set, then
+ * return the current position.
+ * + If any set string matches at the current position, then
+ * return the current position.
+ * + If there is no match at the current position, neither for the code point there
+ * nor for any set string, then skip this code point and continue the loop.
+ * This happens for set-string-initial code points that were added to spanNotSet
+ * when there is not actually a match for such a set string.
+ */
+
+int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
+ int32_t pos=0, rest=length;
+ int32_t i, stringsLength=strings.size();
+ do {
+ // Span until we find a code point from the set,
+ // or a code point that starts or ends some string.
+ i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
+ if(i==rest) {
+ return length; // Reached the end of the string.
+ }
+ pos+=i;
+ rest-=i;
+
+ // Check whether the current code point is in the original set,
+ // without the string starts and ends.
+ int32_t cpLength=spanOne(spanSet, s+pos, rest);
+ if(cpLength>0) {
+ return pos; // There is a set element at pos.
+ }
+
+ // Try to match the strings at pos.
+ for(i=0; i<stringsLength; ++i) {
+ if(spanLengths[i]==ALL_CP_CONTAINED) {
+ continue; // Irrelevant string.
+ }
+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+ const UChar *s16=string.getBuffer();
+ int32_t length16=string.length();
+ if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
+ return pos; // There is a set element at pos.
+ }
+ }
+
+ // The span(while not contained) ended on a string start/end which is
+ // not in the original set. Skip this code point and continue.
+ // cpLength<0
+ pos-=cpLength;
+ rest+=cpLength;
+ } while(rest!=0);
+ return length; // Reached the end of the string.
+}
+
+int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
+ int32_t pos=length;
+ int32_t i, stringsLength=strings.size();
+ do {
+ // Span until we find a code point from the set,
+ // or a code point that starts or ends some string.
+ pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
+ if(pos==0) {
+ return 0; // Reached the start of the string.
+ }
+
+ // Check whether the current code point is in the original set,
+ // without the string starts and ends.
+ int32_t cpLength=spanOneBack(spanSet, s, pos);
+ if(cpLength>0) {
+ return pos; // There is a set element at pos.
+ }
+
+ // Try to match the strings at pos.
+ for(i=0; i<stringsLength; ++i) {
+ // Use spanLengths rather than a spanBackLengths pointer because
+ // it is easier and we only need to know whether the string is irrelevant
+ // which is the same in either array.
+ if(spanLengths[i]==ALL_CP_CONTAINED) {
+ continue; // Irrelevant string.
+ }
+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+ const UChar *s16=string.getBuffer();
+ int32_t length16=string.length();
+ if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
+ return pos; // There is a set element at pos.
+ }
+ }
+
+ // The span(while not contained) ended on a string start/end which is
+ // not in the original set. Skip this code point and continue.
+ // cpLength<0
+ pos+=cpLength;
+ } while(pos!=0);
+ return 0; // Reached the start of the string.
+}
+
+int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
+ int32_t pos=0, rest=length;
+ int32_t i, stringsLength=strings.size();
+ uint8_t *spanUTF8Lengths=spanLengths;
+ if(all) {
+ spanUTF8Lengths+=2*stringsLength;
+ }
+ do {
+ // Span until we find a code point from the set,
+ // or a code point that starts or ends some string.
+ i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
+ if(i==rest) {
+ return length; // Reached the end of the string.
+ }
+ pos+=i;
+ rest-=i;
+
+ // Check whether the current code point is in the original set,
+ // without the string starts and ends.
+ int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
+ if(cpLength>0) {
+ return pos; // There is a set element at pos.
+ }
+
+ // Try to match the strings at pos.
+ const uint8_t *s8=utf8;
+ int32_t length8;
+ for(i=0; i<stringsLength; ++i) {
+ length8=utf8Lengths[i];
+ // ALL_CP_CONTAINED: Irrelevant string.
+ if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
+ return pos; // There is a set element at pos.
+ }
+ s8+=length8;
+ }
+
+ // The span(while not contained) ended on a string start/end which is
+ // not in the original set. Skip this code point and continue.
+ // cpLength<0
+ pos-=cpLength;
+ rest+=cpLength;
+ } while(rest!=0);
+ return length; // Reached the end of the string.
+}
+
+int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
+ int32_t pos=length;
+ int32_t i, stringsLength=strings.size();
+ uint8_t *spanBackUTF8Lengths=spanLengths;
+ if(all) {
+ spanBackUTF8Lengths+=3*stringsLength;
+ }
+ do {
+ // Span until we find a code point from the set,
+ // or a code point that starts or ends some string.
+ pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
+ if(pos==0) {
+ return 0; // Reached the start of the string.
+ }
+
+ // Check whether the current code point is in the original set,
+ // without the string starts and ends.
+ int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
+ if(cpLength>0) {
+ return pos; // There is a set element at pos.
+ }
+
+ // Try to match the strings at pos.
+ const uint8_t *s8=utf8;
+ int32_t length8;
+ for(i=0; i<stringsLength; ++i) {
+ length8=utf8Lengths[i];
+ // ALL_CP_CONTAINED: Irrelevant string.
+ if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
+ return pos; // There is a set element at pos.
+ }
+ s8+=length8;
+ }
+
+ // The span(while not contained) ended on a string start/end which is
+ // not in the original set. Skip this code point and continue.
+ // cpLength<0
+ pos+=cpLength;
+ } while(pos!=0);
+ return 0; // Reached the start of the string.
+}
+
+U_NAMESPACE_END