Restoring authorship annotation for <[email protected]>. Commit 2 of 2.

author: neksard <[email protected]> 2022-02-10 16:45:33 +0300
committer: Daniil Cherednik <[email protected]> 2022-02-10 16:45:33 +0300
commit: 1d9c550e7c38e051d7961f576013a482003a70d9 (patch)
tree: b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/common/unisetspan.cpp
parent: 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff)
1 files changed, 1496 insertions, 1496 deletions
diff --git a/contrib/libs/icu/common/unisetspan.cpp b/contrib/libs/icu/common/unisetspan.cpp
index 1490a435a1a..68e44d91ee7 100644
--- a/contrib/libs/icu/common/unisetspan.cpp
+++ b/contrib/libs/icu/common/unisetspan.cpp
@@ -1,1509 +1,1509 @@
 // © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html 
-/* 
-****************************************************************************** 
-* 
-*   Copyright (C) 2007-2012, International Business Machines 
-*   Corporation and others.  All Rights Reserved. 
-* 
-****************************************************************************** 
-*   file name:  unisetspan.cpp 
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+******************************************************************************
+*
+*   Copyright (C) 2007-2012, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+******************************************************************************
+*   file name:  unisetspan.cpp
 *   encoding:   UTF-8
-*   tab size:   8 (not used) 
-*   indentation:4 
-* 
-*   created on: 2007mar01 
-*   created by: Markus W. Scherer 
-*/ 
- 
-#include "unicode/utypes.h" 
-#include "unicode/uniset.h" 
-#include "unicode/ustring.h" 
-#include "unicode/utf8.h" 
-#include "unicode/utf16.h" 
-#include "cmemory.h" 
-#include "uvector.h" 
-#include "unisetspan.h" 
- 
-U_NAMESPACE_BEGIN 
- 
-/* 
- * List of offsets from the current position from where to try matching 
- * a code point or a string. 
- * Store offsets rather than indexes to simplify the code and use the same list 
- * for both increments (in span()) and decrements (in spanBack()). 
- * 
- * Assumption: The maximum offset is limited, and the offsets that are stored 
- * at any one time are relatively dense, that is, there are normally no gaps of 
- * hundreds or thousands of offset values. 
- * 
- * The implementation uses a circular buffer of byte flags, 
- * each indicating whether the corresponding offset is in the list. 
- * This avoids inserting into a sorted list of offsets (or absolute indexes) and 
- * physically moving part of the list. 
- * 
- * Note: In principle, the caller should setMaxLength() to the maximum of the 
- * max string length and U16_LENGTH/U8_LENGTH to account for 
- * "long" single code points. 
- * However, this implementation uses at least a staticList with more than 
- * U8_LENGTH entries anyway. 
- * 
- * Note: If maxLength were guaranteed to be no more than 32 or 64, 
- * the list could be stored as bit flags in a single integer. 
- * Rather than handling a circular buffer with a start list index, 
- * the integer would simply be shifted when lower offsets are removed. 
- * UnicodeSet does not have a limit on the lengths of strings. 
- */ 
-class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory. 
-public: 
-    OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 
- 
-    ~OffsetList() { 
-        if(list!=staticList) { 
-            uprv_free(list); 
-        } 
-    } 
- 
-    // Call exactly once if the list is to be used. 
-    void setMaxLength(int32_t maxLength) { 
-        if(maxLength<=(int32_t)sizeof(staticList)) { 
-            capacity=(int32_t)sizeof(staticList); 
-        } else { 
-            UBool *l=(UBool *)uprv_malloc(maxLength); 
-            if(l!=NULL) { 
-                list=l; 
-                capacity=maxLength; 
-            } 
-        } 
-        uprv_memset(list, 0, capacity); 
-    } 
- 
-    void clear() { 
-        uprv_memset(list, 0, capacity); 
-        start=length=0; 
-    } 
- 
-    UBool isEmpty() const { 
-        return (UBool)(length==0); 
-    } 
- 
-    // Reduce all stored offsets by delta, used when the current position 
-    // moves by delta. 
-    // There must not be any offsets lower than delta. 
-    // If there is an offset equal to delta, it is removed. 
-    // delta=[1..maxLength] 
-    void shift(int32_t delta) { 
-        int32_t i=start+delta; 
-        if(i>=capacity) { 
-            i-=capacity; 
-        } 
-        if(list[i]) { 
-            list[i]=FALSE; 
-            --length; 
-        } 
-        start=i; 
-    } 
- 
-    // Add an offset. The list must not contain it yet. 
-    // offset=[1..maxLength] 
-    void addOffset(int32_t offset) { 
-        int32_t i=start+offset; 
-        if(i>=capacity) { 
-            i-=capacity; 
-        } 
-        list[i]=TRUE; 
-        ++length; 
-    } 
- 
-    // offset=[1..maxLength] 
-    UBool containsOffset(int32_t offset) const { 
-        int32_t i=start+offset; 
-        if(i>=capacity) { 
-            i-=capacity; 
-        } 
-        return list[i]; 
-    } 
- 
-    // Find the lowest stored offset from a non-empty list, remove it, 
-    // and reduce all other offsets by this minimum. 
-    // Returns [1..maxLength]. 
-    int32_t popMinimum() { 
-        // Look for the next offset in list[start+1..capacity-1]. 
-        int32_t i=start, result; 
-        while(++i<capacity) { 
-            if(list[i]) { 
-                list[i]=FALSE; 
-                --length; 
-                result=i-start; 
-                start=i; 
-                return result; 
-            } 
-        } 
-        // i==capacity 
- 
-        // Wrap around and look for the next offset in list[0..start]. 
-        // Since the list is not empty, there will be one. 
-        result=capacity-start; 
-        i=0; 
-        while(!list[i]) { 
-            ++i; 
-        } 
-        list[i]=FALSE; 
-        --length; 
-        start=i; 
-        return result+=i; 
-    } 
- 
-private: 
-    UBool *list; 
-    int32_t capacity; 
-    int32_t length; 
-    int32_t start; 
- 
-    UBool staticList[16]; 
-}; 
- 
-// Get the number of UTF-8 bytes for a UTF-16 (sub)string. 
-static int32_t 
-getUTF8Length(const UChar *s, int32_t length) { 
-    UErrorCode errorCode=U_ZERO_ERROR; 
-    int32_t length8=0; 
-    u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 
-    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 
-        return length8; 
-    } else { 
-        // The string contains an unpaired surrogate. 
-        // Ignore this string. 
-        return 0; 
-    } 
-} 
- 
-// Append the UTF-8 version of the string to t and return the appended UTF-8 length. 
-static int32_t 
-appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 
-    UErrorCode errorCode=U_ZERO_ERROR; 
-    int32_t length8=0; 
-    u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 
-    if(U_SUCCESS(errorCode)) { 
-        return length8; 
-    } else { 
-        // The string contains an unpaired surrogate. 
-        // Ignore this string. 
-        return 0; 
-    } 
-} 
- 
-static inline uint8_t 
-makeSpanLengthByte(int32_t spanLength) { 
-    // 0xfe==UnicodeSetStringSpan::LONG_SPAN 
-    return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 
-} 
- 
-// Construct for all variants of span(), or only for any one variant. 
-// Initialize as little as possible, for single use. 
-UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 
-                                           const UVector &setStrings, 
-                                           uint32_t which) 
-        : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 
-          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 
-          utf8Length(0), 
-          maxLength16(0), maxLength8(0), 
-          all((UBool)(which==ALL)) { 
-    spanSet.retainAll(set); 
-    if(which&NOT_CONTAINED) { 
-        // Default to the same sets. 
-        // addToSpanNotSet() will create a separate set if necessary. 
-        pSpanNotSet=&spanSet; 
-    } 
- 
-    // Determine if the strings even need to be taken into account at all for span() etc. 
-    // If any string is relevant, then all strings need to be used for 
-    // span(longest match) but only the relevant ones for span(while contained). 
-    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 
-    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED. 
-    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 
-    // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 
-    int32_t stringsLength=strings.size(); 
- 
-    int32_t i, spanLength; 
-    UBool someRelevant=FALSE; 
-    for(i=0; i<stringsLength; ++i) { 
-        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 
-        const UChar *s16=string.getBuffer(); 
-        int32_t length16=string.length(); 
-        UBool thisRelevant; 
-        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 
-        if(spanLength<length16) {  // Relevant string. 
-            someRelevant=thisRelevant=TRUE; 
-        } else { 
-            thisRelevant=FALSE; 
-        } 
-        if((which&UTF16) && length16>maxLength16) { 
-            maxLength16=length16; 
-        } 
-        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 
-            int32_t length8=getUTF8Length(s16, length16); 
-            utf8Length+=length8; 
-            if(length8>maxLength8) { 
-                maxLength8=length8; 
-            } 
-        } 
-    } 
-    if(!someRelevant) { 
-        maxLength16=maxLength8=0; 
-        return; 
-    } 
- 
-    // Freeze after checking for the need to use strings at all because freezing 
-    // a set takes some time and memory which are wasted if there are no relevant strings. 
-    if(all) { 
-        spanSet.freeze(); 
-    } 
- 
-    uint8_t *spanBackLengths; 
-    uint8_t *spanUTF8Lengths; 
-    uint8_t *spanBackUTF8Lengths; 
- 
-    // Allocate a block of meta data. 
-    int32_t allocSize; 
-    if(all) { 
-        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 
-        allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 
-    } else { 
-        allocSize=stringsLength;  // One set of span lengths. 
-        if(which&UTF8) { 
-            // UTF-8 lengths and UTF-8 strings. 
-            allocSize+=stringsLength*4+utf8Length; 
-        } 
-    } 
-    if(allocSize<=(int32_t)sizeof(staticLengths)) { 
-        utf8Lengths=staticLengths; 
-    } else { 
-        utf8Lengths=(int32_t *)uprv_malloc(allocSize); 
-        if(utf8Lengths==NULL) { 
-            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 
-            return;  // Out of memory. 
-        } 
-    } 
- 
-    if(all) { 
-        // Store span lengths for all span() variants. 
-        spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 
-        spanBackLengths=spanLengths+stringsLength; 
-        spanUTF8Lengths=spanBackLengths+stringsLength; 
-        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 
-        utf8=spanBackUTF8Lengths+stringsLength; 
-    } else { 
-        // Store span lengths for only one span() variant. 
-        if(which&UTF8) { 
-            spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 
-            utf8=spanLengths+stringsLength; 
-        } else { 
-            spanLengths=(uint8_t *)utf8Lengths; 
-        } 
-        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 
-    } 
- 
-    // Set the meta data and pSpanNotSet and write the UTF-8 strings. 
-    int32_t utf8Count=0;  // Count UTF-8 bytes written so far. 
- 
-    for(i=0; i<stringsLength; ++i) { 
-        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 
-        const UChar *s16=string.getBuffer(); 
-        int32_t length16=string.length(); 
-        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 
-        if(spanLength<length16) {  // Relevant string. 
-            if(which&UTF16) { 
-                if(which&CONTAINED) { 
-                    if(which&FWD) { 
-                        spanLengths[i]=makeSpanLengthByte(spanLength); 
-                    } 
-                    if(which&BACK) { 
-                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 
-                        spanBackLengths[i]=makeSpanLengthByte(spanLength); 
-                    } 
-                } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 
-                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag. 
-                } 
-            } 
-            if(which&UTF8) { 
-                uint8_t *s8=utf8+utf8Count; 
-                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 
-                utf8Count+=utf8Lengths[i]=length8; 
-                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8. 
-                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 
-                } else {  // Relevant for UTF-8. 
-                    if(which&CONTAINED) { 
-                        if(which&FWD) { 
-                            spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 
-                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 
-                        } 
-                        if(which&BACK) { 
-                            spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 
-                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 
-                        } 
-                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 
-                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag. 
-                    } 
-                } 
-            } 
-            if(which&NOT_CONTAINED) { 
-                // Add string start and end code points to the spanNotSet so that 
-                // a span(while not contained) stops before any string. 
-                UChar32 c; 
-                if(which&FWD) { 
-                    int32_t len=0; 
-                    U16_NEXT(s16, len, length16, c); 
-                    addToSpanNotSet(c); 
-                } 
-                if(which&BACK) { 
-                    int32_t len=length16; 
-                    U16_PREV(s16, 0, len, c); 
-                    addToSpanNotSet(c); 
-                } 
-            } 
-        } else {  // Irrelevant string. 
-            if(which&UTF8) { 
-                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH. 
-                    uint8_t *s8=utf8+utf8Count; 
-                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 
-                    utf8Count+=utf8Lengths[i]=length8; 
-                } else { 
-                    utf8Lengths[i]=0; 
-                } 
-            } 
-            if(all) { 
-                spanLengths[i]=spanBackLengths[i]= 
-                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 
-                        (uint8_t)ALL_CP_CONTAINED; 
-            } else { 
-                // All spanXYZLengths pointers contain the same address. 
-                spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 
-            } 
-        } 
-    } 
- 
-    // Finish. 
-    if(all) { 
-        pSpanNotSet->freeze(); 
-    } 
-} 
- 
-// Copy constructor. Assumes which==ALL for a frozen set. 
-UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 
-                                           const UVector &newParentSetStrings) 
-        : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 
-          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 
-          utf8Length(otherStringSpan.utf8Length), 
-          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 
-          all(TRUE) { 
-    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 
-        pSpanNotSet=&spanSet; 
-    } else { 
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2007mar01
+*   created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/uniset.h"
+#include "unicode/ustring.h"
+#include "unicode/utf8.h"
+#include "unicode/utf16.h"
+#include "cmemory.h"
+#include "uvector.h"
+#include "unisetspan.h"
+
+U_NAMESPACE_BEGIN
+
+/*
+ * List of offsets from the current position from where to try matching
+ * a code point or a string.
+ * Store offsets rather than indexes to simplify the code and use the same list
+ * for both increments (in span()) and decrements (in spanBack()).
+ *
+ * Assumption: The maximum offset is limited, and the offsets that are stored
+ * at any one time are relatively dense, that is, there are normally no gaps of
+ * hundreds or thousands of offset values.
+ *
+ * The implementation uses a circular buffer of byte flags,
+ * each indicating whether the corresponding offset is in the list.
+ * This avoids inserting into a sorted list of offsets (or absolute indexes) and
+ * physically moving part of the list.
+ *
+ * Note: In principle, the caller should setMaxLength() to the maximum of the
+ * max string length and U16_LENGTH/U8_LENGTH to account for
+ * "long" single code points.
+ * However, this implementation uses at least a staticList with more than
+ * U8_LENGTH entries anyway.
+ *
+ * Note: If maxLength were guaranteed to be no more than 32 or 64,
+ * the list could be stored as bit flags in a single integer.
+ * Rather than handling a circular buffer with a start list index,
+ * the integer would simply be shifted when lower offsets are removed.
+ * UnicodeSet does not have a limit on the lengths of strings.
+ */
+class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
+public:
+    OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
+
+    ~OffsetList() {
+        if(list!=staticList) {
+            uprv_free(list);
+        }
+    }
+
+    // Call exactly once if the list is to be used.
+    void setMaxLength(int32_t maxLength) {
+        if(maxLength<=(int32_t)sizeof(staticList)) {
+            capacity=(int32_t)sizeof(staticList);
+        } else {
+            UBool *l=(UBool *)uprv_malloc(maxLength);
+            if(l!=NULL) {
+                list=l;
+                capacity=maxLength;
+            }
+        }
+        uprv_memset(list, 0, capacity);
+    }
+
+    void clear() {
+        uprv_memset(list, 0, capacity);
+        start=length=0;
+    }
+
+    UBool isEmpty() const {
+        return (UBool)(length==0);
+    }
+
+    // Reduce all stored offsets by delta, used when the current position
+    // moves by delta.
+    // There must not be any offsets lower than delta.
+    // If there is an offset equal to delta, it is removed.
+    // delta=[1..maxLength]
+    void shift(int32_t delta) {
+        int32_t i=start+delta;
+        if(i>=capacity) {
+            i-=capacity;
+        }
+        if(list[i]) {
+            list[i]=FALSE;
+            --length;
+        }
+        start=i;
+    }
+
+    // Add an offset. The list must not contain it yet.
+    // offset=[1..maxLength]
+    void addOffset(int32_t offset) {
+        int32_t i=start+offset;
+        if(i>=capacity) {
+            i-=capacity;
+        }
+        list[i]=TRUE;
+        ++length;
+    }
+
+    // offset=[1..maxLength]
+    UBool containsOffset(int32_t offset) const {
+        int32_t i=start+offset;
+        if(i>=capacity) {
+            i-=capacity;
+        }
+        return list[i];
+    }
+
+    // Find the lowest stored offset from a non-empty list, remove it,
+    // and reduce all other offsets by this minimum.
+    // Returns [1..maxLength].
+    int32_t popMinimum() {
+        // Look for the next offset in list[start+1..capacity-1].
+        int32_t i=start, result;
+        while(++i<capacity) {
+            if(list[i]) {
+                list[i]=FALSE;
+                --length;
+                result=i-start;
+                start=i;
+                return result;
+            }
+        }
+        // i==capacity
+
+        // Wrap around and look for the next offset in list[0..start].
+        // Since the list is not empty, there will be one.
+        result=capacity-start;
+        i=0;
+        while(!list[i]) {
+            ++i;
+        }
+        list[i]=FALSE;
+        --length;
+        start=i;
+        return result+=i;
+    }
+
+private:
+    UBool *list;
+    int32_t capacity;
+    int32_t length;
+    int32_t start;
+
+    UBool staticList[16];
+};
+
+// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
+static int32_t
+getUTF8Length(const UChar *s, int32_t length) {
+    UErrorCode errorCode=U_ZERO_ERROR;
+    int32_t length8=0;
+    u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
+    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
+        return length8;
+    } else {
+        // The string contains an unpaired surrogate.
+        // Ignore this string.
+        return 0;
+    }
+}
+
+// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
+static int32_t
+appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
+    UErrorCode errorCode=U_ZERO_ERROR;
+    int32_t length8=0;
+    u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
+    if(U_SUCCESS(errorCode)) {
+        return length8;
+    } else {
+        // The string contains an unpaired surrogate.
+        // Ignore this string.
+        return 0;
+    }
+}
+
+static inline uint8_t
+makeSpanLengthByte(int32_t spanLength) {
+    // 0xfe==UnicodeSetStringSpan::LONG_SPAN
+    return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
+}
+
+// Construct for all variants of span(), or only for any one variant.
+// Initialize as little as possible, for single use.
+UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
+                                           const UVector &setStrings,
+                                           uint32_t which)
+        : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
+          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
+          utf8Length(0),
+          maxLength16(0), maxLength8(0),
+          all((UBool)(which==ALL)) {
+    spanSet.retainAll(set);
+    if(which&NOT_CONTAINED) {
+        // Default to the same sets.
+        // addToSpanNotSet() will create a separate set if necessary.
+        pSpanNotSet=&spanSet;
+    }
+
+    // Determine if the strings even need to be taken into account at all for span() etc.
+    // If any string is relevant, then all strings need to be used for
+    // span(longest match) but only the relevant ones for span(while contained).
+    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
+    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
+    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
+    // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
+    int32_t stringsLength=strings.size();
+
+    int32_t i, spanLength;
+    UBool someRelevant=FALSE;
+    for(i=0; i<stringsLength; ++i) {
+        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+        const UChar *s16=string.getBuffer();
+        int32_t length16=string.length();
+        UBool thisRelevant;
+        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
+        if(spanLength<length16) {  // Relevant string.
+            someRelevant=thisRelevant=TRUE;
+        } else {
+            thisRelevant=FALSE;
+        }
+        if((which&UTF16) && length16>maxLength16) {
+            maxLength16=length16;
+        }
+        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
+            int32_t length8=getUTF8Length(s16, length16);
+            utf8Length+=length8;
+            if(length8>maxLength8) {
+                maxLength8=length8;
+            }
+        }
+    }
+    if(!someRelevant) {
+        maxLength16=maxLength8=0;
+        return;
+    }
+
+    // Freeze after checking for the need to use strings at all because freezing
+    // a set takes some time and memory which are wasted if there are no relevant strings.
+    if(all) {
+        spanSet.freeze();
+    }
+
+    uint8_t *spanBackLengths;
+    uint8_t *spanUTF8Lengths;
+    uint8_t *spanBackUTF8Lengths;
+
+    // Allocate a block of meta data.
+    int32_t allocSize;
+    if(all) {
+        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
+        allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
+    } else {
+        allocSize=stringsLength;  // One set of span lengths.
+        if(which&UTF8) {
+            // UTF-8 lengths and UTF-8 strings.
+            allocSize+=stringsLength*4+utf8Length;
+        }
+    }
+    if(allocSize<=(int32_t)sizeof(staticLengths)) {
+        utf8Lengths=staticLengths;
+    } else {
+        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
+        if(utf8Lengths==NULL) {
+            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
+            return;  // Out of memory.
+        }
+    }
+
+    if(all) {
+        // Store span lengths for all span() variants.
+        spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
+        spanBackLengths=spanLengths+stringsLength;
+        spanUTF8Lengths=spanBackLengths+stringsLength;
+        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
+        utf8=spanBackUTF8Lengths+stringsLength;
+    } else {
+        // Store span lengths for only one span() variant.
+        if(which&UTF8) {
+            spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
+            utf8=spanLengths+stringsLength;
+        } else {
+            spanLengths=(uint8_t *)utf8Lengths;
+        }
+        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
+    }
+
+    // Set the meta data and pSpanNotSet and write the UTF-8 strings.
+    int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
+
+    for(i=0; i<stringsLength; ++i) {
+        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+        const UChar *s16=string.getBuffer();
+        int32_t length16=string.length();
+        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
+        if(spanLength<length16) {  // Relevant string.
+            if(which&UTF16) {
+                if(which&CONTAINED) {
+                    if(which&FWD) {
+                        spanLengths[i]=makeSpanLengthByte(spanLength);
+                    }
+                    if(which&BACK) {
+                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
+                        spanBackLengths[i]=makeSpanLengthByte(spanLength);
+                    }
+                } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
+                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
+                }
+            }
+            if(which&UTF8) {
+                uint8_t *s8=utf8+utf8Count;
+                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
+                utf8Count+=utf8Lengths[i]=length8;
+                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
+                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
+                } else {  // Relevant for UTF-8.
+                    if(which&CONTAINED) {
+                        if(which&FWD) {
+                            spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
+                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
+                        }
+                        if(which&BACK) {
+                            spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
+                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
+                        }
+                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
+                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
+                    }
+                }
+            }
+            if(which&NOT_CONTAINED) {
+                // Add string start and end code points to the spanNotSet so that
+                // a span(while not contained) stops before any string.
+                UChar32 c;
+                if(which&FWD) {
+                    int32_t len=0;
+                    U16_NEXT(s16, len, length16, c);
+                    addToSpanNotSet(c);
+                }
+                if(which&BACK) {
+                    int32_t len=length16;
+                    U16_PREV(s16, 0, len, c);
+                    addToSpanNotSet(c);
+                }
+            }
+        } else {  // Irrelevant string.
+            if(which&UTF8) {
+                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
+                    uint8_t *s8=utf8+utf8Count;
+                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
+                    utf8Count+=utf8Lengths[i]=length8;
+                } else {
+                    utf8Lengths[i]=0;
+                }
+            }
+            if(all) {
+                spanLengths[i]=spanBackLengths[i]=
+                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
+                        (uint8_t)ALL_CP_CONTAINED;
+            } else {
+                // All spanXYZLengths pointers contain the same address.
+                spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
+            }
+        }
+    }
+
+    // Finish.
+    if(all) {
+        pSpanNotSet->freeze();
+    }
+}
+
+// Copy constructor. Assumes which==ALL for a frozen set.
+UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
+                                           const UVector &newParentSetStrings)
+        : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
+          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
+          utf8Length(otherStringSpan.utf8Length),
+          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
+          all(TRUE) {
+    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
+        pSpanNotSet=&spanSet;
+    } else {
         pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
-    } 
- 
-    // Allocate a block of meta data. 
-    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 
-    int32_t stringsLength=strings.size(); 
-    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 
-    if(allocSize<=(int32_t)sizeof(staticLengths)) { 
-        utf8Lengths=staticLengths; 
-    } else { 
-        utf8Lengths=(int32_t *)uprv_malloc(allocSize); 
-        if(utf8Lengths==NULL) { 
-            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 
-            return;  // Out of memory. 
-        } 
-    } 
- 
-    spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 
-    utf8=spanLengths+stringsLength*4; 
-    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 
-} 
- 
-UnicodeSetStringSpan::~UnicodeSetStringSpan() { 
-    if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 
-        delete pSpanNotSet; 
-    } 
-    if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 
-        uprv_free(utf8Lengths); 
-    } 
-} 
- 
-void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 
-    if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 
-        if(spanSet.contains(c)) { 
-            return;  // Nothing to do. 
-        } 
+    }
+
+    // Allocate a block of meta data.
+    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
+    int32_t stringsLength=strings.size();
+    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
+    if(allocSize<=(int32_t)sizeof(staticLengths)) {
+        utf8Lengths=staticLengths;
+    } else {
+        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
+        if(utf8Lengths==NULL) {
+            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
+            return;  // Out of memory.
+        }
+    }
+
+    spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
+    utf8=spanLengths+stringsLength*4;
+    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
+}
+
+UnicodeSetStringSpan::~UnicodeSetStringSpan() {
+    if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
+        delete pSpanNotSet;
+    }
+    if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
+        uprv_free(utf8Lengths);
+    }
+}
+
+void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
+    if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
+        if(spanSet.contains(c)) {
+            return;  // Nothing to do.
+        }
         UnicodeSet *newSet=spanSet.cloneAsThawed();
-        if(newSet==NULL) { 
-            return;  // Out of memory. 
-        } else { 
-            pSpanNotSet=newSet; 
-        } 
-    } 
-    pSpanNotSet->add(c); 
-} 
- 
-// Compare strings without any argument checks. Requires length>0. 
-static inline UBool 
-matches16(const UChar *s, const UChar *t, int32_t length) { 
-    do { 
-        if(*s++!=*t++) { 
-            return FALSE; 
-        } 
-    } while(--length>0); 
-    return TRUE; 
-} 
- 
-static inline UBool 
-matches8(const uint8_t *s, const uint8_t *t, int32_t length) { 
-    do { 
-        if(*s++!=*t++) { 
-            return FALSE; 
-        } 
-    } while(--length>0); 
-    return TRUE; 
-} 
- 
-// Compare 16-bit Unicode strings (which may be malformed UTF-16) 
-// at code point boundaries. 
-// That is, each edge of a match must not be in the middle of a surrogate pair. 
-static inline UBool 
-matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 
-    s+=start; 
-    limit-=start; 
-    return matches16(s, t, length) && 
-           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 
-           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 
-} 
- 
-// Does the set contain the next code point? 
-// If so, return its length; otherwise return its negative length. 
-static inline int32_t 
-spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 
-    UChar c=*s, c2; 
-    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 
-        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 
-    } 
-    return set.contains(c) ? 1 : -1; 
-} 
- 
-static inline int32_t 
-spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 
-    UChar c=s[length-1], c2; 
-    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 
-        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 
-    } 
-    return set.contains(c) ? 1 : -1; 
-} 
- 
-static inline int32_t 
-spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 
-    UChar32 c=*s; 
+        if(newSet==NULL) {
+            return;  // Out of memory.
+        } else {
+            pSpanNotSet=newSet;
+        }
+    }
+    pSpanNotSet->add(c);
+}
+
+// Compare strings without any argument checks. Requires length>0.
+static inline UBool
+matches16(const UChar *s, const UChar *t, int32_t length) {
+    do {
+        if(*s++!=*t++) {
+            return FALSE;
+        }
+    } while(--length>0);
+    return TRUE;
+}
+
+static inline UBool
+matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
+    do {
+        if(*s++!=*t++) {
+            return FALSE;
+        }
+    } while(--length>0);
+    return TRUE;
+}
+
+// Compare 16-bit Unicode strings (which may be malformed UTF-16)
+// at code point boundaries.
+// That is, each edge of a match must not be in the middle of a surrogate pair.
+static inline UBool
+matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
+    s+=start;
+    limit-=start;
+    return matches16(s, t, length) &&
+           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
+           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
+}
+
+// Does the set contain the next code point?
+// If so, return its length; otherwise return its negative length.
+static inline int32_t
+spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
+    UChar c=*s, c2;
+    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
+        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
+    }
+    return set.contains(c) ? 1 : -1;
+}
+
+static inline int32_t
+spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
+    UChar c=s[length-1], c2;
+    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
+        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
+    }
+    return set.contains(c) ? 1 : -1;
+}
+
+static inline int32_t
+spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
+    UChar32 c=*s;
     if(U8_IS_SINGLE(c)) {
-        return set.contains(c) ? 1 : -1; 
-    } 
-    // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). 
-    int32_t i=0; 
-    U8_NEXT_OR_FFFD(s, i, length, c); 
-    return set.contains(c) ? i : -i; 
-} 
- 
-static inline int32_t 
-spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 
-    UChar32 c=s[length-1]; 
+        return set.contains(c) ? 1 : -1;
+    }
+    // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
+    int32_t i=0;
+    U8_NEXT_OR_FFFD(s, i, length, c);
+    return set.contains(c) ? i : -i;
+}
+
+static inline int32_t
+spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
+    UChar32 c=s[length-1];
     if(U8_IS_SINGLE(c)) {
-        return set.contains(c) ? 1 : -1; 
-    } 
-    int32_t i=length-1; 
-    c=utf8_prevCharSafeBody(s, 0, &i, c, -3); 
-    length-=i; 
-    return set.contains(c) ? length : -length; 
-} 
- 
-/* 
- * Note: In span() when spanLength==0 (after a string match, or at the beginning 
- * after an empty code point span) and in spanNot() and spanNotUTF8(), 
- * string matching could use a binary search 
- * because all string matches are done from the same start index. 
- * 
- * For UTF-8, this would require a comparison function that returns UTF-16 order. 
- * 
- * This optimization should not be necessary for normal UnicodeSets because 
- * most sets have no strings, and most sets with strings have 
- * very few very short strings. 
- * For cases with many strings, it might be better to use a different API 
- * and implementation with a DFA (state machine). 
- */ 
- 
-/* 
- * Algorithm for span(USET_SPAN_CONTAINED) 
- * 
- * Theoretical algorithm: 
- * - Iterate through the string, and at each code point boundary: 
- *   + If the code point there is in the set, then remember to continue after it. 
- *   + If a set string matches at the current position, then remember to continue after it. 
- *   + Either recursively span for each code point or string match, 
- *     or recursively span for all but the shortest one and 
- *     iteratively continue the span with the shortest local match. 
- *   + Remember the longest recursive span (the farthest end point). 
- *   + If there is no match at the current position, neither for the code point there 
- *     nor for any set string, then stop and return the longest recursive span length. 
- * 
- * Optimized implementation: 
- * 
- * (We assume that most sets will have very few very short strings. 
- * A span using a string-less set is extremely fast.) 
- * 
- * Create and cache a spanSet which contains all of the single code points 
- * of the original set but none of its strings. 
- * 
- * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 
- * - Loop: 
- *   + Try to match each set string at the end of the spanLength. 
- *     ~ Set strings that start with set-contained code points must be matched 
- *       with a partial overlap because the recursive algorithm would have tried 
- *       to match them at every position. 
- *     ~ Set strings that entirely consist of set-contained code points 
- *       are irrelevant for span(USET_SPAN_CONTAINED) because the 
- *       recursive algorithm would continue after them anyway 
- *       and find the longest recursive match from their end. 
- *     ~ Rather than recursing, note each end point of a set string match. 
- *   + If no set string matched after spanSet.span(), then return 
- *     with where the spanSet.span() ended. 
- *   + If at least one set string matched after spanSet.span(), then 
- *     pop the shortest string match end point and continue 
- *     the loop, trying to match all set strings from there. 
- *   + If at least one more set string matched after a previous string match, 
- *     then test if the code point after the previous string match is also 
- *     contained in the set. 
- *     Continue the loop with the shortest end point of either this code point 
- *     or a matching set string. 
- *   + If no more set string matched after a previous string match, 
- *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 
- *     Stop if spanLength==0, otherwise continue the loop. 
- * 
- * By noting each end point of a set string match, 
- * the function visits each string position at most once and finishes 
- * in linear time. 
- * 
- * The recursive algorithm may visit the same string position many times 
- * if multiple paths lead to it and finishes in exponential time. 
- */ 
- 
-/* 
- * Algorithm for span(USET_SPAN_SIMPLE) 
- * 
- * Theoretical algorithm: 
- * - Iterate through the string, and at each code point boundary: 
- *   + If the code point there is in the set, then remember to continue after it. 
- *   + If a set string matches at the current position, then remember to continue after it. 
- *   + Continue from the farthest match position and ignore all others. 
- *   + If there is no match at the current position, 
- *     then stop and return the current position. 
- * 
- * Optimized implementation: 
- * 
- * (Same assumption and spanSet as above.) 
- * 
- * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 
- * - Loop: 
- *   + Try to match each set string at the end of the spanLength. 
- *     ~ Set strings that start with set-contained code points must be matched 
- *       with a partial overlap because the standard algorithm would have tried 
- *       to match them earlier. 
- *     ~ Set strings that entirely consist of set-contained code points 
- *       must be matched with a full overlap because the longest-match algorithm 
- *       would hide set string matches that end earlier. 
- *       Such set strings need not be matched earlier inside the code point span 
- *       because the standard algorithm would then have continued after 
- *       the set string match anyway. 
- *     ~ Remember the longest set string match (farthest end point) from the earliest 
- *       starting point. 
- *   + If no set string matched after spanSet.span(), then return 
- *     with where the spanSet.span() ended. 
- *   + If at least one set string matched, then continue the loop after the 
- *     longest match from the earliest position. 
- *   + If no more set string matched after a previous string match, 
- *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 
- *     Stop if spanLength==0, otherwise continue the loop. 
- */ 
- 
-int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 
-    if(spanCondition==USET_SPAN_NOT_CONTAINED) { 
-        return spanNot(s, length); 
-    } 
-    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 
-    if(spanLength==length) { 
-        return length; 
-    } 
- 
-    // Consider strings; they may overlap with the span. 
-    OffsetList offsets; 
-    if(spanCondition==USET_SPAN_CONTAINED) { 
-        // Use offset list to try all possibilities. 
-        offsets.setMaxLength(maxLength16); 
-    } 
-    int32_t pos=spanLength, rest=length-pos; 
-    int32_t i, stringsLength=strings.size(); 
-    for(;;) { 
-        if(spanCondition==USET_SPAN_CONTAINED) { 
-            for(i=0; i<stringsLength; ++i) { 
-                int32_t overlap=spanLengths[i]; 
-                if(overlap==ALL_CP_CONTAINED) { 
-                    continue;  // Irrelevant string. 
-                } 
-                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 
-                const UChar *s16=string.getBuffer(); 
-                int32_t length16=string.length(); 
- 
-                // Try to match this string at pos-overlap..pos. 
-                if(overlap>=LONG_SPAN) { 
-                    overlap=length16; 
-                    // While contained: No point matching fully inside the code point span. 
-                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point. 
-                } 
-                if(overlap>spanLength) { 
-                    overlap=spanLength; 
-                } 
-                int32_t inc=length16-overlap;  // Keep overlap+inc==length16. 
-                for(;;) { 
-                    if(inc>rest) { 
-                        break; 
-                    } 
-                    // Try to match if the increment is not listed already. 
-                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 
-                        if(inc==rest) { 
-                            return length;  // Reached the end of the string. 
-                        } 
-                        offsets.addOffset(inc); 
-                    } 
-                    if(overlap==0) { 
-                        break; 
-                    } 
-                    --overlap; 
-                    ++inc; 
-                } 
-            } 
-        } else /* USET_SPAN_SIMPLE */ { 
-            int32_t maxInc=0, maxOverlap=0; 
-            for(i=0; i<stringsLength; ++i) { 
-                int32_t overlap=spanLengths[i]; 
-                // For longest match, we do need to try to match even an all-contained string 
-                // to find the match from the earliest start. 
- 
-                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 
-                const UChar *s16=string.getBuffer(); 
-                int32_t length16=string.length(); 
- 
-                // Try to match this string at pos-overlap..pos. 
-                if(overlap>=LONG_SPAN) { 
-                    overlap=length16; 
-                    // Longest match: Need to match fully inside the code point span 
-                    // to find the match from the earliest start. 
-                } 
-                if(overlap>spanLength) { 
-                    overlap=spanLength; 
-                } 
-                int32_t inc=length16-overlap;  // Keep overlap+inc==length16. 
-                for(;;) { 
-                    if(inc>rest || overlap<maxOverlap) { 
-                        break; 
-                    } 
-                    // Try to match if the string is longer or starts earlier. 
-                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 
-                        matches16CPB(s, pos-overlap, length, s16, length16) 
-                    ) { 
-                        maxInc=inc;  // Longest match from earliest start. 
-                        maxOverlap=overlap; 
-                        break; 
-                    } 
-                    --overlap; 
-                    ++inc; 
-                } 
-            } 
- 
-            if(maxInc!=0 || maxOverlap!=0) { 
-                // Longest-match algorithm, and there was a string match. 
-                // Simply continue after it. 
-                pos+=maxInc; 
-                rest-=maxInc; 
-                if(rest==0) { 
-                    return length;  // Reached the end of the string. 
-                } 
-                spanLength=0;  // Match strings from after a string match. 
-                continue; 
-            } 
-        } 
-        // Finished trying to match all strings at pos. 
- 
-        if(spanLength!=0 || pos==0) { 
-            // The position is after an unlimited code point span (spanLength!=0), 
-            // not after a string match. 
-            // The only position where spanLength==0 after a span is pos==0. 
-            // Otherwise, an unlimited code point span is only tried again when no 
-            // strings match, and if such a non-initial span fails we stop. 
-            if(offsets.isEmpty()) { 
-                return pos;  // No strings matched after a span. 
-            } 
-            // Match strings from after the next string match. 
-        } else { 
-            // The position is after a string match (or a single code point). 
-            if(offsets.isEmpty()) { 
-                // No more strings matched after a previous string match. 
-                // Try another code point span from after the last string match. 
-                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 
-                if( spanLength==rest || // Reached the end of the string, or 
-                    spanLength==0       // neither strings nor span progressed. 
-                ) { 
-                    return pos+spanLength; 
-                } 
-                pos+=spanLength; 
-                rest-=spanLength; 
-                continue;  // spanLength>0: Match strings from after a span. 
-            } else { 
-                // Try to match only one code point from after a string match if some 
-                // string matched beyond it, so that we try all possible positions 
-                // and don't overshoot. 
-                spanLength=spanOne(spanSet, s+pos, rest); 
-                if(spanLength>0) { 
-                    if(spanLength==rest) { 
-                        return length;  // Reached the end of the string. 
-                    } 
-                    // Match strings after this code point. 
-                    // There cannot be any increments below it because UnicodeSet strings 
-                    // contain multiple code points. 
-                    pos+=spanLength; 
-                    rest-=spanLength; 
-                    offsets.shift(spanLength); 
-                    spanLength=0; 
-                    continue;  // Match strings from after a single code point. 
-                } 
-                // Match strings from after the next string match. 
-            } 
-        } 
-        int32_t minOffset=offsets.popMinimum(); 
-        pos+=minOffset; 
-        rest-=minOffset; 
-        spanLength=0;  // Match strings from after a string match. 
-    } 
-} 
- 
-int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 
-    if(spanCondition==USET_SPAN_NOT_CONTAINED) { 
-        return spanNotBack(s, length); 
-    } 
-    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 
-    if(pos==0) { 
-        return 0; 
-    } 
-    int32_t spanLength=length-pos; 
- 
-    // Consider strings; they may overlap with the span. 
-    OffsetList offsets; 
-    if(spanCondition==USET_SPAN_CONTAINED) { 
-        // Use offset list to try all possibilities. 
-        offsets.setMaxLength(maxLength16); 
-    } 
-    int32_t i, stringsLength=strings.size(); 
-    uint8_t *spanBackLengths=spanLengths; 
-    if(all) { 
-        spanBackLengths+=stringsLength; 
-    } 
-    for(;;) { 
-        if(spanCondition==USET_SPAN_CONTAINED) { 
-            for(i=0; i<stringsLength; ++i) { 
-                int32_t overlap=spanBackLengths[i]; 
-                if(overlap==ALL_CP_CONTAINED) { 
-                    continue;  // Irrelevant string. 
-                } 
-                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 
-                const UChar *s16=string.getBuffer(); 
-                int32_t length16=string.length(); 
- 
-                // Try to match this string at pos-(length16-overlap)..pos-length16. 
-                if(overlap>=LONG_SPAN) { 
-                    overlap=length16; 
-                    // While contained: No point matching fully inside the code point span. 
-                    int32_t len1=0; 
-                    U16_FWD_1(s16, len1, overlap); 
-                    overlap-=len1;  // Length of the string minus the first code point. 
-                } 
-                if(overlap>spanLength) { 
-                    overlap=spanLength; 
-                } 
-                int32_t dec=length16-overlap;  // Keep dec+overlap==length16. 
-                for(;;) { 
-                    if(dec>pos) { 
-                        break; 
-                    } 
-                    // Try to match if the decrement is not listed already. 
-                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 
-                        if(dec==pos) { 
-                            return 0;  // Reached the start of the string. 
-                        } 
-                        offsets.addOffset(dec); 
-                    } 
-                    if(overlap==0) { 
-                        break; 
-                    } 
-                    --overlap; 
-                    ++dec; 
-                } 
-            } 
-        } else /* USET_SPAN_SIMPLE */ { 
-            int32_t maxDec=0, maxOverlap=0; 
-            for(i=0; i<stringsLength; ++i) { 
-                int32_t overlap=spanBackLengths[i]; 
-                // For longest match, we do need to try to match even an all-contained string 
-                // to find the match from the latest end. 
- 
-                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 
-                const UChar *s16=string.getBuffer(); 
-                int32_t length16=string.length(); 
- 
-                // Try to match this string at pos-(length16-overlap)..pos-length16. 
-                if(overlap>=LONG_SPAN) { 
-                    overlap=length16; 
-                    // Longest match: Need to match fully inside the code point span 
-                    // to find the match from the latest end. 
-                } 
-                if(overlap>spanLength) { 
-                    overlap=spanLength; 
-                } 
-                int32_t dec=length16-overlap;  // Keep dec+overlap==length16. 
-                for(;;) { 
-                    if(dec>pos || overlap<maxOverlap) { 
-                        break; 
-                    } 
-                    // Try to match if the string is longer or ends later. 
-                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 
-                        matches16CPB(s, pos-dec, length, s16, length16) 
-                    ) { 
-                        maxDec=dec;  // Longest match from latest end. 
-                        maxOverlap=overlap; 
-                        break; 
-                    } 
-                    --overlap; 
-                    ++dec; 
-                } 
-            } 
- 
-            if(maxDec!=0 || maxOverlap!=0) { 
-                // Longest-match algorithm, and there was a string match. 
-                // Simply continue before it. 
-                pos-=maxDec; 
-                if(pos==0) { 
-                    return 0;  // Reached the start of the string. 
-                } 
-                spanLength=0;  // Match strings from before a string match. 
-                continue; 
-            } 
-        } 
-        // Finished trying to match all strings at pos. 
- 
-        if(spanLength!=0 || pos==length) { 
-            // The position is before an unlimited code point span (spanLength!=0), 
-            // not before a string match. 
-            // The only position where spanLength==0 before a span is pos==length. 
-            // Otherwise, an unlimited code point span is only tried again when no 
-            // strings match, and if such a non-initial span fails we stop. 
-            if(offsets.isEmpty()) { 
-                return pos;  // No strings matched before a span. 
-            } 
-            // Match strings from before the next string match. 
-        } else { 
-            // The position is before a string match (or a single code point). 
-            if(offsets.isEmpty()) { 
-                // No more strings matched before a previous string match. 
-                // Try another code point span from before the last string match. 
-                int32_t oldPos=pos; 
-                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 
-                spanLength=oldPos-pos; 
-                if( pos==0 ||           // Reached the start of the string, or 
-                    spanLength==0       // neither strings nor span progressed. 
-                ) { 
-                    return pos; 
-                } 
-                continue;  // spanLength>0: Match strings from before a span. 
-            } else { 
-                // Try to match only one code point from before a string match if some 
-                // string matched beyond it, so that we try all possible positions 
-                // and don't overshoot. 
-                spanLength=spanOneBack(spanSet, s, pos); 
-                if(spanLength>0) { 
-                    if(spanLength==pos) { 
-                        return 0;  // Reached the start of the string. 
-                    } 
-                    // Match strings before this code point. 
-                    // There cannot be any decrements below it because UnicodeSet strings 
-                    // contain multiple code points. 
-                    pos-=spanLength; 
-                    offsets.shift(spanLength); 
-                    spanLength=0; 
-                    continue;  // Match strings from before a single code point. 
-                } 
-                // Match strings from before the next string match. 
-            } 
-        } 
-        pos-=offsets.popMinimum(); 
-        spanLength=0;  // Match strings from before a string match. 
-    } 
-} 
- 
-int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 
-    if(spanCondition==USET_SPAN_NOT_CONTAINED) { 
-        return spanNotUTF8(s, length); 
-    } 
-    int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 
-    if(spanLength==length) { 
-        return length; 
-    } 
- 
-    // Consider strings; they may overlap with the span. 
-    OffsetList offsets; 
-    if(spanCondition==USET_SPAN_CONTAINED) { 
-        // Use offset list to try all possibilities. 
-        offsets.setMaxLength(maxLength8); 
-    } 
-    int32_t pos=spanLength, rest=length-pos; 
-    int32_t i, stringsLength=strings.size(); 
-    uint8_t *spanUTF8Lengths=spanLengths; 
-    if(all) { 
-        spanUTF8Lengths+=2*stringsLength; 
-    } 
-    for(;;) { 
-        const uint8_t *s8=utf8; 
-        int32_t length8; 
-        if(spanCondition==USET_SPAN_CONTAINED) { 
-            for(i=0; i<stringsLength; ++i) { 
-                length8=utf8Lengths[i]; 
-                if(length8==0) { 
-                    continue;  // String not representable in UTF-8. 
-                } 
-                int32_t overlap=spanUTF8Lengths[i]; 
-                if(overlap==ALL_CP_CONTAINED) { 
-                    s8+=length8; 
-                    continue;  // Irrelevant string. 
-                } 
- 
-                // Try to match this string at pos-overlap..pos. 
-                if(overlap>=LONG_SPAN) { 
-                    overlap=length8; 
-                    // While contained: No point matching fully inside the code point span. 
-                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point. 
-                } 
-                if(overlap>spanLength) { 
-                    overlap=spanLength; 
-                } 
-                int32_t inc=length8-overlap;  // Keep overlap+inc==length8. 
-                for(;;) { 
-                    if(inc>rest) { 
-                        break; 
-                    } 
-                    // Try to match if the increment is not listed already. 
-                    // Match at code point boundaries. (The UTF-8 strings were converted 
-                    // from UTF-16 and are guaranteed to be well-formed.) 
+        return set.contains(c) ? 1 : -1;
+    }
+    int32_t i=length-1;
+    c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
+    length-=i;
+    return set.contains(c) ? length : -length;
+}
+
+/*
+ * Note: In span() when spanLength==0 (after a string match, or at the beginning
+ * after an empty code point span) and in spanNot() and spanNotUTF8(),
+ * string matching could use a binary search
+ * because all string matches are done from the same start index.
+ *
+ * For UTF-8, this would require a comparison function that returns UTF-16 order.
+ *
+ * This optimization should not be necessary for normal UnicodeSets because
+ * most sets have no strings, and most sets with strings have
+ * very few very short strings.
+ * For cases with many strings, it might be better to use a different API
+ * and implementation with a DFA (state machine).
+ */
+
+/*
+ * Algorithm for span(USET_SPAN_CONTAINED)
+ *
+ * Theoretical algorithm:
+ * - Iterate through the string, and at each code point boundary:
+ *   + If the code point there is in the set, then remember to continue after it.
+ *   + If a set string matches at the current position, then remember to continue after it.
+ *   + Either recursively span for each code point or string match,
+ *     or recursively span for all but the shortest one and
+ *     iteratively continue the span with the shortest local match.
+ *   + Remember the longest recursive span (the farthest end point).
+ *   + If there is no match at the current position, neither for the code point there
+ *     nor for any set string, then stop and return the longest recursive span length.
+ *
+ * Optimized implementation:
+ *
+ * (We assume that most sets will have very few very short strings.
+ * A span using a string-less set is extremely fast.)
+ *
+ * Create and cache a spanSet which contains all of the single code points
+ * of the original set but none of its strings.
+ *
+ * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
+ * - Loop:
+ *   + Try to match each set string at the end of the spanLength.
+ *     ~ Set strings that start with set-contained code points must be matched
+ *       with a partial overlap because the recursive algorithm would have tried
+ *       to match them at every position.
+ *     ~ Set strings that entirely consist of set-contained code points
+ *       are irrelevant for span(USET_SPAN_CONTAINED) because the
+ *       recursive algorithm would continue after them anyway
+ *       and find the longest recursive match from their end.
+ *     ~ Rather than recursing, note each end point of a set string match.
+ *   + If no set string matched after spanSet.span(), then return
+ *     with where the spanSet.span() ended.
+ *   + If at least one set string matched after spanSet.span(), then
+ *     pop the shortest string match end point and continue
+ *     the loop, trying to match all set strings from there.
+ *   + If at least one more set string matched after a previous string match,
+ *     then test if the code point after the previous string match is also
+ *     contained in the set.
+ *     Continue the loop with the shortest end point of either this code point
+ *     or a matching set string.
+ *   + If no more set string matched after a previous string match,
+ *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
+ *     Stop if spanLength==0, otherwise continue the loop.
+ *
+ * By noting each end point of a set string match,
+ * the function visits each string position at most once and finishes
+ * in linear time.
+ *
+ * The recursive algorithm may visit the same string position many times
+ * if multiple paths lead to it and finishes in exponential time.
+ */
+
+/*
+ * Algorithm for span(USET_SPAN_SIMPLE)
+ *
+ * Theoretical algorithm:
+ * - Iterate through the string, and at each code point boundary:
+ *   + If the code point there is in the set, then remember to continue after it.
+ *   + If a set string matches at the current position, then remember to continue after it.
+ *   + Continue from the farthest match position and ignore all others.
+ *   + If there is no match at the current position,
+ *     then stop and return the current position.
+ *
+ * Optimized implementation:
+ *
+ * (Same assumption and spanSet as above.)
+ *
+ * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
+ * - Loop:
+ *   + Try to match each set string at the end of the spanLength.
+ *     ~ Set strings that start with set-contained code points must be matched
+ *       with a partial overlap because the standard algorithm would have tried
+ *       to match them earlier.
+ *     ~ Set strings that entirely consist of set-contained code points
+ *       must be matched with a full overlap because the longest-match algorithm
+ *       would hide set string matches that end earlier.
+ *       Such set strings need not be matched earlier inside the code point span
+ *       because the standard algorithm would then have continued after
+ *       the set string match anyway.
+ *     ~ Remember the longest set string match (farthest end point) from the earliest
+ *       starting point.
+ *   + If no set string matched after spanSet.span(), then return
+ *     with where the spanSet.span() ended.
+ *   + If at least one set string matched, then continue the loop after the
+ *     longest match from the earliest position.
+ *   + If no more set string matched after a previous string match,
+ *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
+ *     Stop if spanLength==0, otherwise continue the loop.
+ */
+
+int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
+    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+        return spanNot(s, length);
+    }
+    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
+    if(spanLength==length) {
+        return length;
+    }
+
+    // Consider strings; they may overlap with the span.
+    OffsetList offsets;
+    if(spanCondition==USET_SPAN_CONTAINED) {
+        // Use offset list to try all possibilities.
+        offsets.setMaxLength(maxLength16);
+    }
+    int32_t pos=spanLength, rest=length-pos;
+    int32_t i, stringsLength=strings.size();
+    for(;;) {
+        if(spanCondition==USET_SPAN_CONTAINED) {
+            for(i=0; i<stringsLength; ++i) {
+                int32_t overlap=spanLengths[i];
+                if(overlap==ALL_CP_CONTAINED) {
+                    continue;  // Irrelevant string.
+                }
+                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+                const UChar *s16=string.getBuffer();
+                int32_t length16=string.length();
+
+                // Try to match this string at pos-overlap..pos.
+                if(overlap>=LONG_SPAN) {
+                    overlap=length16;
+                    // While contained: No point matching fully inside the code point span.
+                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
+                }
+                if(overlap>spanLength) {
+                    overlap=spanLength;
+                }
+                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
+                for(;;) {
+                    if(inc>rest) {
+                        break;
+                    }
+                    // Try to match if the increment is not listed already.
+                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
+                        if(inc==rest) {
+                            return length;  // Reached the end of the string.
+                        }
+                        offsets.addOffset(inc);
+                    }
+                    if(overlap==0) {
+                        break;
+                    }
+                    --overlap;
+                    ++inc;
+                }
+            }
+        } else /* USET_SPAN_SIMPLE */ {
+            int32_t maxInc=0, maxOverlap=0;
+            for(i=0; i<stringsLength; ++i) {
+                int32_t overlap=spanLengths[i];
+                // For longest match, we do need to try to match even an all-contained string
+                // to find the match from the earliest start.
+
+                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+                const UChar *s16=string.getBuffer();
+                int32_t length16=string.length();
+
+                // Try to match this string at pos-overlap..pos.
+                if(overlap>=LONG_SPAN) {
+                    overlap=length16;
+                    // Longest match: Need to match fully inside the code point span
+                    // to find the match from the earliest start.
+                }
+                if(overlap>spanLength) {
+                    overlap=spanLength;
+                }
+                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
+                for(;;) {
+                    if(inc>rest || overlap<maxOverlap) {
+                        break;
+                    }
+                    // Try to match if the string is longer or starts earlier.
+                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
+                        matches16CPB(s, pos-overlap, length, s16, length16)
+                    ) {
+                        maxInc=inc;  // Longest match from earliest start.
+                        maxOverlap=overlap;
+                        break;
+                    }
+                    --overlap;
+                    ++inc;
+                }
+            }
+
+            if(maxInc!=0 || maxOverlap!=0) {
+                // Longest-match algorithm, and there was a string match.
+                // Simply continue after it.
+                pos+=maxInc;
+                rest-=maxInc;
+                if(rest==0) {
+                    return length;  // Reached the end of the string.
+                }
+                spanLength=0;  // Match strings from after a string match.
+                continue;
+            }
+        }
+        // Finished trying to match all strings at pos.
+
+        if(spanLength!=0 || pos==0) {
+            // The position is after an unlimited code point span (spanLength!=0),
+            // not after a string match.
+            // The only position where spanLength==0 after a span is pos==0.
+            // Otherwise, an unlimited code point span is only tried again when no
+            // strings match, and if such a non-initial span fails we stop.
+            if(offsets.isEmpty()) {
+                return pos;  // No strings matched after a span.
+            }
+            // Match strings from after the next string match.
+        } else {
+            // The position is after a string match (or a single code point).
+            if(offsets.isEmpty()) {
+                // No more strings matched after a previous string match.
+                // Try another code point span from after the last string match.
+                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
+                if( spanLength==rest || // Reached the end of the string, or
+                    spanLength==0       // neither strings nor span progressed.
+                ) {
+                    return pos+spanLength;
+                }
+                pos+=spanLength;
+                rest-=spanLength;
+                continue;  // spanLength>0: Match strings from after a span.
+            } else {
+                // Try to match only one code point from after a string match if some
+                // string matched beyond it, so that we try all possible positions
+                // and don't overshoot.
+                spanLength=spanOne(spanSet, s+pos, rest);
+                if(spanLength>0) {
+                    if(spanLength==rest) {
+                        return length;  // Reached the end of the string.
+                    }
+                    // Match strings after this code point.
+                    // There cannot be any increments below it because UnicodeSet strings
+                    // contain multiple code points.
+                    pos+=spanLength;
+                    rest-=spanLength;
+                    offsets.shift(spanLength);
+                    spanLength=0;
+                    continue;  // Match strings from after a single code point.
+                }
+                // Match strings from after the next string match.
+            }
+        }
+        int32_t minOffset=offsets.popMinimum();
+        pos+=minOffset;
+        rest-=minOffset;
+        spanLength=0;  // Match strings from after a string match.
+    }
+}
+
+int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
+    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+        return spanNotBack(s, length);
+    }
+    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
+    if(pos==0) {
+        return 0;
+    }
+    int32_t spanLength=length-pos;
+
+    // Consider strings; they may overlap with the span.
+    OffsetList offsets;
+    if(spanCondition==USET_SPAN_CONTAINED) {
+        // Use offset list to try all possibilities.
+        offsets.setMaxLength(maxLength16);
+    }
+    int32_t i, stringsLength=strings.size();
+    uint8_t *spanBackLengths=spanLengths;
+    if(all) {
+        spanBackLengths+=stringsLength;
+    }
+    for(;;) {
+        if(spanCondition==USET_SPAN_CONTAINED) {
+            for(i=0; i<stringsLength; ++i) {
+                int32_t overlap=spanBackLengths[i];
+                if(overlap==ALL_CP_CONTAINED) {
+                    continue;  // Irrelevant string.
+                }
+                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+                const UChar *s16=string.getBuffer();
+                int32_t length16=string.length();
+
+                // Try to match this string at pos-(length16-overlap)..pos-length16.
+                if(overlap>=LONG_SPAN) {
+                    overlap=length16;
+                    // While contained: No point matching fully inside the code point span.
+                    int32_t len1=0;
+                    U16_FWD_1(s16, len1, overlap);
+                    overlap-=len1;  // Length of the string minus the first code point.
+                }
+                if(overlap>spanLength) {
+                    overlap=spanLength;
+                }
+                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
+                for(;;) {
+                    if(dec>pos) {
+                        break;
+                    }
+                    // Try to match if the decrement is not listed already.
+                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
+                        if(dec==pos) {
+                            return 0;  // Reached the start of the string.
+                        }
+                        offsets.addOffset(dec);
+                    }
+                    if(overlap==0) {
+                        break;
+                    }
+                    --overlap;
+                    ++dec;
+                }
+            }
+        } else /* USET_SPAN_SIMPLE */ {
+            int32_t maxDec=0, maxOverlap=0;
+            for(i=0; i<stringsLength; ++i) {
+                int32_t overlap=spanBackLengths[i];
+                // For longest match, we do need to try to match even an all-contained string
+                // to find the match from the latest end.
+
+                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+                const UChar *s16=string.getBuffer();
+                int32_t length16=string.length();
+
+                // Try to match this string at pos-(length16-overlap)..pos-length16.
+                if(overlap>=LONG_SPAN) {
+                    overlap=length16;
+                    // Longest match: Need to match fully inside the code point span
+                    // to find the match from the latest end.
+                }
+                if(overlap>spanLength) {
+                    overlap=spanLength;
+                }
+                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
+                for(;;) {
+                    if(dec>pos || overlap<maxOverlap) {
+                        break;
+                    }
+                    // Try to match if the string is longer or ends later.
+                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
+                        matches16CPB(s, pos-dec, length, s16, length16)
+                    ) {
+                        maxDec=dec;  // Longest match from latest end.
+                        maxOverlap=overlap;
+                        break;
+                    }
+                    --overlap;
+                    ++dec;
+                }
+            }
+
+            if(maxDec!=0 || maxOverlap!=0) {
+                // Longest-match algorithm, and there was a string match.
+                // Simply continue before it.
+                pos-=maxDec;
+                if(pos==0) {
+                    return 0;  // Reached the start of the string.
+                }
+                spanLength=0;  // Match strings from before a string match.
+                continue;
+            }
+        }
+        // Finished trying to match all strings at pos.
+
+        if(spanLength!=0 || pos==length) {
+            // The position is before an unlimited code point span (spanLength!=0),
+            // not before a string match.
+            // The only position where spanLength==0 before a span is pos==length.
+            // Otherwise, an unlimited code point span is only tried again when no
+            // strings match, and if such a non-initial span fails we stop.
+            if(offsets.isEmpty()) {
+                return pos;  // No strings matched before a span.
+            }
+            // Match strings from before the next string match.
+        } else {
+            // The position is before a string match (or a single code point).
+            if(offsets.isEmpty()) {
+                // No more strings matched before a previous string match.
+                // Try another code point span from before the last string match.
+                int32_t oldPos=pos;
+                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
+                spanLength=oldPos-pos;
+                if( pos==0 ||           // Reached the start of the string, or
+                    spanLength==0       // neither strings nor span progressed.
+                ) {
+                    return pos;
+                }
+                continue;  // spanLength>0: Match strings from before a span.
+            } else {
+                // Try to match only one code point from before a string match if some
+                // string matched beyond it, so that we try all possible positions
+                // and don't overshoot.
+                spanLength=spanOneBack(spanSet, s, pos);
+                if(spanLength>0) {
+                    if(spanLength==pos) {
+                        return 0;  // Reached the start of the string.
+                    }
+                    // Match strings before this code point.
+                    // There cannot be any decrements below it because UnicodeSet strings
+                    // contain multiple code points.
+                    pos-=spanLength;
+                    offsets.shift(spanLength);
+                    spanLength=0;
+                    continue;  // Match strings from before a single code point.
+                }
+                // Match strings from before the next string match.
+            }
+        }
+        pos-=offsets.popMinimum();
+        spanLength=0;  // Match strings from before a string match.
+    }
+}
+
+int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
+    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+        return spanNotUTF8(s, length);
+    }
+    int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
+    if(spanLength==length) {
+        return length;
+    }
+
+    // Consider strings; they may overlap with the span.
+    OffsetList offsets;
+    if(spanCondition==USET_SPAN_CONTAINED) {
+        // Use offset list to try all possibilities.
+        offsets.setMaxLength(maxLength8);
+    }
+    int32_t pos=spanLength, rest=length-pos;
+    int32_t i, stringsLength=strings.size();
+    uint8_t *spanUTF8Lengths=spanLengths;
+    if(all) {
+        spanUTF8Lengths+=2*stringsLength;
+    }
+    for(;;) {
+        const uint8_t *s8=utf8;
+        int32_t length8;
+        if(spanCondition==USET_SPAN_CONTAINED) {
+            for(i=0; i<stringsLength; ++i) {
+                length8=utf8Lengths[i];
+                if(length8==0) {
+                    continue;  // String not representable in UTF-8.
+                }
+                int32_t overlap=spanUTF8Lengths[i];
+                if(overlap==ALL_CP_CONTAINED) {
+                    s8+=length8;
+                    continue;  // Irrelevant string.
+                }
+
+                // Try to match this string at pos-overlap..pos.
+                if(overlap>=LONG_SPAN) {
+                    overlap=length8;
+                    // While contained: No point matching fully inside the code point span.
+                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
+                }
+                if(overlap>spanLength) {
+                    overlap=spanLength;
+                }
+                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
+                for(;;) {
+                    if(inc>rest) {
+                        break;
+                    }
+                    // Try to match if the increment is not listed already.
+                    // Match at code point boundaries. (The UTF-8 strings were converted
+                    // from UTF-16 and are guaranteed to be well-formed.)
                     if(!U8_IS_TRAIL(s[pos-overlap]) &&
                             !offsets.containsOffset(inc) &&
                             matches8(s+pos-overlap, s8, length8)) {
-                        if(inc==rest) { 
-                            return length;  // Reached the end of the string. 
-                        } 
-                        offsets.addOffset(inc); 
-                    } 
-                    if(overlap==0) { 
-                        break; 
-                    } 
-                    --overlap; 
-                    ++inc; 
-                } 
-                s8+=length8; 
-            } 
-        } else /* USET_SPAN_SIMPLE */ { 
-            int32_t maxInc=0, maxOverlap=0; 
-            for(i=0; i<stringsLength; ++i) { 
-                length8=utf8Lengths[i]; 
-                if(length8==0) { 
-                    continue;  // String not representable in UTF-8. 
-                } 
-                int32_t overlap=spanUTF8Lengths[i]; 
-                // For longest match, we do need to try to match even an all-contained string 
-                // to find the match from the earliest start. 
- 
-                // Try to match this string at pos-overlap..pos. 
-                if(overlap>=LONG_SPAN) { 
-                    overlap=length8; 
-                    // Longest match: Need to match fully inside the code point span 
-                    // to find the match from the earliest start. 
-                } 
-                if(overlap>spanLength) { 
-                    overlap=spanLength; 
-                } 
-                int32_t inc=length8-overlap;  // Keep overlap+inc==length8. 
-                for(;;) { 
-                    if(inc>rest || overlap<maxOverlap) { 
-                        break; 
-                    } 
-                    // Try to match if the string is longer or starts earlier. 
-                    // Match at code point boundaries. (The UTF-8 strings were converted 
-                    // from UTF-16 and are guaranteed to be well-formed.) 
+                        if(inc==rest) {
+                            return length;  // Reached the end of the string.
+                        }
+                        offsets.addOffset(inc);
+                    }
+                    if(overlap==0) {
+                        break;
+                    }
+                    --overlap;
+                    ++inc;
+                }
+                s8+=length8;
+            }
+        } else /* USET_SPAN_SIMPLE */ {
+            int32_t maxInc=0, maxOverlap=0;
+            for(i=0; i<stringsLength; ++i) {
+                length8=utf8Lengths[i];
+                if(length8==0) {
+                    continue;  // String not representable in UTF-8.
+                }
+                int32_t overlap=spanUTF8Lengths[i];
+                // For longest match, we do need to try to match even an all-contained string
+                // to find the match from the earliest start.
+
+                // Try to match this string at pos-overlap..pos.
+                if(overlap>=LONG_SPAN) {
+                    overlap=length8;
+                    // Longest match: Need to match fully inside the code point span
+                    // to find the match from the earliest start.
+                }
+                if(overlap>spanLength) {
+                    overlap=spanLength;
+                }
+                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
+                for(;;) {
+                    if(inc>rest || overlap<maxOverlap) {
+                        break;
+                    }
+                    // Try to match if the string is longer or starts earlier.
+                    // Match at code point boundaries. (The UTF-8 strings were converted
+                    // from UTF-16 and are guaranteed to be well-formed.)
                     if(!U8_IS_TRAIL(s[pos-overlap]) &&
                             (overlap>maxOverlap ||
                                 /* redundant overlap==maxOverlap && */ inc>maxInc) &&
                             matches8(s+pos-overlap, s8, length8)) {
-                        maxInc=inc;  // Longest match from earliest start. 
-                        maxOverlap=overlap; 
-                        break; 
-                    } 
-                    --overlap; 
-                    ++inc; 
-                } 
-                s8+=length8; 
-            } 
- 
-            if(maxInc!=0 || maxOverlap!=0) { 
-                // Longest-match algorithm, and there was a string match. 
-                // Simply continue after it. 
-                pos+=maxInc; 
-                rest-=maxInc; 
-                if(rest==0) { 
-                    return length;  // Reached the end of the string. 
-                } 
-                spanLength=0;  // Match strings from after a string match. 
-                continue; 
-            } 
-        } 
-        // Finished trying to match all strings at pos. 
- 
-        if(spanLength!=0 || pos==0) { 
-            // The position is after an unlimited code point span (spanLength!=0), 
-            // not after a string match. 
-            // The only position where spanLength==0 after a span is pos==0. 
-            // Otherwise, an unlimited code point span is only tried again when no 
-            // strings match, and if such a non-initial span fails we stop. 
-            if(offsets.isEmpty()) { 
-                return pos;  // No strings matched after a span. 
-            } 
-            // Match strings from after the next string match. 
-        } else { 
-            // The position is after a string match (or a single code point). 
-            if(offsets.isEmpty()) { 
-                // No more strings matched after a previous string match. 
-                // Try another code point span from after the last string match. 
-                spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 
-                if( spanLength==rest || // Reached the end of the string, or 
-                    spanLength==0       // neither strings nor span progressed. 
-                ) { 
-                    return pos+spanLength; 
-                } 
-                pos+=spanLength; 
-                rest-=spanLength; 
-                continue;  // spanLength>0: Match strings from after a span. 
-            } else { 
-                // Try to match only one code point from after a string match if some 
-                // string matched beyond it, so that we try all possible positions 
-                // and don't overshoot. 
-                spanLength=spanOneUTF8(spanSet, s+pos, rest); 
-                if(spanLength>0) { 
-                    if(spanLength==rest) { 
-                        return length;  // Reached the end of the string. 
-                    } 
-                    // Match strings after this code point. 
-                    // There cannot be any increments below it because UnicodeSet strings 
-                    // contain multiple code points. 
-                    pos+=spanLength; 
-                    rest-=spanLength; 
-                    offsets.shift(spanLength); 
-                    spanLength=0; 
-                    continue;  // Match strings from after a single code point. 
-                } 
-                // Match strings from after the next string match. 
-            } 
-        } 
-        int32_t minOffset=offsets.popMinimum(); 
-        pos+=minOffset; 
-        rest-=minOffset; 
-        spanLength=0;  // Match strings from after a string match. 
-    } 
-} 
- 
-int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 
-    if(spanCondition==USET_SPAN_NOT_CONTAINED) { 
-        return spanNotBackUTF8(s, length); 
-    } 
-    int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 
-    if(pos==0) { 
-        return 0; 
-    } 
-    int32_t spanLength=length-pos; 
- 
-    // Consider strings; they may overlap with the span. 
-    OffsetList offsets; 
-    if(spanCondition==USET_SPAN_CONTAINED) { 
-        // Use offset list to try all possibilities. 
-        offsets.setMaxLength(maxLength8); 
-    } 
-    int32_t i, stringsLength=strings.size(); 
-    uint8_t *spanBackUTF8Lengths=spanLengths; 
-    if(all) { 
-        spanBackUTF8Lengths+=3*stringsLength; 
-    } 
-    for(;;) { 
-        const uint8_t *s8=utf8; 
-        int32_t length8; 
-        if(spanCondition==USET_SPAN_CONTAINED) { 
-            for(i=0; i<stringsLength; ++i) { 
-                length8=utf8Lengths[i]; 
-                if(length8==0) { 
-                    continue;  // String not representable in UTF-8. 
-                } 
-                int32_t overlap=spanBackUTF8Lengths[i]; 
-                if(overlap==ALL_CP_CONTAINED) { 
-                    s8+=length8; 
-                    continue;  // Irrelevant string. 
-                } 
- 
-                // Try to match this string at pos-(length8-overlap)..pos-length8. 
-                if(overlap>=LONG_SPAN) { 
-                    overlap=length8; 
-                    // While contained: No point matching fully inside the code point span. 
-                    int32_t len1=0; 
-                    U8_FWD_1(s8, len1, overlap); 
-                    overlap-=len1;  // Length of the string minus the first code point. 
-                } 
-                if(overlap>spanLength) { 
-                    overlap=spanLength; 
-                } 
-                int32_t dec=length8-overlap;  // Keep dec+overlap==length8. 
-                for(;;) { 
-                    if(dec>pos) { 
-                        break; 
-                    } 
-                    // Try to match if the decrement is not listed already. 
-                    // Match at code point boundaries. (The UTF-8 strings were converted 
-                    // from UTF-16 and are guaranteed to be well-formed.) 
-                    if( !U8_IS_TRAIL(s[pos-dec]) && 
-                        !offsets.containsOffset(dec) && 
-                        matches8(s+pos-dec, s8, length8) 
-                    ) { 
-                        if(dec==pos) { 
-                            return 0;  // Reached the start of the string. 
-                        } 
-                        offsets.addOffset(dec); 
-                    } 
-                    if(overlap==0) { 
-                        break; 
-                    } 
-                    --overlap; 
-                    ++dec; 
-                } 
-                s8+=length8; 
-            } 
-        } else /* USET_SPAN_SIMPLE */ { 
-            int32_t maxDec=0, maxOverlap=0; 
-            for(i=0; i<stringsLength; ++i) { 
-                length8=utf8Lengths[i]; 
-                if(length8==0) { 
-                    continue;  // String not representable in UTF-8. 
-                } 
-                int32_t overlap=spanBackUTF8Lengths[i]; 
-                // For longest match, we do need to try to match even an all-contained string 
-                // to find the match from the latest end. 
- 
-                // Try to match this string at pos-(length8-overlap)..pos-length8. 
-                if(overlap>=LONG_SPAN) { 
-                    overlap=length8; 
-                    // Longest match: Need to match fully inside the code point span 
-                    // to find the match from the latest end. 
-                } 
-                if(overlap>spanLength) { 
-                    overlap=spanLength; 
-                } 
-                int32_t dec=length8-overlap;  // Keep dec+overlap==length8. 
-                for(;;) { 
-                    if(dec>pos || overlap<maxOverlap) { 
-                        break; 
-                    } 
-                    // Try to match if the string is longer or ends later. 
-                    // Match at code point boundaries. (The UTF-8 strings were converted 
-                    // from UTF-16 and are guaranteed to be well-formed.) 
-                    if( !U8_IS_TRAIL(s[pos-dec]) && 
-                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 
-                        matches8(s+pos-dec, s8, length8) 
-                    ) { 
-                        maxDec=dec;  // Longest match from latest end. 
-                        maxOverlap=overlap; 
-                        break; 
-                    } 
-                    --overlap; 
-                    ++dec; 
-                } 
-                s8+=length8; 
-            } 
- 
-            if(maxDec!=0 || maxOverlap!=0) { 
-                // Longest-match algorithm, and there was a string match. 
-                // Simply continue before it. 
-                pos-=maxDec; 
-                if(pos==0) { 
-                    return 0;  // Reached the start of the string. 
-                } 
-                spanLength=0;  // Match strings from before a string match. 
-                continue; 
-            } 
-        } 
-        // Finished trying to match all strings at pos. 
- 
-        if(spanLength!=0 || pos==length) { 
-            // The position is before an unlimited code point span (spanLength!=0), 
-            // not before a string match. 
-            // The only position where spanLength==0 before a span is pos==length. 
-            // Otherwise, an unlimited code point span is only tried again when no 
-            // strings match, and if such a non-initial span fails we stop. 
-            if(offsets.isEmpty()) { 
-                return pos;  // No strings matched before a span. 
-            } 
-            // Match strings from before the next string match. 
-        } else { 
-            // The position is before a string match (or a single code point). 
-            if(offsets.isEmpty()) { 
-                // No more strings matched before a previous string match. 
-                // Try another code point span from before the last string match. 
-                int32_t oldPos=pos; 
-                pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 
-                spanLength=oldPos-pos; 
-                if( pos==0 ||           // Reached the start of the string, or 
-                    spanLength==0       // neither strings nor span progressed. 
-                ) { 
-                    return pos; 
-                } 
-                continue;  // spanLength>0: Match strings from before a span. 
-            } else { 
-                // Try to match only one code point from before a string match if some 
-                // string matched beyond it, so that we try all possible positions 
-                // and don't overshoot. 
-                spanLength=spanOneBackUTF8(spanSet, s, pos); 
-                if(spanLength>0) { 
-                    if(spanLength==pos) { 
-                        return 0;  // Reached the start of the string. 
-                    } 
-                    // Match strings before this code point. 
-                    // There cannot be any decrements below it because UnicodeSet strings 
-                    // contain multiple code points. 
-                    pos-=spanLength; 
-                    offsets.shift(spanLength); 
-                    spanLength=0; 
-                    continue;  // Match strings from before a single code point. 
-                } 
-                // Match strings from before the next string match. 
-            } 
-        } 
-        pos-=offsets.popMinimum(); 
-        spanLength=0;  // Match strings from before a string match. 
-    } 
-} 
- 
-/* 
- * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 
- * 
- * Theoretical algorithm: 
- * - Iterate through the string, and at each code point boundary: 
- *   + If the code point there is in the set, then return with the current position. 
- *   + If a set string matches at the current position, then return with the current position. 
- * 
- * Optimized implementation: 
- * 
- * (Same assumption as for span() above.) 
- * 
- * Create and cache a spanNotSet which contains all of the single code points 
- * of the original set but none of its strings. 
- * For each set string add its initial code point to the spanNotSet. 
- * (Also add its final code point for spanNotBack().) 
- * 
- * - Loop: 
- *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 
- *   + If the current code point is in the original set, then 
- *     return the current position. 
- *   + If any set string matches at the current position, then 
- *     return the current position. 
- *   + If there is no match at the current position, neither for the code point there 
- *     nor for any set string, then skip this code point and continue the loop. 
- *     This happens for set-string-initial code points that were added to spanNotSet 
- *     when there is not actually a match for such a set string. 
- */ 
- 
-int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 
-    int32_t pos=0, rest=length; 
-    int32_t i, stringsLength=strings.size(); 
-    do { 
-        // Span until we find a code point from the set, 
-        // or a code point that starts or ends some string. 
-        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 
-        if(i==rest) { 
-            return length;  // Reached the end of the string. 
-        } 
-        pos+=i; 
-        rest-=i; 
- 
-        // Check whether the current code point is in the original set, 
-        // without the string starts and ends. 
-        int32_t cpLength=spanOne(spanSet, s+pos, rest); 
-        if(cpLength>0) { 
-            return pos;  // There is a set element at pos. 
-        } 
- 
-        // Try to match the strings at pos. 
-        for(i=0; i<stringsLength; ++i) { 
-            if(spanLengths[i]==ALL_CP_CONTAINED) { 
-                continue;  // Irrelevant string. 
-            } 
-            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 
-            const UChar *s16=string.getBuffer(); 
-            int32_t length16=string.length(); 
-            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 
-                return pos;  // There is a set element at pos. 
-            } 
-        } 
- 
-        // The span(while not contained) ended on a string start/end which is 
-        // not in the original set. Skip this code point and continue. 
-        // cpLength<0 
-        pos-=cpLength; 
-        rest+=cpLength; 
-    } while(rest!=0); 
-    return length;  // Reached the end of the string. 
-} 
- 
-int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 
-    int32_t pos=length; 
-    int32_t i, stringsLength=strings.size(); 
-    do { 
-        // Span until we find a code point from the set, 
-        // or a code point that starts or ends some string. 
-        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 
-        if(pos==0) { 
-            return 0;  // Reached the start of the string. 
-        } 
- 
-        // Check whether the current code point is in the original set, 
-        // without the string starts and ends. 
-        int32_t cpLength=spanOneBack(spanSet, s, pos); 
-        if(cpLength>0) { 
-            return pos;  // There is a set element at pos. 
-        } 
- 
-        // Try to match the strings at pos. 
-        for(i=0; i<stringsLength; ++i) { 
-            // Use spanLengths rather than a spanBackLengths pointer because 
-            // it is easier and we only need to know whether the string is irrelevant 
-            // which is the same in either array. 
-            if(spanLengths[i]==ALL_CP_CONTAINED) { 
-                continue;  // Irrelevant string. 
-            } 
-            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 
-            const UChar *s16=string.getBuffer(); 
-            int32_t length16=string.length(); 
-            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 
-                return pos;  // There is a set element at pos. 
-            } 
-        } 
- 
-        // The span(while not contained) ended on a string start/end which is 
-        // not in the original set. Skip this code point and continue. 
-        // cpLength<0 
-        pos+=cpLength; 
-    } while(pos!=0); 
-    return 0;  // Reached the start of the string. 
-} 
- 
-int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 
-    int32_t pos=0, rest=length; 
-    int32_t i, stringsLength=strings.size(); 
-    uint8_t *spanUTF8Lengths=spanLengths; 
-    if(all) { 
-        spanUTF8Lengths+=2*stringsLength; 
-    } 
-    do { 
-        // Span until we find a code point from the set, 
-        // or a code point that starts or ends some string. 
-        i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 
-        if(i==rest) { 
-            return length;  // Reached the end of the string. 
-        } 
-        pos+=i; 
-        rest-=i; 
- 
-        // Check whether the current code point is in the original set, 
-        // without the string starts and ends. 
-        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 
-        if(cpLength>0) { 
-            return pos;  // There is a set element at pos. 
-        } 
- 
-        // Try to match the strings at pos. 
-        const uint8_t *s8=utf8; 
-        int32_t length8; 
-        for(i=0; i<stringsLength; ++i) { 
-            length8=utf8Lengths[i]; 
-            // ALL_CP_CONTAINED: Irrelevant string. 
-            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 
-                return pos;  // There is a set element at pos. 
-            } 
-            s8+=length8; 
-        } 
- 
-        // The span(while not contained) ended on a string start/end which is 
-        // not in the original set. Skip this code point and continue. 
-        // cpLength<0 
-        pos-=cpLength; 
-        rest+=cpLength; 
-    } while(rest!=0); 
-    return length;  // Reached the end of the string. 
-} 
- 
-int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 
-    int32_t pos=length; 
-    int32_t i, stringsLength=strings.size(); 
-    uint8_t *spanBackUTF8Lengths=spanLengths; 
-    if(all) { 
-        spanBackUTF8Lengths+=3*stringsLength; 
-    } 
-    do { 
-        // Span until we find a code point from the set, 
-        // or a code point that starts or ends some string. 
-        pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 
-        if(pos==0) { 
-            return 0;  // Reached the start of the string. 
-        } 
- 
-        // Check whether the current code point is in the original set, 
-        // without the string starts and ends. 
-        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 
-        if(cpLength>0) { 
-            return pos;  // There is a set element at pos. 
-        } 
- 
-        // Try to match the strings at pos. 
-        const uint8_t *s8=utf8; 
-        int32_t length8; 
-        for(i=0; i<stringsLength; ++i) { 
-            length8=utf8Lengths[i]; 
-            // ALL_CP_CONTAINED: Irrelevant string. 
-            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 
-                return pos;  // There is a set element at pos. 
-            } 
-            s8+=length8; 
-        } 
- 
-        // The span(while not contained) ended on a string start/end which is 
-        // not in the original set. Skip this code point and continue. 
-        // cpLength<0 
-        pos+=cpLength; 
-    } while(pos!=0); 
-    return 0;  // Reached the start of the string. 
-} 
- 
-U_NAMESPACE_END 
+                        maxInc=inc;  // Longest match from earliest start.
+                        maxOverlap=overlap;
+                        break;
+                    }
+                    --overlap;
+                    ++inc;
+                }
+                s8+=length8;
+            }
+
+            if(maxInc!=0 || maxOverlap!=0) {
+                // Longest-match algorithm, and there was a string match.
+                // Simply continue after it.
+                pos+=maxInc;
+                rest-=maxInc;
+                if(rest==0) {
+                    return length;  // Reached the end of the string.
+                }
+                spanLength=0;  // Match strings from after a string match.
+                continue;
+            }
+        }
+        // Finished trying to match all strings at pos.
+
+        if(spanLength!=0 || pos==0) {
+            // The position is after an unlimited code point span (spanLength!=0),
+            // not after a string match.
+            // The only position where spanLength==0 after a span is pos==0.
+            // Otherwise, an unlimited code point span is only tried again when no
+            // strings match, and if such a non-initial span fails we stop.
+            if(offsets.isEmpty()) {
+                return pos;  // No strings matched after a span.
+            }
+            // Match strings from after the next string match.
+        } else {
+            // The position is after a string match (or a single code point).
+            if(offsets.isEmpty()) {
+                // No more strings matched after a previous string match.
+                // Try another code point span from after the last string match.
+                spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
+                if( spanLength==rest || // Reached the end of the string, or
+                    spanLength==0       // neither strings nor span progressed.
+                ) {
+                    return pos+spanLength;
+                }
+                pos+=spanLength;
+                rest-=spanLength;
+                continue;  // spanLength>0: Match strings from after a span.
+            } else {
+                // Try to match only one code point from after a string match if some
+                // string matched beyond it, so that we try all possible positions
+                // and don't overshoot.
+                spanLength=spanOneUTF8(spanSet, s+pos, rest);
+                if(spanLength>0) {
+                    if(spanLength==rest) {
+                        return length;  // Reached the end of the string.
+                    }
+                    // Match strings after this code point.
+                    // There cannot be any increments below it because UnicodeSet strings
+                    // contain multiple code points.
+                    pos+=spanLength;
+                    rest-=spanLength;
+                    offsets.shift(spanLength);
+                    spanLength=0;
+                    continue;  // Match strings from after a single code point.
+                }
+                // Match strings from after the next string match.
+            }
+        }
+        int32_t minOffset=offsets.popMinimum();
+        pos+=minOffset;
+        rest-=minOffset;
+        spanLength=0;  // Match strings from after a string match.
+    }
+}
+
+int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
+    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+        return spanNotBackUTF8(s, length);
+    }
+    int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
+    if(pos==0) {
+        return 0;
+    }
+    int32_t spanLength=length-pos;
+
+    // Consider strings; they may overlap with the span.
+    OffsetList offsets;
+    if(spanCondition==USET_SPAN_CONTAINED) {
+        // Use offset list to try all possibilities.
+        offsets.setMaxLength(maxLength8);
+    }
+    int32_t i, stringsLength=strings.size();
+    uint8_t *spanBackUTF8Lengths=spanLengths;
+    if(all) {
+        spanBackUTF8Lengths+=3*stringsLength;
+    }
+    for(;;) {
+        const uint8_t *s8=utf8;
+        int32_t length8;
+        if(spanCondition==USET_SPAN_CONTAINED) {
+            for(i=0; i<stringsLength; ++i) {
+                length8=utf8Lengths[i];
+                if(length8==0) {
+                    continue;  // String not representable in UTF-8.
+                }
+                int32_t overlap=spanBackUTF8Lengths[i];
+                if(overlap==ALL_CP_CONTAINED) {
+                    s8+=length8;
+                    continue;  // Irrelevant string.
+                }
+
+                // Try to match this string at pos-(length8-overlap)..pos-length8.
+                if(overlap>=LONG_SPAN) {
+                    overlap=length8;
+                    // While contained: No point matching fully inside the code point span.
+                    int32_t len1=0;
+                    U8_FWD_1(s8, len1, overlap);
+                    overlap-=len1;  // Length of the string minus the first code point.
+                }
+                if(overlap>spanLength) {
+                    overlap=spanLength;
+                }
+                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
+                for(;;) {
+                    if(dec>pos) {
+                        break;
+                    }
+                    // Try to match if the decrement is not listed already.
+                    // Match at code point boundaries. (The UTF-8 strings were converted
+                    // from UTF-16 and are guaranteed to be well-formed.)
+                    if( !U8_IS_TRAIL(s[pos-dec]) &&
+                        !offsets.containsOffset(dec) &&
+                        matches8(s+pos-dec, s8, length8)
+                    ) {
+                        if(dec==pos) {
+                            return 0;  // Reached the start of the string.
+                        }
+                        offsets.addOffset(dec);
+                    }
+                    if(overlap==0) {
+                        break;
+                    }
+                    --overlap;
+                    ++dec;
+                }
+                s8+=length8;
+            }
+        } else /* USET_SPAN_SIMPLE */ {
+            int32_t maxDec=0, maxOverlap=0;
+            for(i=0; i<stringsLength; ++i) {
+                length8=utf8Lengths[i];
+                if(length8==0) {
+                    continue;  // String not representable in UTF-8.
+                }
+                int32_t overlap=spanBackUTF8Lengths[i];
+                // For longest match, we do need to try to match even an all-contained string
+                // to find the match from the latest end.
+
+                // Try to match this string at pos-(length8-overlap)..pos-length8.
+                if(overlap>=LONG_SPAN) {
+                    overlap=length8;
+                    // Longest match: Need to match fully inside the code point span
+                    // to find the match from the latest end.
+                }
+                if(overlap>spanLength) {
+                    overlap=spanLength;
+                }
+                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
+                for(;;) {
+                    if(dec>pos || overlap<maxOverlap) {
+                        break;
+                    }
+                    // Try to match if the string is longer or ends later.
+                    // Match at code point boundaries. (The UTF-8 strings were converted
+                    // from UTF-16 and are guaranteed to be well-formed.)
+                    if( !U8_IS_TRAIL(s[pos-dec]) &&
+                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
+                        matches8(s+pos-dec, s8, length8)
+                    ) {
+                        maxDec=dec;  // Longest match from latest end.
+                        maxOverlap=overlap;
+                        break;
+                    }
+                    --overlap;
+                    ++dec;
+                }
+                s8+=length8;
+            }
+
+            if(maxDec!=0 || maxOverlap!=0) {
+                // Longest-match algorithm, and there was a string match.
+                // Simply continue before it.
+                pos-=maxDec;
+                if(pos==0) {
+                    return 0;  // Reached the start of the string.
+                }
+                spanLength=0;  // Match strings from before a string match.
+                continue;
+            }
+        }
+        // Finished trying to match all strings at pos.
+
+        if(spanLength!=0 || pos==length) {
+            // The position is before an unlimited code point span (spanLength!=0),
+            // not before a string match.
+            // The only position where spanLength==0 before a span is pos==length.
+            // Otherwise, an unlimited code point span is only tried again when no
+            // strings match, and if such a non-initial span fails we stop.
+            if(offsets.isEmpty()) {
+                return pos;  // No strings matched before a span.
+            }
+            // Match strings from before the next string match.
+        } else {
+            // The position is before a string match (or a single code point).
+            if(offsets.isEmpty()) {
+                // No more strings matched before a previous string match.
+                // Try another code point span from before the last string match.
+                int32_t oldPos=pos;
+                pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
+                spanLength=oldPos-pos;
+                if( pos==0 ||           // Reached the start of the string, or
+                    spanLength==0       // neither strings nor span progressed.
+                ) {
+                    return pos;
+                }
+                continue;  // spanLength>0: Match strings from before a span.
+            } else {
+                // Try to match only one code point from before a string match if some
+                // string matched beyond it, so that we try all possible positions
+                // and don't overshoot.
+                spanLength=spanOneBackUTF8(spanSet, s, pos);
+                if(spanLength>0) {
+                    if(spanLength==pos) {
+                        return 0;  // Reached the start of the string.
+                    }
+                    // Match strings before this code point.
+                    // There cannot be any decrements below it because UnicodeSet strings
+                    // contain multiple code points.
+                    pos-=spanLength;
+                    offsets.shift(spanLength);
+                    spanLength=0;
+                    continue;  // Match strings from before a single code point.
+                }
+                // Match strings from before the next string match.
+            }
+        }
+        pos-=offsets.popMinimum();
+        spanLength=0;  // Match strings from before a string match.
+    }
+}
+
+/*
+ * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
+ *
+ * Theoretical algorithm:
+ * - Iterate through the string, and at each code point boundary:
+ *   + If the code point there is in the set, then return with the current position.
+ *   + If a set string matches at the current position, then return with the current position.
+ *
+ * Optimized implementation:
+ *
+ * (Same assumption as for span() above.)
+ *
+ * Create and cache a spanNotSet which contains all of the single code points
+ * of the original set but none of its strings.
+ * For each set string add its initial code point to the spanNotSet.
+ * (Also add its final code point for spanNotBack().)
+ *
+ * - Loop:
+ *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
+ *   + If the current code point is in the original set, then
+ *     return the current position.
+ *   + If any set string matches at the current position, then
+ *     return the current position.
+ *   + If there is no match at the current position, neither for the code point there
+ *     nor for any set string, then skip this code point and continue the loop.
+ *     This happens for set-string-initial code points that were added to spanNotSet
+ *     when there is not actually a match for such a set string.
+ */
+
+int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
+    int32_t pos=0, rest=length;
+    int32_t i, stringsLength=strings.size();
+    do {
+        // Span until we find a code point from the set,
+        // or a code point that starts or ends some string.
+        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
+        if(i==rest) {
+            return length;  // Reached the end of the string.
+        }
+        pos+=i;
+        rest-=i;
+
+        // Check whether the current code point is in the original set,
+        // without the string starts and ends.
+        int32_t cpLength=spanOne(spanSet, s+pos, rest);
+        if(cpLength>0) {
+            return pos;  // There is a set element at pos.
+        }
+
+        // Try to match the strings at pos.
+        for(i=0; i<stringsLength; ++i) {
+            if(spanLengths[i]==ALL_CP_CONTAINED) {
+                continue;  // Irrelevant string.
+            }
+            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+            const UChar *s16=string.getBuffer();
+            int32_t length16=string.length();
+            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
+                return pos;  // There is a set element at pos.
+            }
+        }
+
+        // The span(while not contained) ended on a string start/end which is
+        // not in the original set. Skip this code point and continue.
+        // cpLength<0
+        pos-=cpLength;
+        rest+=cpLength;
+    } while(rest!=0);
+    return length;  // Reached the end of the string.
+}
+
+int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
+    int32_t pos=length;
+    int32_t i, stringsLength=strings.size();
+    do {
+        // Span until we find a code point from the set,
+        // or a code point that starts or ends some string.
+        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
+        if(pos==0) {
+            return 0;  // Reached the start of the string.
+        }
+
+        // Check whether the current code point is in the original set,
+        // without the string starts and ends.
+        int32_t cpLength=spanOneBack(spanSet, s, pos);
+        if(cpLength>0) {
+            return pos;  // There is a set element at pos.
+        }
+
+        // Try to match the strings at pos.
+        for(i=0; i<stringsLength; ++i) {
+            // Use spanLengths rather than a spanBackLengths pointer because
+            // it is easier and we only need to know whether the string is irrelevant
+            // which is the same in either array.
+            if(spanLengths[i]==ALL_CP_CONTAINED) {
+                continue;  // Irrelevant string.
+            }
+            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
+            const UChar *s16=string.getBuffer();
+            int32_t length16=string.length();
+            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
+                return pos;  // There is a set element at pos.
+            }
+        }
+
+        // The span(while not contained) ended on a string start/end which is
+        // not in the original set. Skip this code point and continue.
+        // cpLength<0
+        pos+=cpLength;
+    } while(pos!=0);
+    return 0;  // Reached the start of the string.
+}
+
+int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
+    int32_t pos=0, rest=length;
+    int32_t i, stringsLength=strings.size();
+    uint8_t *spanUTF8Lengths=spanLengths;
+    if(all) {
+        spanUTF8Lengths+=2*stringsLength;
+    }
+    do {
+        // Span until we find a code point from the set,
+        // or a code point that starts or ends some string.
+        i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
+        if(i==rest) {
+            return length;  // Reached the end of the string.
+        }
+        pos+=i;
+        rest-=i;
+
+        // Check whether the current code point is in the original set,
+        // without the string starts and ends.
+        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
+        if(cpLength>0) {
+            return pos;  // There is a set element at pos.
+        }
+
+        // Try to match the strings at pos.
+        const uint8_t *s8=utf8;
+        int32_t length8;
+        for(i=0; i<stringsLength; ++i) {
+            length8=utf8Lengths[i];
+            // ALL_CP_CONTAINED: Irrelevant string.
+            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
+                return pos;  // There is a set element at pos.
+            }
+            s8+=length8;
+        }
+
+        // The span(while not contained) ended on a string start/end which is
+        // not in the original set. Skip this code point and continue.
+        // cpLength<0
+        pos-=cpLength;
+        rest+=cpLength;
+    } while(rest!=0);
+    return length;  // Reached the end of the string.
+}
+
+int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
+    int32_t pos=length;
+    int32_t i, stringsLength=strings.size();
+    uint8_t *spanBackUTF8Lengths=spanLengths;
+    if(all) {
+        spanBackUTF8Lengths+=3*stringsLength;
+    }
+    do {
+        // Span until we find a code point from the set,
+        // or a code point that starts or ends some string.
+        pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
+        if(pos==0) {
+            return 0;  // Reached the start of the string.
+        }
+
+        // Check whether the current code point is in the original set,
+        // without the string starts and ends.
+        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
+        if(cpLength>0) {
+            return pos;  // There is a set element at pos.
+        }
+
+        // Try to match the strings at pos.
+        const uint8_t *s8=utf8;
+        int32_t length8;
+        for(i=0; i<stringsLength; ++i) {
+            length8=utf8Lengths[i];
+            // ALL_CP_CONTAINED: Irrelevant string.
+            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
+                return pos;  // There is a set element at pos.
+            }
+            s8+=length8;
+        }
+
+        // The span(while not contained) ended on a string start/end which is
+        // not in the original set. Skip this code point and continue.
+        // cpLength<0
+        pos+=cpLength;
+    } while(pos!=0);
+    return 0;  // Reached the start of the string.
+}
+
+U_NAMESPACE_END
author	neksard <[email protected]>	2022-02-10 16:45:33 +0300
committer	Daniil Cherednik <[email protected]>	2022-02-10 16:45:33 +0300
commit	1d9c550e7c38e051d7961f576013a482003a70d9 (patch)
tree	b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/common/unisetspan.cpp
parent	8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff)