diff options
author | neksard <[email protected]> | 2022-02-10 16:45:33 +0300 |
---|---|---|
committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:45:33 +0300 |
commit | 1d9c550e7c38e051d7961f576013a482003a70d9 (patch) | |
tree | b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/common/unisetspan.cpp | |
parent | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff) |
Restoring authorship annotation for <[email protected]>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/common/unisetspan.cpp')
-rw-r--r-- | contrib/libs/icu/common/unisetspan.cpp | 2992 |
1 files changed, 1496 insertions, 1496 deletions
diff --git a/contrib/libs/icu/common/unisetspan.cpp b/contrib/libs/icu/common/unisetspan.cpp index 1490a435a1a..68e44d91ee7 100644 --- a/contrib/libs/icu/common/unisetspan.cpp +++ b/contrib/libs/icu/common/unisetspan.cpp @@ -1,1509 +1,1509 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 2007-2012, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: unisetspan.cpp +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 2007-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: unisetspan.cpp * encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2007mar01 -* created by: Markus W. Scherer -*/ - -#include "unicode/utypes.h" -#include "unicode/uniset.h" -#include "unicode/ustring.h" -#include "unicode/utf8.h" -#include "unicode/utf16.h" -#include "cmemory.h" -#include "uvector.h" -#include "unisetspan.h" - -U_NAMESPACE_BEGIN - -/* - * List of offsets from the current position from where to try matching - * a code point or a string. - * Store offsets rather than indexes to simplify the code and use the same list - * for both increments (in span()) and decrements (in spanBack()). - * - * Assumption: The maximum offset is limited, and the offsets that are stored - * at any one time are relatively dense, that is, there are normally no gaps of - * hundreds or thousands of offset values. - * - * The implementation uses a circular buffer of byte flags, - * each indicating whether the corresponding offset is in the list. - * This avoids inserting into a sorted list of offsets (or absolute indexes) and - * physically moving part of the list. - * - * Note: In principle, the caller should setMaxLength() to the maximum of the - * max string length and U16_LENGTH/U8_LENGTH to account for - * "long" single code points. - * However, this implementation uses at least a staticList with more than - * U8_LENGTH entries anyway. - * - * Note: If maxLength were guaranteed to be no more than 32 or 64, - * the list could be stored as bit flags in a single integer. - * Rather than handling a circular buffer with a start list index, - * the integer would simply be shifted when lower offsets are removed. - * UnicodeSet does not have a limit on the lengths of strings. - */ -class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. -public: - OffsetList() : list(staticList), capacity(0), length(0), start(0) {} - - ~OffsetList() { - if(list!=staticList) { - uprv_free(list); - } - } - - // Call exactly once if the list is to be used. - void setMaxLength(int32_t maxLength) { - if(maxLength<=(int32_t)sizeof(staticList)) { - capacity=(int32_t)sizeof(staticList); - } else { - UBool *l=(UBool *)uprv_malloc(maxLength); - if(l!=NULL) { - list=l; - capacity=maxLength; - } - } - uprv_memset(list, 0, capacity); - } - - void clear() { - uprv_memset(list, 0, capacity); - start=length=0; - } - - UBool isEmpty() const { - return (UBool)(length==0); - } - - // Reduce all stored offsets by delta, used when the current position - // moves by delta. - // There must not be any offsets lower than delta. - // If there is an offset equal to delta, it is removed. - // delta=[1..maxLength] - void shift(int32_t delta) { - int32_t i=start+delta; - if(i>=capacity) { - i-=capacity; - } - if(list[i]) { - list[i]=FALSE; - --length; - } - start=i; - } - - // Add an offset. The list must not contain it yet. - // offset=[1..maxLength] - void addOffset(int32_t offset) { - int32_t i=start+offset; - if(i>=capacity) { - i-=capacity; - } - list[i]=TRUE; - ++length; - } - - // offset=[1..maxLength] - UBool containsOffset(int32_t offset) const { - int32_t i=start+offset; - if(i>=capacity) { - i-=capacity; - } - return list[i]; - } - - // Find the lowest stored offset from a non-empty list, remove it, - // and reduce all other offsets by this minimum. - // Returns [1..maxLength]. - int32_t popMinimum() { - // Look for the next offset in list[start+1..capacity-1]. - int32_t i=start, result; - while(++i<capacity) { - if(list[i]) { - list[i]=FALSE; - --length; - result=i-start; - start=i; - return result; - } - } - // i==capacity - - // Wrap around and look for the next offset in list[0..start]. - // Since the list is not empty, there will be one. - result=capacity-start; - i=0; - while(!list[i]) { - ++i; - } - list[i]=FALSE; - --length; - start=i; - return result+=i; - } - -private: - UBool *list; - int32_t capacity; - int32_t length; - int32_t start; - - UBool staticList[16]; -}; - -// Get the number of UTF-8 bytes for a UTF-16 (sub)string. -static int32_t -getUTF8Length(const UChar *s, int32_t length) { - UErrorCode errorCode=U_ZERO_ERROR; - int32_t length8=0; - u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); - if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { - return length8; - } else { - // The string contains an unpaired surrogate. - // Ignore this string. - return 0; - } -} - -// Append the UTF-8 version of the string to t and return the appended UTF-8 length. -static int32_t -appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { - UErrorCode errorCode=U_ZERO_ERROR; - int32_t length8=0; - u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); - if(U_SUCCESS(errorCode)) { - return length8; - } else { - // The string contains an unpaired surrogate. - // Ignore this string. - return 0; - } -} - -static inline uint8_t -makeSpanLengthByte(int32_t spanLength) { - // 0xfe==UnicodeSetStringSpan::LONG_SPAN - return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; -} - -// Construct for all variants of span(), or only for any one variant. -// Initialize as little as possible, for single use. -UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, - const UVector &setStrings, - uint32_t which) - : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), - utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), - utf8Length(0), - maxLength16(0), maxLength8(0), - all((UBool)(which==ALL)) { - spanSet.retainAll(set); - if(which&NOT_CONTAINED) { - // Default to the same sets. - // addToSpanNotSet() will create a separate set if necessary. - pSpanNotSet=&spanSet; - } - - // Determine if the strings even need to be taken into account at all for span() etc. - // If any string is relevant, then all strings need to be used for - // span(longest match) but only the relevant ones for span(while contained). - // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH - // and do not store UTF-8 strings if !thisRelevant and CONTAINED. - // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) - // Also count the lengths of the UTF-8 versions of the strings for memory allocation. - int32_t stringsLength=strings.size(); - - int32_t i, spanLength; - UBool someRelevant=FALSE; - for(i=0; i<stringsLength; ++i) { - const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); - const UChar *s16=string.getBuffer(); - int32_t length16=string.length(); - UBool thisRelevant; - spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); - if(spanLength<length16) { // Relevant string. - someRelevant=thisRelevant=TRUE; - } else { - thisRelevant=FALSE; - } - if((which&UTF16) && length16>maxLength16) { - maxLength16=length16; - } - if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { - int32_t length8=getUTF8Length(s16, length16); - utf8Length+=length8; - if(length8>maxLength8) { - maxLength8=length8; - } - } - } - if(!someRelevant) { - maxLength16=maxLength8=0; - return; - } - - // Freeze after checking for the need to use strings at all because freezing - // a set takes some time and memory which are wasted if there are no relevant strings. - if(all) { - spanSet.freeze(); - } - - uint8_t *spanBackLengths; - uint8_t *spanUTF8Lengths; - uint8_t *spanBackUTF8Lengths; - - // Allocate a block of meta data. - int32_t allocSize; - if(all) { - // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. - allocSize=stringsLength*(4+1+1+1+1)+utf8Length; - } else { - allocSize=stringsLength; // One set of span lengths. - if(which&UTF8) { - // UTF-8 lengths and UTF-8 strings. - allocSize+=stringsLength*4+utf8Length; - } - } - if(allocSize<=(int32_t)sizeof(staticLengths)) { - utf8Lengths=staticLengths; - } else { - utf8Lengths=(int32_t *)uprv_malloc(allocSize); - if(utf8Lengths==NULL) { - maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. - return; // Out of memory. - } - } - - if(all) { - // Store span lengths for all span() variants. - spanLengths=(uint8_t *)(utf8Lengths+stringsLength); - spanBackLengths=spanLengths+stringsLength; - spanUTF8Lengths=spanBackLengths+stringsLength; - spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; - utf8=spanBackUTF8Lengths+stringsLength; - } else { - // Store span lengths for only one span() variant. - if(which&UTF8) { - spanLengths=(uint8_t *)(utf8Lengths+stringsLength); - utf8=spanLengths+stringsLength; - } else { - spanLengths=(uint8_t *)utf8Lengths; - } - spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; - } - - // Set the meta data and pSpanNotSet and write the UTF-8 strings. - int32_t utf8Count=0; // Count UTF-8 bytes written so far. - - for(i=0; i<stringsLength; ++i) { - const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); - const UChar *s16=string.getBuffer(); - int32_t length16=string.length(); - spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); - if(spanLength<length16) { // Relevant string. - if(which&UTF16) { - if(which&CONTAINED) { - if(which&FWD) { - spanLengths[i]=makeSpanLengthByte(spanLength); - } - if(which&BACK) { - spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); - spanBackLengths[i]=makeSpanLengthByte(spanLength); - } - } else /* not CONTAINED, not all, but NOT_CONTAINED */ { - spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. - } - } - if(which&UTF8) { - uint8_t *s8=utf8+utf8Count; - int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); - utf8Count+=utf8Lengths[i]=length8; - if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. - spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; - } else { // Relevant for UTF-8. - if(which&CONTAINED) { - if(which&FWD) { - spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); - spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); - } - if(which&BACK) { - spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); - spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); - } - } else /* not CONTAINED, not all, but NOT_CONTAINED */ { - spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. - } - } - } - if(which&NOT_CONTAINED) { - // Add string start and end code points to the spanNotSet so that - // a span(while not contained) stops before any string. - UChar32 c; - if(which&FWD) { - int32_t len=0; - U16_NEXT(s16, len, length16, c); - addToSpanNotSet(c); - } - if(which&BACK) { - int32_t len=length16; - U16_PREV(s16, 0, len, c); - addToSpanNotSet(c); - } - } - } else { // Irrelevant string. - if(which&UTF8) { - if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. - uint8_t *s8=utf8+utf8Count; - int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); - utf8Count+=utf8Lengths[i]=length8; - } else { - utf8Lengths[i]=0; - } - } - if(all) { - spanLengths[i]=spanBackLengths[i]= - spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= - (uint8_t)ALL_CP_CONTAINED; - } else { - // All spanXYZLengths pointers contain the same address. - spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; - } - } - } - - // Finish. - if(all) { - pSpanNotSet->freeze(); - } -} - -// Copy constructor. Assumes which==ALL for a frozen set. -UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, - const UVector &newParentSetStrings) - : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), - utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), - utf8Length(otherStringSpan.utf8Length), - maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), - all(TRUE) { - if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { - pSpanNotSet=&spanSet; - } else { +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2007mar01 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" +#include "unicode/uniset.h" +#include "unicode/ustring.h" +#include "unicode/utf8.h" +#include "unicode/utf16.h" +#include "cmemory.h" +#include "uvector.h" +#include "unisetspan.h" + +U_NAMESPACE_BEGIN + +/* + * List of offsets from the current position from where to try matching + * a code point or a string. + * Store offsets rather than indexes to simplify the code and use the same list + * for both increments (in span()) and decrements (in spanBack()). + * + * Assumption: The maximum offset is limited, and the offsets that are stored + * at any one time are relatively dense, that is, there are normally no gaps of + * hundreds or thousands of offset values. + * + * The implementation uses a circular buffer of byte flags, + * each indicating whether the corresponding offset is in the list. + * This avoids inserting into a sorted list of offsets (or absolute indexes) and + * physically moving part of the list. + * + * Note: In principle, the caller should setMaxLength() to the maximum of the + * max string length and U16_LENGTH/U8_LENGTH to account for + * "long" single code points. + * However, this implementation uses at least a staticList with more than + * U8_LENGTH entries anyway. + * + * Note: If maxLength were guaranteed to be no more than 32 or 64, + * the list could be stored as bit flags in a single integer. + * Rather than handling a circular buffer with a start list index, + * the integer would simply be shifted when lower offsets are removed. + * UnicodeSet does not have a limit on the lengths of strings. + */ +class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. +public: + OffsetList() : list(staticList), capacity(0), length(0), start(0) {} + + ~OffsetList() { + if(list!=staticList) { + uprv_free(list); + } + } + + // Call exactly once if the list is to be used. + void setMaxLength(int32_t maxLength) { + if(maxLength<=(int32_t)sizeof(staticList)) { + capacity=(int32_t)sizeof(staticList); + } else { + UBool *l=(UBool *)uprv_malloc(maxLength); + if(l!=NULL) { + list=l; + capacity=maxLength; + } + } + uprv_memset(list, 0, capacity); + } + + void clear() { + uprv_memset(list, 0, capacity); + start=length=0; + } + + UBool isEmpty() const { + return (UBool)(length==0); + } + + // Reduce all stored offsets by delta, used when the current position + // moves by delta. + // There must not be any offsets lower than delta. + // If there is an offset equal to delta, it is removed. + // delta=[1..maxLength] + void shift(int32_t delta) { + int32_t i=start+delta; + if(i>=capacity) { + i-=capacity; + } + if(list[i]) { + list[i]=FALSE; + --length; + } + start=i; + } + + // Add an offset. The list must not contain it yet. + // offset=[1..maxLength] + void addOffset(int32_t offset) { + int32_t i=start+offset; + if(i>=capacity) { + i-=capacity; + } + list[i]=TRUE; + ++length; + } + + // offset=[1..maxLength] + UBool containsOffset(int32_t offset) const { + int32_t i=start+offset; + if(i>=capacity) { + i-=capacity; + } + return list[i]; + } + + // Find the lowest stored offset from a non-empty list, remove it, + // and reduce all other offsets by this minimum. + // Returns [1..maxLength]. + int32_t popMinimum() { + // Look for the next offset in list[start+1..capacity-1]. + int32_t i=start, result; + while(++i<capacity) { + if(list[i]) { + list[i]=FALSE; + --length; + result=i-start; + start=i; + return result; + } + } + // i==capacity + + // Wrap around and look for the next offset in list[0..start]. + // Since the list is not empty, there will be one. + result=capacity-start; + i=0; + while(!list[i]) { + ++i; + } + list[i]=FALSE; + --length; + start=i; + return result+=i; + } + +private: + UBool *list; + int32_t capacity; + int32_t length; + int32_t start; + + UBool staticList[16]; +}; + +// Get the number of UTF-8 bytes for a UTF-16 (sub)string. +static int32_t +getUTF8Length(const UChar *s, int32_t length) { + UErrorCode errorCode=U_ZERO_ERROR; + int32_t length8=0; + u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); + if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { + return length8; + } else { + // The string contains an unpaired surrogate. + // Ignore this string. + return 0; + } +} + +// Append the UTF-8 version of the string to t and return the appended UTF-8 length. +static int32_t +appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { + UErrorCode errorCode=U_ZERO_ERROR; + int32_t length8=0; + u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); + if(U_SUCCESS(errorCode)) { + return length8; + } else { + // The string contains an unpaired surrogate. + // Ignore this string. + return 0; + } +} + +static inline uint8_t +makeSpanLengthByte(int32_t spanLength) { + // 0xfe==UnicodeSetStringSpan::LONG_SPAN + return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; +} + +// Construct for all variants of span(), or only for any one variant. +// Initialize as little as possible, for single use. +UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, + const UVector &setStrings, + uint32_t which) + : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), + utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), + utf8Length(0), + maxLength16(0), maxLength8(0), + all((UBool)(which==ALL)) { + spanSet.retainAll(set); + if(which&NOT_CONTAINED) { + // Default to the same sets. + // addToSpanNotSet() will create a separate set if necessary. + pSpanNotSet=&spanSet; + } + + // Determine if the strings even need to be taken into account at all for span() etc. + // If any string is relevant, then all strings need to be used for + // span(longest match) but only the relevant ones for span(while contained). + // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH + // and do not store UTF-8 strings if !thisRelevant and CONTAINED. + // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) + // Also count the lengths of the UTF-8 versions of the strings for memory allocation. + int32_t stringsLength=strings.size(); + + int32_t i, spanLength; + UBool someRelevant=FALSE; + for(i=0; i<stringsLength; ++i) { + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const UChar *s16=string.getBuffer(); + int32_t length16=string.length(); + UBool thisRelevant; + spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); + if(spanLength<length16) { // Relevant string. + someRelevant=thisRelevant=TRUE; + } else { + thisRelevant=FALSE; + } + if((which&UTF16) && length16>maxLength16) { + maxLength16=length16; + } + if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { + int32_t length8=getUTF8Length(s16, length16); + utf8Length+=length8; + if(length8>maxLength8) { + maxLength8=length8; + } + } + } + if(!someRelevant) { + maxLength16=maxLength8=0; + return; + } + + // Freeze after checking for the need to use strings at all because freezing + // a set takes some time and memory which are wasted if there are no relevant strings. + if(all) { + spanSet.freeze(); + } + + uint8_t *spanBackLengths; + uint8_t *spanUTF8Lengths; + uint8_t *spanBackUTF8Lengths; + + // Allocate a block of meta data. + int32_t allocSize; + if(all) { + // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. + allocSize=stringsLength*(4+1+1+1+1)+utf8Length; + } else { + allocSize=stringsLength; // One set of span lengths. + if(which&UTF8) { + // UTF-8 lengths and UTF-8 strings. + allocSize+=stringsLength*4+utf8Length; + } + } + if(allocSize<=(int32_t)sizeof(staticLengths)) { + utf8Lengths=staticLengths; + } else { + utf8Lengths=(int32_t *)uprv_malloc(allocSize); + if(utf8Lengths==NULL) { + maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. + return; // Out of memory. + } + } + + if(all) { + // Store span lengths for all span() variants. + spanLengths=(uint8_t *)(utf8Lengths+stringsLength); + spanBackLengths=spanLengths+stringsLength; + spanUTF8Lengths=spanBackLengths+stringsLength; + spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; + utf8=spanBackUTF8Lengths+stringsLength; + } else { + // Store span lengths for only one span() variant. + if(which&UTF8) { + spanLengths=(uint8_t *)(utf8Lengths+stringsLength); + utf8=spanLengths+stringsLength; + } else { + spanLengths=(uint8_t *)utf8Lengths; + } + spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; + } + + // Set the meta data and pSpanNotSet and write the UTF-8 strings. + int32_t utf8Count=0; // Count UTF-8 bytes written so far. + + for(i=0; i<stringsLength; ++i) { + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const UChar *s16=string.getBuffer(); + int32_t length16=string.length(); + spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); + if(spanLength<length16) { // Relevant string. + if(which&UTF16) { + if(which&CONTAINED) { + if(which&FWD) { + spanLengths[i]=makeSpanLengthByte(spanLength); + } + if(which&BACK) { + spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); + spanBackLengths[i]=makeSpanLengthByte(spanLength); + } + } else /* not CONTAINED, not all, but NOT_CONTAINED */ { + spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. + } + } + if(which&UTF8) { + uint8_t *s8=utf8+utf8Count; + int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); + utf8Count+=utf8Lengths[i]=length8; + if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. + spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; + } else { // Relevant for UTF-8. + if(which&CONTAINED) { + if(which&FWD) { + spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); + spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); + } + if(which&BACK) { + spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); + spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); + } + } else /* not CONTAINED, not all, but NOT_CONTAINED */ { + spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. + } + } + } + if(which&NOT_CONTAINED) { + // Add string start and end code points to the spanNotSet so that + // a span(while not contained) stops before any string. + UChar32 c; + if(which&FWD) { + int32_t len=0; + U16_NEXT(s16, len, length16, c); + addToSpanNotSet(c); + } + if(which&BACK) { + int32_t len=length16; + U16_PREV(s16, 0, len, c); + addToSpanNotSet(c); + } + } + } else { // Irrelevant string. + if(which&UTF8) { + if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. + uint8_t *s8=utf8+utf8Count; + int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); + utf8Count+=utf8Lengths[i]=length8; + } else { + utf8Lengths[i]=0; + } + } + if(all) { + spanLengths[i]=spanBackLengths[i]= + spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= + (uint8_t)ALL_CP_CONTAINED; + } else { + // All spanXYZLengths pointers contain the same address. + spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; + } + } + } + + // Finish. + if(all) { + pSpanNotSet->freeze(); + } +} + +// Copy constructor. Assumes which==ALL for a frozen set. +UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, + const UVector &newParentSetStrings) + : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), + utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), + utf8Length(otherStringSpan.utf8Length), + maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), + all(TRUE) { + if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { + pSpanNotSet=&spanSet; + } else { pSpanNotSet=otherStringSpan.pSpanNotSet->clone(); - } - - // Allocate a block of meta data. - // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. - int32_t stringsLength=strings.size(); - int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; - if(allocSize<=(int32_t)sizeof(staticLengths)) { - utf8Lengths=staticLengths; - } else { - utf8Lengths=(int32_t *)uprv_malloc(allocSize); - if(utf8Lengths==NULL) { - maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. - return; // Out of memory. - } - } - - spanLengths=(uint8_t *)(utf8Lengths+stringsLength); - utf8=spanLengths+stringsLength*4; - uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); -} - -UnicodeSetStringSpan::~UnicodeSetStringSpan() { - if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { - delete pSpanNotSet; - } - if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { - uprv_free(utf8Lengths); - } -} - -void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { - if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { - if(spanSet.contains(c)) { - return; // Nothing to do. - } + } + + // Allocate a block of meta data. + // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. + int32_t stringsLength=strings.size(); + int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; + if(allocSize<=(int32_t)sizeof(staticLengths)) { + utf8Lengths=staticLengths; + } else { + utf8Lengths=(int32_t *)uprv_malloc(allocSize); + if(utf8Lengths==NULL) { + maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. + return; // Out of memory. + } + } + + spanLengths=(uint8_t *)(utf8Lengths+stringsLength); + utf8=spanLengths+stringsLength*4; + uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); +} + +UnicodeSetStringSpan::~UnicodeSetStringSpan() { + if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { + delete pSpanNotSet; + } + if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { + uprv_free(utf8Lengths); + } +} + +void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { + if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { + if(spanSet.contains(c)) { + return; // Nothing to do. + } UnicodeSet *newSet=spanSet.cloneAsThawed(); - if(newSet==NULL) { - return; // Out of memory. - } else { - pSpanNotSet=newSet; - } - } - pSpanNotSet->add(c); -} - -// Compare strings without any argument checks. Requires length>0. -static inline UBool -matches16(const UChar *s, const UChar *t, int32_t length) { - do { - if(*s++!=*t++) { - return FALSE; - } - } while(--length>0); - return TRUE; -} - -static inline UBool -matches8(const uint8_t *s, const uint8_t *t, int32_t length) { - do { - if(*s++!=*t++) { - return FALSE; - } - } while(--length>0); - return TRUE; -} - -// Compare 16-bit Unicode strings (which may be malformed UTF-16) -// at code point boundaries. -// That is, each edge of a match must not be in the middle of a surrogate pair. -static inline UBool -matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { - s+=start; - limit-=start; - return matches16(s, t, length) && - !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && - !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); -} - -// Does the set contain the next code point? -// If so, return its length; otherwise return its negative length. -static inline int32_t -spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { - UChar c=*s, c2; - if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { - return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; - } - return set.contains(c) ? 1 : -1; -} - -static inline int32_t -spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { - UChar c=s[length-1], c2; - if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { - return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; - } - return set.contains(c) ? 1 : -1; -} - -static inline int32_t -spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { - UChar32 c=*s; + if(newSet==NULL) { + return; // Out of memory. + } else { + pSpanNotSet=newSet; + } + } + pSpanNotSet->add(c); +} + +// Compare strings without any argument checks. Requires length>0. +static inline UBool +matches16(const UChar *s, const UChar *t, int32_t length) { + do { + if(*s++!=*t++) { + return FALSE; + } + } while(--length>0); + return TRUE; +} + +static inline UBool +matches8(const uint8_t *s, const uint8_t *t, int32_t length) { + do { + if(*s++!=*t++) { + return FALSE; + } + } while(--length>0); + return TRUE; +} + +// Compare 16-bit Unicode strings (which may be malformed UTF-16) +// at code point boundaries. +// That is, each edge of a match must not be in the middle of a surrogate pair. +static inline UBool +matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { + s+=start; + limit-=start; + return matches16(s, t, length) && + !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && + !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); +} + +// Does the set contain the next code point? +// If so, return its length; otherwise return its negative length. +static inline int32_t +spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { + UChar c=*s, c2; + if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { + return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; + } + return set.contains(c) ? 1 : -1; +} + +static inline int32_t +spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { + UChar c=s[length-1], c2; + if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { + return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; + } + return set.contains(c) ? 1 : -1; +} + +static inline int32_t +spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { + UChar32 c=*s; if(U8_IS_SINGLE(c)) { - return set.contains(c) ? 1 : -1; - } - // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). - int32_t i=0; - U8_NEXT_OR_FFFD(s, i, length, c); - return set.contains(c) ? i : -i; -} - -static inline int32_t -spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { - UChar32 c=s[length-1]; + return set.contains(c) ? 1 : -1; + } + // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). + int32_t i=0; + U8_NEXT_OR_FFFD(s, i, length, c); + return set.contains(c) ? i : -i; +} + +static inline int32_t +spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { + UChar32 c=s[length-1]; if(U8_IS_SINGLE(c)) { - return set.contains(c) ? 1 : -1; - } - int32_t i=length-1; - c=utf8_prevCharSafeBody(s, 0, &i, c, -3); - length-=i; - return set.contains(c) ? length : -length; -} - -/* - * Note: In span() when spanLength==0 (after a string match, or at the beginning - * after an empty code point span) and in spanNot() and spanNotUTF8(), - * string matching could use a binary search - * because all string matches are done from the same start index. - * - * For UTF-8, this would require a comparison function that returns UTF-16 order. - * - * This optimization should not be necessary for normal UnicodeSets because - * most sets have no strings, and most sets with strings have - * very few very short strings. - * For cases with many strings, it might be better to use a different API - * and implementation with a DFA (state machine). - */ - -/* - * Algorithm for span(USET_SPAN_CONTAINED) - * - * Theoretical algorithm: - * - Iterate through the string, and at each code point boundary: - * + If the code point there is in the set, then remember to continue after it. - * + If a set string matches at the current position, then remember to continue after it. - * + Either recursively span for each code point or string match, - * or recursively span for all but the shortest one and - * iteratively continue the span with the shortest local match. - * + Remember the longest recursive span (the farthest end point). - * + If there is no match at the current position, neither for the code point there - * nor for any set string, then stop and return the longest recursive span length. - * - * Optimized implementation: - * - * (We assume that most sets will have very few very short strings. - * A span using a string-less set is extremely fast.) - * - * Create and cache a spanSet which contains all of the single code points - * of the original set but none of its strings. - * - * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). - * - Loop: - * + Try to match each set string at the end of the spanLength. - * ~ Set strings that start with set-contained code points must be matched - * with a partial overlap because the recursive algorithm would have tried - * to match them at every position. - * ~ Set strings that entirely consist of set-contained code points - * are irrelevant for span(USET_SPAN_CONTAINED) because the - * recursive algorithm would continue after them anyway - * and find the longest recursive match from their end. - * ~ Rather than recursing, note each end point of a set string match. - * + If no set string matched after spanSet.span(), then return - * with where the spanSet.span() ended. - * + If at least one set string matched after spanSet.span(), then - * pop the shortest string match end point and continue - * the loop, trying to match all set strings from there. - * + If at least one more set string matched after a previous string match, - * then test if the code point after the previous string match is also - * contained in the set. - * Continue the loop with the shortest end point of either this code point - * or a matching set string. - * + If no more set string matched after a previous string match, - * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). - * Stop if spanLength==0, otherwise continue the loop. - * - * By noting each end point of a set string match, - * the function visits each string position at most once and finishes - * in linear time. - * - * The recursive algorithm may visit the same string position many times - * if multiple paths lead to it and finishes in exponential time. - */ - -/* - * Algorithm for span(USET_SPAN_SIMPLE) - * - * Theoretical algorithm: - * - Iterate through the string, and at each code point boundary: - * + If the code point there is in the set, then remember to continue after it. - * + If a set string matches at the current position, then remember to continue after it. - * + Continue from the farthest match position and ignore all others. - * + If there is no match at the current position, - * then stop and return the current position. - * - * Optimized implementation: - * - * (Same assumption and spanSet as above.) - * - * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). - * - Loop: - * + Try to match each set string at the end of the spanLength. - * ~ Set strings that start with set-contained code points must be matched - * with a partial overlap because the standard algorithm would have tried - * to match them earlier. - * ~ Set strings that entirely consist of set-contained code points - * must be matched with a full overlap because the longest-match algorithm - * would hide set string matches that end earlier. - * Such set strings need not be matched earlier inside the code point span - * because the standard algorithm would then have continued after - * the set string match anyway. - * ~ Remember the longest set string match (farthest end point) from the earliest - * starting point. - * + If no set string matched after spanSet.span(), then return - * with where the spanSet.span() ended. - * + If at least one set string matched, then continue the loop after the - * longest match from the earliest position. - * + If no more set string matched after a previous string match, - * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). - * Stop if spanLength==0, otherwise continue the loop. - */ - -int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { - if(spanCondition==USET_SPAN_NOT_CONTAINED) { - return spanNot(s, length); - } - int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); - if(spanLength==length) { - return length; - } - - // Consider strings; they may overlap with the span. - OffsetList offsets; - if(spanCondition==USET_SPAN_CONTAINED) { - // Use offset list to try all possibilities. - offsets.setMaxLength(maxLength16); - } - int32_t pos=spanLength, rest=length-pos; - int32_t i, stringsLength=strings.size(); - for(;;) { - if(spanCondition==USET_SPAN_CONTAINED) { - for(i=0; i<stringsLength; ++i) { - int32_t overlap=spanLengths[i]; - if(overlap==ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); - const UChar *s16=string.getBuffer(); - int32_t length16=string.length(); - - // Try to match this string at pos-overlap..pos. - if(overlap>=LONG_SPAN) { - overlap=length16; - // While contained: No point matching fully inside the code point span. - U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. - } - if(overlap>spanLength) { - overlap=spanLength; - } - int32_t inc=length16-overlap; // Keep overlap+inc==length16. - for(;;) { - if(inc>rest) { - break; - } - // Try to match if the increment is not listed already. - if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { - if(inc==rest) { - return length; // Reached the end of the string. - } - offsets.addOffset(inc); - } - if(overlap==0) { - break; - } - --overlap; - ++inc; - } - } - } else /* USET_SPAN_SIMPLE */ { - int32_t maxInc=0, maxOverlap=0; - for(i=0; i<stringsLength; ++i) { - int32_t overlap=spanLengths[i]; - // For longest match, we do need to try to match even an all-contained string - // to find the match from the earliest start. - - const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); - const UChar *s16=string.getBuffer(); - int32_t length16=string.length(); - - // Try to match this string at pos-overlap..pos. - if(overlap>=LONG_SPAN) { - overlap=length16; - // Longest match: Need to match fully inside the code point span - // to find the match from the earliest start. - } - if(overlap>spanLength) { - overlap=spanLength; - } - int32_t inc=length16-overlap; // Keep overlap+inc==length16. - for(;;) { - if(inc>rest || overlap<maxOverlap) { - break; - } - // Try to match if the string is longer or starts earlier. - if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && - matches16CPB(s, pos-overlap, length, s16, length16) - ) { - maxInc=inc; // Longest match from earliest start. - maxOverlap=overlap; - break; - } - --overlap; - ++inc; - } - } - - if(maxInc!=0 || maxOverlap!=0) { - // Longest-match algorithm, and there was a string match. - // Simply continue after it. - pos+=maxInc; - rest-=maxInc; - if(rest==0) { - return length; // Reached the end of the string. - } - spanLength=0; // Match strings from after a string match. - continue; - } - } - // Finished trying to match all strings at pos. - - if(spanLength!=0 || pos==0) { - // The position is after an unlimited code point span (spanLength!=0), - // not after a string match. - // The only position where spanLength==0 after a span is pos==0. - // Otherwise, an unlimited code point span is only tried again when no - // strings match, and if such a non-initial span fails we stop. - if(offsets.isEmpty()) { - return pos; // No strings matched after a span. - } - // Match strings from after the next string match. - } else { - // The position is after a string match (or a single code point). - if(offsets.isEmpty()) { - // No more strings matched after a previous string match. - // Try another code point span from after the last string match. - spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); - if( spanLength==rest || // Reached the end of the string, or - spanLength==0 // neither strings nor span progressed. - ) { - return pos+spanLength; - } - pos+=spanLength; - rest-=spanLength; - continue; // spanLength>0: Match strings from after a span. - } else { - // Try to match only one code point from after a string match if some - // string matched beyond it, so that we try all possible positions - // and don't overshoot. - spanLength=spanOne(spanSet, s+pos, rest); - if(spanLength>0) { - if(spanLength==rest) { - return length; // Reached the end of the string. - } - // Match strings after this code point. - // There cannot be any increments below it because UnicodeSet strings - // contain multiple code points. - pos+=spanLength; - rest-=spanLength; - offsets.shift(spanLength); - spanLength=0; - continue; // Match strings from after a single code point. - } - // Match strings from after the next string match. - } - } - int32_t minOffset=offsets.popMinimum(); - pos+=minOffset; - rest-=minOffset; - spanLength=0; // Match strings from after a string match. - } -} - -int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { - if(spanCondition==USET_SPAN_NOT_CONTAINED) { - return spanNotBack(s, length); - } - int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); - if(pos==0) { - return 0; - } - int32_t spanLength=length-pos; - - // Consider strings; they may overlap with the span. - OffsetList offsets; - if(spanCondition==USET_SPAN_CONTAINED) { - // Use offset list to try all possibilities. - offsets.setMaxLength(maxLength16); - } - int32_t i, stringsLength=strings.size(); - uint8_t *spanBackLengths=spanLengths; - if(all) { - spanBackLengths+=stringsLength; - } - for(;;) { - if(spanCondition==USET_SPAN_CONTAINED) { - for(i=0; i<stringsLength; ++i) { - int32_t overlap=spanBackLengths[i]; - if(overlap==ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); - const UChar *s16=string.getBuffer(); - int32_t length16=string.length(); - - // Try to match this string at pos-(length16-overlap)..pos-length16. - if(overlap>=LONG_SPAN) { - overlap=length16; - // While contained: No point matching fully inside the code point span. - int32_t len1=0; - U16_FWD_1(s16, len1, overlap); - overlap-=len1; // Length of the string minus the first code point. - } - if(overlap>spanLength) { - overlap=spanLength; - } - int32_t dec=length16-overlap; // Keep dec+overlap==length16. - for(;;) { - if(dec>pos) { - break; - } - // Try to match if the decrement is not listed already. - if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { - if(dec==pos) { - return 0; // Reached the start of the string. - } - offsets.addOffset(dec); - } - if(overlap==0) { - break; - } - --overlap; - ++dec; - } - } - } else /* USET_SPAN_SIMPLE */ { - int32_t maxDec=0, maxOverlap=0; - for(i=0; i<stringsLength; ++i) { - int32_t overlap=spanBackLengths[i]; - // For longest match, we do need to try to match even an all-contained string - // to find the match from the latest end. - - const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); - const UChar *s16=string.getBuffer(); - int32_t length16=string.length(); - - // Try to match this string at pos-(length16-overlap)..pos-length16. - if(overlap>=LONG_SPAN) { - overlap=length16; - // Longest match: Need to match fully inside the code point span - // to find the match from the latest end. - } - if(overlap>spanLength) { - overlap=spanLength; - } - int32_t dec=length16-overlap; // Keep dec+overlap==length16. - for(;;) { - if(dec>pos || overlap<maxOverlap) { - break; - } - // Try to match if the string is longer or ends later. - if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && - matches16CPB(s, pos-dec, length, s16, length16) - ) { - maxDec=dec; // Longest match from latest end. - maxOverlap=overlap; - break; - } - --overlap; - ++dec; - } - } - - if(maxDec!=0 || maxOverlap!=0) { - // Longest-match algorithm, and there was a string match. - // Simply continue before it. - pos-=maxDec; - if(pos==0) { - return 0; // Reached the start of the string. - } - spanLength=0; // Match strings from before a string match. - continue; - } - } - // Finished trying to match all strings at pos. - - if(spanLength!=0 || pos==length) { - // The position is before an unlimited code point span (spanLength!=0), - // not before a string match. - // The only position where spanLength==0 before a span is pos==length. - // Otherwise, an unlimited code point span is only tried again when no - // strings match, and if such a non-initial span fails we stop. - if(offsets.isEmpty()) { - return pos; // No strings matched before a span. - } - // Match strings from before the next string match. - } else { - // The position is before a string match (or a single code point). - if(offsets.isEmpty()) { - // No more strings matched before a previous string match. - // Try another code point span from before the last string match. - int32_t oldPos=pos; - pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); - spanLength=oldPos-pos; - if( pos==0 || // Reached the start of the string, or - spanLength==0 // neither strings nor span progressed. - ) { - return pos; - } - continue; // spanLength>0: Match strings from before a span. - } else { - // Try to match only one code point from before a string match if some - // string matched beyond it, so that we try all possible positions - // and don't overshoot. - spanLength=spanOneBack(spanSet, s, pos); - if(spanLength>0) { - if(spanLength==pos) { - return 0; // Reached the start of the string. - } - // Match strings before this code point. - // There cannot be any decrements below it because UnicodeSet strings - // contain multiple code points. - pos-=spanLength; - offsets.shift(spanLength); - spanLength=0; - continue; // Match strings from before a single code point. - } - // Match strings from before the next string match. - } - } - pos-=offsets.popMinimum(); - spanLength=0; // Match strings from before a string match. - } -} - -int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { - if(spanCondition==USET_SPAN_NOT_CONTAINED) { - return spanNotUTF8(s, length); - } - int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); - if(spanLength==length) { - return length; - } - - // Consider strings; they may overlap with the span. - OffsetList offsets; - if(spanCondition==USET_SPAN_CONTAINED) { - // Use offset list to try all possibilities. - offsets.setMaxLength(maxLength8); - } - int32_t pos=spanLength, rest=length-pos; - int32_t i, stringsLength=strings.size(); - uint8_t *spanUTF8Lengths=spanLengths; - if(all) { - spanUTF8Lengths+=2*stringsLength; - } - for(;;) { - const uint8_t *s8=utf8; - int32_t length8; - if(spanCondition==USET_SPAN_CONTAINED) { - for(i=0; i<stringsLength; ++i) { - length8=utf8Lengths[i]; - if(length8==0) { - continue; // String not representable in UTF-8. - } - int32_t overlap=spanUTF8Lengths[i]; - if(overlap==ALL_CP_CONTAINED) { - s8+=length8; - continue; // Irrelevant string. - } - - // Try to match this string at pos-overlap..pos. - if(overlap>=LONG_SPAN) { - overlap=length8; - // While contained: No point matching fully inside the code point span. - U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. - } - if(overlap>spanLength) { - overlap=spanLength; - } - int32_t inc=length8-overlap; // Keep overlap+inc==length8. - for(;;) { - if(inc>rest) { - break; - } - // Try to match if the increment is not listed already. - // Match at code point boundaries. (The UTF-8 strings were converted - // from UTF-16 and are guaranteed to be well-formed.) + return set.contains(c) ? 1 : -1; + } + int32_t i=length-1; + c=utf8_prevCharSafeBody(s, 0, &i, c, -3); + length-=i; + return set.contains(c) ? length : -length; +} + +/* + * Note: In span() when spanLength==0 (after a string match, or at the beginning + * after an empty code point span) and in spanNot() and spanNotUTF8(), + * string matching could use a binary search + * because all string matches are done from the same start index. + * + * For UTF-8, this would require a comparison function that returns UTF-16 order. + * + * This optimization should not be necessary for normal UnicodeSets because + * most sets have no strings, and most sets with strings have + * very few very short strings. + * For cases with many strings, it might be better to use a different API + * and implementation with a DFA (state machine). + */ + +/* + * Algorithm for span(USET_SPAN_CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Either recursively span for each code point or string match, + * or recursively span for all but the shortest one and + * iteratively continue the span with the shortest local match. + * + Remember the longest recursive span (the farthest end point). + * + If there is no match at the current position, neither for the code point there + * nor for any set string, then stop and return the longest recursive span length. + * + * Optimized implementation: + * + * (We assume that most sets will have very few very short strings. + * A span using a string-less set is extremely fast.) + * + * Create and cache a spanSet which contains all of the single code points + * of the original set but none of its strings. + * + * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points must be matched + * with a partial overlap because the recursive algorithm would have tried + * to match them at every position. + * ~ Set strings that entirely consist of set-contained code points + * are irrelevant for span(USET_SPAN_CONTAINED) because the + * recursive algorithm would continue after them anyway + * and find the longest recursive match from their end. + * ~ Rather than recursing, note each end point of a set string match. + * + If no set string matched after spanSet.span(), then return + * with where the spanSet.span() ended. + * + If at least one set string matched after spanSet.span(), then + * pop the shortest string match end point and continue + * the loop, trying to match all set strings from there. + * + If at least one more set string matched after a previous string match, + * then test if the code point after the previous string match is also + * contained in the set. + * Continue the loop with the shortest end point of either this code point + * or a matching set string. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + * + * By noting each end point of a set string match, + * the function visits each string position at most once and finishes + * in linear time. + * + * The recursive algorithm may visit the same string position many times + * if multiple paths lead to it and finishes in exponential time. + */ + +/* + * Algorithm for span(USET_SPAN_SIMPLE) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Continue from the farthest match position and ignore all others. + * + If there is no match at the current position, + * then stop and return the current position. + * + * Optimized implementation: + * + * (Same assumption and spanSet as above.) + * + * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points must be matched + * with a partial overlap because the standard algorithm would have tried + * to match them earlier. + * ~ Set strings that entirely consist of set-contained code points + * must be matched with a full overlap because the longest-match algorithm + * would hide set string matches that end earlier. + * Such set strings need not be matched earlier inside the code point span + * because the standard algorithm would then have continued after + * the set string match anyway. + * ~ Remember the longest set string match (farthest end point) from the earliest + * starting point. + * + If no set string matched after spanSet.span(), then return + * with where the spanSet.span() ended. + * + If at least one set string matched, then continue the loop after the + * longest match from the earliest position. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + */ + +int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + return spanNot(s, length); + } + int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); + if(spanLength==length) { + return length; + } + + // Consider strings; they may overlap with the span. + OffsetList offsets; + if(spanCondition==USET_SPAN_CONTAINED) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength16); + } + int32_t pos=spanLength, rest=length-pos; + int32_t i, stringsLength=strings.size(); + for(;;) { + if(spanCondition==USET_SPAN_CONTAINED) { + for(i=0; i<stringsLength; ++i) { + int32_t overlap=spanLengths[i]; + if(overlap==ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const UChar *s16=string.getBuffer(); + int32_t length16=string.length(); + + // Try to match this string at pos-overlap..pos. + if(overlap>=LONG_SPAN) { + overlap=length16; + // While contained: No point matching fully inside the code point span. + U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t inc=length16-overlap; // Keep overlap+inc==length16. + for(;;) { + if(inc>rest) { + break; + } + // Try to match if the increment is not listed already. + if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { + if(inc==rest) { + return length; // Reached the end of the string. + } + offsets.addOffset(inc); + } + if(overlap==0) { + break; + } + --overlap; + ++inc; + } + } + } else /* USET_SPAN_SIMPLE */ { + int32_t maxInc=0, maxOverlap=0; + for(i=0; i<stringsLength; ++i) { + int32_t overlap=spanLengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the earliest start. + + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const UChar *s16=string.getBuffer(); + int32_t length16=string.length(); + + // Try to match this string at pos-overlap..pos. + if(overlap>=LONG_SPAN) { + overlap=length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the earliest start. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t inc=length16-overlap; // Keep overlap+inc==length16. + for(;;) { + if(inc>rest || overlap<maxOverlap) { + break; + } + // Try to match if the string is longer or starts earlier. + if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && + matches16CPB(s, pos-overlap, length, s16, length16) + ) { + maxInc=inc; // Longest match from earliest start. + maxOverlap=overlap; + break; + } + --overlap; + ++inc; + } + } + + if(maxInc!=0 || maxOverlap!=0) { + // Longest-match algorithm, and there was a string match. + // Simply continue after it. + pos+=maxInc; + rest-=maxInc; + if(rest==0) { + return length; // Reached the end of the string. + } + spanLength=0; // Match strings from after a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if(spanLength!=0 || pos==0) { + // The position is after an unlimited code point span (spanLength!=0), + // not after a string match. + // The only position where spanLength==0 after a span is pos==0. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if(offsets.isEmpty()) { + return pos; // No strings matched after a span. + } + // Match strings from after the next string match. + } else { + // The position is after a string match (or a single code point). + if(offsets.isEmpty()) { + // No more strings matched after a previous string match. + // Try another code point span from after the last string match. + spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); + if( spanLength==rest || // Reached the end of the string, or + spanLength==0 // neither strings nor span progressed. + ) { + return pos+spanLength; + } + pos+=spanLength; + rest-=spanLength; + continue; // spanLength>0: Match strings from after a span. + } else { + // Try to match only one code point from after a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength=spanOne(spanSet, s+pos, rest); + if(spanLength>0) { + if(spanLength==rest) { + return length; // Reached the end of the string. + } + // Match strings after this code point. + // There cannot be any increments below it because UnicodeSet strings + // contain multiple code points. + pos+=spanLength; + rest-=spanLength; + offsets.shift(spanLength); + spanLength=0; + continue; // Match strings from after a single code point. + } + // Match strings from after the next string match. + } + } + int32_t minOffset=offsets.popMinimum(); + pos+=minOffset; + rest-=minOffset; + spanLength=0; // Match strings from after a string match. + } +} + +int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + return spanNotBack(s, length); + } + int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); + if(pos==0) { + return 0; + } + int32_t spanLength=length-pos; + + // Consider strings; they may overlap with the span. + OffsetList offsets; + if(spanCondition==USET_SPAN_CONTAINED) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength16); + } + int32_t i, stringsLength=strings.size(); + uint8_t *spanBackLengths=spanLengths; + if(all) { + spanBackLengths+=stringsLength; + } + for(;;) { + if(spanCondition==USET_SPAN_CONTAINED) { + for(i=0; i<stringsLength; ++i) { + int32_t overlap=spanBackLengths[i]; + if(overlap==ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const UChar *s16=string.getBuffer(); + int32_t length16=string.length(); + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if(overlap>=LONG_SPAN) { + overlap=length16; + // While contained: No point matching fully inside the code point span. + int32_t len1=0; + U16_FWD_1(s16, len1, overlap); + overlap-=len1; // Length of the string minus the first code point. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t dec=length16-overlap; // Keep dec+overlap==length16. + for(;;) { + if(dec>pos) { + break; + } + // Try to match if the decrement is not listed already. + if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { + if(dec==pos) { + return 0; // Reached the start of the string. + } + offsets.addOffset(dec); + } + if(overlap==0) { + break; + } + --overlap; + ++dec; + } + } + } else /* USET_SPAN_SIMPLE */ { + int32_t maxDec=0, maxOverlap=0; + for(i=0; i<stringsLength; ++i) { + int32_t overlap=spanBackLengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the latest end. + + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const UChar *s16=string.getBuffer(); + int32_t length16=string.length(); + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if(overlap>=LONG_SPAN) { + overlap=length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the latest end. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t dec=length16-overlap; // Keep dec+overlap==length16. + for(;;) { + if(dec>pos || overlap<maxOverlap) { + break; + } + // Try to match if the string is longer or ends later. + if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && + matches16CPB(s, pos-dec, length, s16, length16) + ) { + maxDec=dec; // Longest match from latest end. + maxOverlap=overlap; + break; + } + --overlap; + ++dec; + } + } + + if(maxDec!=0 || maxOverlap!=0) { + // Longest-match algorithm, and there was a string match. + // Simply continue before it. + pos-=maxDec; + if(pos==0) { + return 0; // Reached the start of the string. + } + spanLength=0; // Match strings from before a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if(spanLength!=0 || pos==length) { + // The position is before an unlimited code point span (spanLength!=0), + // not before a string match. + // The only position where spanLength==0 before a span is pos==length. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if(offsets.isEmpty()) { + return pos; // No strings matched before a span. + } + // Match strings from before the next string match. + } else { + // The position is before a string match (or a single code point). + if(offsets.isEmpty()) { + // No more strings matched before a previous string match. + // Try another code point span from before the last string match. + int32_t oldPos=pos; + pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); + spanLength=oldPos-pos; + if( pos==0 || // Reached the start of the string, or + spanLength==0 // neither strings nor span progressed. + ) { + return pos; + } + continue; // spanLength>0: Match strings from before a span. + } else { + // Try to match only one code point from before a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength=spanOneBack(spanSet, s, pos); + if(spanLength>0) { + if(spanLength==pos) { + return 0; // Reached the start of the string. + } + // Match strings before this code point. + // There cannot be any decrements below it because UnicodeSet strings + // contain multiple code points. + pos-=spanLength; + offsets.shift(spanLength); + spanLength=0; + continue; // Match strings from before a single code point. + } + // Match strings from before the next string match. + } + } + pos-=offsets.popMinimum(); + spanLength=0; // Match strings from before a string match. + } +} + +int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + return spanNotUTF8(s, length); + } + int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); + if(spanLength==length) { + return length; + } + + // Consider strings; they may overlap with the span. + OffsetList offsets; + if(spanCondition==USET_SPAN_CONTAINED) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength8); + } + int32_t pos=spanLength, rest=length-pos; + int32_t i, stringsLength=strings.size(); + uint8_t *spanUTF8Lengths=spanLengths; + if(all) { + spanUTF8Lengths+=2*stringsLength; + } + for(;;) { + const uint8_t *s8=utf8; + int32_t length8; + if(spanCondition==USET_SPAN_CONTAINED) { + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + if(length8==0) { + continue; // String not representable in UTF-8. + } + int32_t overlap=spanUTF8Lengths[i]; + if(overlap==ALL_CP_CONTAINED) { + s8+=length8; + continue; // Irrelevant string. + } + + // Try to match this string at pos-overlap..pos. + if(overlap>=LONG_SPAN) { + overlap=length8; + // While contained: No point matching fully inside the code point span. + U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t inc=length8-overlap; // Keep overlap+inc==length8. + for(;;) { + if(inc>rest) { + break; + } + // Try to match if the increment is not listed already. + // Match at code point boundaries. (The UTF-8 strings were converted + // from UTF-16 and are guaranteed to be well-formed.) if(!U8_IS_TRAIL(s[pos-overlap]) && !offsets.containsOffset(inc) && matches8(s+pos-overlap, s8, length8)) { - if(inc==rest) { - return length; // Reached the end of the string. - } - offsets.addOffset(inc); - } - if(overlap==0) { - break; - } - --overlap; - ++inc; - } - s8+=length8; - } - } else /* USET_SPAN_SIMPLE */ { - int32_t maxInc=0, maxOverlap=0; - for(i=0; i<stringsLength; ++i) { - length8=utf8Lengths[i]; - if(length8==0) { - continue; // String not representable in UTF-8. - } - int32_t overlap=spanUTF8Lengths[i]; - // For longest match, we do need to try to match even an all-contained string - // to find the match from the earliest start. - - // Try to match this string at pos-overlap..pos. - if(overlap>=LONG_SPAN) { - overlap=length8; - // Longest match: Need to match fully inside the code point span - // to find the match from the earliest start. - } - if(overlap>spanLength) { - overlap=spanLength; - } - int32_t inc=length8-overlap; // Keep overlap+inc==length8. - for(;;) { - if(inc>rest || overlap<maxOverlap) { - break; - } - // Try to match if the string is longer or starts earlier. - // Match at code point boundaries. (The UTF-8 strings were converted - // from UTF-16 and are guaranteed to be well-formed.) + if(inc==rest) { + return length; // Reached the end of the string. + } + offsets.addOffset(inc); + } + if(overlap==0) { + break; + } + --overlap; + ++inc; + } + s8+=length8; + } + } else /* USET_SPAN_SIMPLE */ { + int32_t maxInc=0, maxOverlap=0; + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + if(length8==0) { + continue; // String not representable in UTF-8. + } + int32_t overlap=spanUTF8Lengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the earliest start. + + // Try to match this string at pos-overlap..pos. + if(overlap>=LONG_SPAN) { + overlap=length8; + // Longest match: Need to match fully inside the code point span + // to find the match from the earliest start. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t inc=length8-overlap; // Keep overlap+inc==length8. + for(;;) { + if(inc>rest || overlap<maxOverlap) { + break; + } + // Try to match if the string is longer or starts earlier. + // Match at code point boundaries. (The UTF-8 strings were converted + // from UTF-16 and are guaranteed to be well-formed.) if(!U8_IS_TRAIL(s[pos-overlap]) && (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && matches8(s+pos-overlap, s8, length8)) { - maxInc=inc; // Longest match from earliest start. - maxOverlap=overlap; - break; - } - --overlap; - ++inc; - } - s8+=length8; - } - - if(maxInc!=0 || maxOverlap!=0) { - // Longest-match algorithm, and there was a string match. - // Simply continue after it. - pos+=maxInc; - rest-=maxInc; - if(rest==0) { - return length; // Reached the end of the string. - } - spanLength=0; // Match strings from after a string match. - continue; - } - } - // Finished trying to match all strings at pos. - - if(spanLength!=0 || pos==0) { - // The position is after an unlimited code point span (spanLength!=0), - // not after a string match. - // The only position where spanLength==0 after a span is pos==0. - // Otherwise, an unlimited code point span is only tried again when no - // strings match, and if such a non-initial span fails we stop. - if(offsets.isEmpty()) { - return pos; // No strings matched after a span. - } - // Match strings from after the next string match. - } else { - // The position is after a string match (or a single code point). - if(offsets.isEmpty()) { - // No more strings matched after a previous string match. - // Try another code point span from after the last string match. - spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); - if( spanLength==rest || // Reached the end of the string, or - spanLength==0 // neither strings nor span progressed. - ) { - return pos+spanLength; - } - pos+=spanLength; - rest-=spanLength; - continue; // spanLength>0: Match strings from after a span. - } else { - // Try to match only one code point from after a string match if some - // string matched beyond it, so that we try all possible positions - // and don't overshoot. - spanLength=spanOneUTF8(spanSet, s+pos, rest); - if(spanLength>0) { - if(spanLength==rest) { - return length; // Reached the end of the string. - } - // Match strings after this code point. - // There cannot be any increments below it because UnicodeSet strings - // contain multiple code points. - pos+=spanLength; - rest-=spanLength; - offsets.shift(spanLength); - spanLength=0; - continue; // Match strings from after a single code point. - } - // Match strings from after the next string match. - } - } - int32_t minOffset=offsets.popMinimum(); - pos+=minOffset; - rest-=minOffset; - spanLength=0; // Match strings from after a string match. - } -} - -int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { - if(spanCondition==USET_SPAN_NOT_CONTAINED) { - return spanNotBackUTF8(s, length); - } - int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); - if(pos==0) { - return 0; - } - int32_t spanLength=length-pos; - - // Consider strings; they may overlap with the span. - OffsetList offsets; - if(spanCondition==USET_SPAN_CONTAINED) { - // Use offset list to try all possibilities. - offsets.setMaxLength(maxLength8); - } - int32_t i, stringsLength=strings.size(); - uint8_t *spanBackUTF8Lengths=spanLengths; - if(all) { - spanBackUTF8Lengths+=3*stringsLength; - } - for(;;) { - const uint8_t *s8=utf8; - int32_t length8; - if(spanCondition==USET_SPAN_CONTAINED) { - for(i=0; i<stringsLength; ++i) { - length8=utf8Lengths[i]; - if(length8==0) { - continue; // String not representable in UTF-8. - } - int32_t overlap=spanBackUTF8Lengths[i]; - if(overlap==ALL_CP_CONTAINED) { - s8+=length8; - continue; // Irrelevant string. - } - - // Try to match this string at pos-(length8-overlap)..pos-length8. - if(overlap>=LONG_SPAN) { - overlap=length8; - // While contained: No point matching fully inside the code point span. - int32_t len1=0; - U8_FWD_1(s8, len1, overlap); - overlap-=len1; // Length of the string minus the first code point. - } - if(overlap>spanLength) { - overlap=spanLength; - } - int32_t dec=length8-overlap; // Keep dec+overlap==length8. - for(;;) { - if(dec>pos) { - break; - } - // Try to match if the decrement is not listed already. - // Match at code point boundaries. (The UTF-8 strings were converted - // from UTF-16 and are guaranteed to be well-formed.) - if( !U8_IS_TRAIL(s[pos-dec]) && - !offsets.containsOffset(dec) && - matches8(s+pos-dec, s8, length8) - ) { - if(dec==pos) { - return 0; // Reached the start of the string. - } - offsets.addOffset(dec); - } - if(overlap==0) { - break; - } - --overlap; - ++dec; - } - s8+=length8; - } - } else /* USET_SPAN_SIMPLE */ { - int32_t maxDec=0, maxOverlap=0; - for(i=0; i<stringsLength; ++i) { - length8=utf8Lengths[i]; - if(length8==0) { - continue; // String not representable in UTF-8. - } - int32_t overlap=spanBackUTF8Lengths[i]; - // For longest match, we do need to try to match even an all-contained string - // to find the match from the latest end. - - // Try to match this string at pos-(length8-overlap)..pos-length8. - if(overlap>=LONG_SPAN) { - overlap=length8; - // Longest match: Need to match fully inside the code point span - // to find the match from the latest end. - } - if(overlap>spanLength) { - overlap=spanLength; - } - int32_t dec=length8-overlap; // Keep dec+overlap==length8. - for(;;) { - if(dec>pos || overlap<maxOverlap) { - break; - } - // Try to match if the string is longer or ends later. - // Match at code point boundaries. (The UTF-8 strings were converted - // from UTF-16 and are guaranteed to be well-formed.) - if( !U8_IS_TRAIL(s[pos-dec]) && - (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && - matches8(s+pos-dec, s8, length8) - ) { - maxDec=dec; // Longest match from latest end. - maxOverlap=overlap; - break; - } - --overlap; - ++dec; - } - s8+=length8; - } - - if(maxDec!=0 || maxOverlap!=0) { - // Longest-match algorithm, and there was a string match. - // Simply continue before it. - pos-=maxDec; - if(pos==0) { - return 0; // Reached the start of the string. - } - spanLength=0; // Match strings from before a string match. - continue; - } - } - // Finished trying to match all strings at pos. - - if(spanLength!=0 || pos==length) { - // The position is before an unlimited code point span (spanLength!=0), - // not before a string match. - // The only position where spanLength==0 before a span is pos==length. - // Otherwise, an unlimited code point span is only tried again when no - // strings match, and if such a non-initial span fails we stop. - if(offsets.isEmpty()) { - return pos; // No strings matched before a span. - } - // Match strings from before the next string match. - } else { - // The position is before a string match (or a single code point). - if(offsets.isEmpty()) { - // No more strings matched before a previous string match. - // Try another code point span from before the last string match. - int32_t oldPos=pos; - pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); - spanLength=oldPos-pos; - if( pos==0 || // Reached the start of the string, or - spanLength==0 // neither strings nor span progressed. - ) { - return pos; - } - continue; // spanLength>0: Match strings from before a span. - } else { - // Try to match only one code point from before a string match if some - // string matched beyond it, so that we try all possible positions - // and don't overshoot. - spanLength=spanOneBackUTF8(spanSet, s, pos); - if(spanLength>0) { - if(spanLength==pos) { - return 0; // Reached the start of the string. - } - // Match strings before this code point. - // There cannot be any decrements below it because UnicodeSet strings - // contain multiple code points. - pos-=spanLength; - offsets.shift(spanLength); - spanLength=0; - continue; // Match strings from before a single code point. - } - // Match strings from before the next string match. - } - } - pos-=offsets.popMinimum(); - spanLength=0; // Match strings from before a string match. - } -} - -/* - * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) - * - * Theoretical algorithm: - * - Iterate through the string, and at each code point boundary: - * + If the code point there is in the set, then return with the current position. - * + If a set string matches at the current position, then return with the current position. - * - * Optimized implementation: - * - * (Same assumption as for span() above.) - * - * Create and cache a spanNotSet which contains all of the single code points - * of the original set but none of its strings. - * For each set string add its initial code point to the spanNotSet. - * (Also add its final code point for spanNotBack().) - * - * - Loop: - * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). - * + If the current code point is in the original set, then - * return the current position. - * + If any set string matches at the current position, then - * return the current position. - * + If there is no match at the current position, neither for the code point there - * nor for any set string, then skip this code point and continue the loop. - * This happens for set-string-initial code points that were added to spanNotSet - * when there is not actually a match for such a set string. - */ - -int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { - int32_t pos=0, rest=length; - int32_t i, stringsLength=strings.size(); - do { - // Span until we find a code point from the set, - // or a code point that starts or ends some string. - i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); - if(i==rest) { - return length; // Reached the end of the string. - } - pos+=i; - rest-=i; - - // Check whether the current code point is in the original set, - // without the string starts and ends. - int32_t cpLength=spanOne(spanSet, s+pos, rest); - if(cpLength>0) { - return pos; // There is a set element at pos. - } - - // Try to match the strings at pos. - for(i=0; i<stringsLength; ++i) { - if(spanLengths[i]==ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); - const UChar *s16=string.getBuffer(); - int32_t length16=string.length(); - if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { - return pos; // There is a set element at pos. - } - } - - // The span(while not contained) ended on a string start/end which is - // not in the original set. Skip this code point and continue. - // cpLength<0 - pos-=cpLength; - rest+=cpLength; - } while(rest!=0); - return length; // Reached the end of the string. -} - -int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { - int32_t pos=length; - int32_t i, stringsLength=strings.size(); - do { - // Span until we find a code point from the set, - // or a code point that starts or ends some string. - pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); - if(pos==0) { - return 0; // Reached the start of the string. - } - - // Check whether the current code point is in the original set, - // without the string starts and ends. - int32_t cpLength=spanOneBack(spanSet, s, pos); - if(cpLength>0) { - return pos; // There is a set element at pos. - } - - // Try to match the strings at pos. - for(i=0; i<stringsLength; ++i) { - // Use spanLengths rather than a spanBackLengths pointer because - // it is easier and we only need to know whether the string is irrelevant - // which is the same in either array. - if(spanLengths[i]==ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); - const UChar *s16=string.getBuffer(); - int32_t length16=string.length(); - if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { - return pos; // There is a set element at pos. - } - } - - // The span(while not contained) ended on a string start/end which is - // not in the original set. Skip this code point and continue. - // cpLength<0 - pos+=cpLength; - } while(pos!=0); - return 0; // Reached the start of the string. -} - -int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { - int32_t pos=0, rest=length; - int32_t i, stringsLength=strings.size(); - uint8_t *spanUTF8Lengths=spanLengths; - if(all) { - spanUTF8Lengths+=2*stringsLength; - } - do { - // Span until we find a code point from the set, - // or a code point that starts or ends some string. - i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); - if(i==rest) { - return length; // Reached the end of the string. - } - pos+=i; - rest-=i; - - // Check whether the current code point is in the original set, - // without the string starts and ends. - int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); - if(cpLength>0) { - return pos; // There is a set element at pos. - } - - // Try to match the strings at pos. - const uint8_t *s8=utf8; - int32_t length8; - for(i=0; i<stringsLength; ++i) { - length8=utf8Lengths[i]; - // ALL_CP_CONTAINED: Irrelevant string. - if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { - return pos; // There is a set element at pos. - } - s8+=length8; - } - - // The span(while not contained) ended on a string start/end which is - // not in the original set. Skip this code point and continue. - // cpLength<0 - pos-=cpLength; - rest+=cpLength; - } while(rest!=0); - return length; // Reached the end of the string. -} - -int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { - int32_t pos=length; - int32_t i, stringsLength=strings.size(); - uint8_t *spanBackUTF8Lengths=spanLengths; - if(all) { - spanBackUTF8Lengths+=3*stringsLength; - } - do { - // Span until we find a code point from the set, - // or a code point that starts or ends some string. - pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); - if(pos==0) { - return 0; // Reached the start of the string. - } - - // Check whether the current code point is in the original set, - // without the string starts and ends. - int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); - if(cpLength>0) { - return pos; // There is a set element at pos. - } - - // Try to match the strings at pos. - const uint8_t *s8=utf8; - int32_t length8; - for(i=0; i<stringsLength; ++i) { - length8=utf8Lengths[i]; - // ALL_CP_CONTAINED: Irrelevant string. - if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { - return pos; // There is a set element at pos. - } - s8+=length8; - } - - // The span(while not contained) ended on a string start/end which is - // not in the original set. Skip this code point and continue. - // cpLength<0 - pos+=cpLength; - } while(pos!=0); - return 0; // Reached the start of the string. -} - -U_NAMESPACE_END + maxInc=inc; // Longest match from earliest start. + maxOverlap=overlap; + break; + } + --overlap; + ++inc; + } + s8+=length8; + } + + if(maxInc!=0 || maxOverlap!=0) { + // Longest-match algorithm, and there was a string match. + // Simply continue after it. + pos+=maxInc; + rest-=maxInc; + if(rest==0) { + return length; // Reached the end of the string. + } + spanLength=0; // Match strings from after a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if(spanLength!=0 || pos==0) { + // The position is after an unlimited code point span (spanLength!=0), + // not after a string match. + // The only position where spanLength==0 after a span is pos==0. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if(offsets.isEmpty()) { + return pos; // No strings matched after a span. + } + // Match strings from after the next string match. + } else { + // The position is after a string match (or a single code point). + if(offsets.isEmpty()) { + // No more strings matched after a previous string match. + // Try another code point span from after the last string match. + spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); + if( spanLength==rest || // Reached the end of the string, or + spanLength==0 // neither strings nor span progressed. + ) { + return pos+spanLength; + } + pos+=spanLength; + rest-=spanLength; + continue; // spanLength>0: Match strings from after a span. + } else { + // Try to match only one code point from after a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength=spanOneUTF8(spanSet, s+pos, rest); + if(spanLength>0) { + if(spanLength==rest) { + return length; // Reached the end of the string. + } + // Match strings after this code point. + // There cannot be any increments below it because UnicodeSet strings + // contain multiple code points. + pos+=spanLength; + rest-=spanLength; + offsets.shift(spanLength); + spanLength=0; + continue; // Match strings from after a single code point. + } + // Match strings from after the next string match. + } + } + int32_t minOffset=offsets.popMinimum(); + pos+=minOffset; + rest-=minOffset; + spanLength=0; // Match strings from after a string match. + } +} + +int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + return spanNotBackUTF8(s, length); + } + int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); + if(pos==0) { + return 0; + } + int32_t spanLength=length-pos; + + // Consider strings; they may overlap with the span. + OffsetList offsets; + if(spanCondition==USET_SPAN_CONTAINED) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength8); + } + int32_t i, stringsLength=strings.size(); + uint8_t *spanBackUTF8Lengths=spanLengths; + if(all) { + spanBackUTF8Lengths+=3*stringsLength; + } + for(;;) { + const uint8_t *s8=utf8; + int32_t length8; + if(spanCondition==USET_SPAN_CONTAINED) { + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + if(length8==0) { + continue; // String not representable in UTF-8. + } + int32_t overlap=spanBackUTF8Lengths[i]; + if(overlap==ALL_CP_CONTAINED) { + s8+=length8; + continue; // Irrelevant string. + } + + // Try to match this string at pos-(length8-overlap)..pos-length8. + if(overlap>=LONG_SPAN) { + overlap=length8; + // While contained: No point matching fully inside the code point span. + int32_t len1=0; + U8_FWD_1(s8, len1, overlap); + overlap-=len1; // Length of the string minus the first code point. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t dec=length8-overlap; // Keep dec+overlap==length8. + for(;;) { + if(dec>pos) { + break; + } + // Try to match if the decrement is not listed already. + // Match at code point boundaries. (The UTF-8 strings were converted + // from UTF-16 and are guaranteed to be well-formed.) + if( !U8_IS_TRAIL(s[pos-dec]) && + !offsets.containsOffset(dec) && + matches8(s+pos-dec, s8, length8) + ) { + if(dec==pos) { + return 0; // Reached the start of the string. + } + offsets.addOffset(dec); + } + if(overlap==0) { + break; + } + --overlap; + ++dec; + } + s8+=length8; + } + } else /* USET_SPAN_SIMPLE */ { + int32_t maxDec=0, maxOverlap=0; + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + if(length8==0) { + continue; // String not representable in UTF-8. + } + int32_t overlap=spanBackUTF8Lengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the latest end. + + // Try to match this string at pos-(length8-overlap)..pos-length8. + if(overlap>=LONG_SPAN) { + overlap=length8; + // Longest match: Need to match fully inside the code point span + // to find the match from the latest end. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t dec=length8-overlap; // Keep dec+overlap==length8. + for(;;) { + if(dec>pos || overlap<maxOverlap) { + break; + } + // Try to match if the string is longer or ends later. + // Match at code point boundaries. (The UTF-8 strings were converted + // from UTF-16 and are guaranteed to be well-formed.) + if( !U8_IS_TRAIL(s[pos-dec]) && + (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && + matches8(s+pos-dec, s8, length8) + ) { + maxDec=dec; // Longest match from latest end. + maxOverlap=overlap; + break; + } + --overlap; + ++dec; + } + s8+=length8; + } + + if(maxDec!=0 || maxOverlap!=0) { + // Longest-match algorithm, and there was a string match. + // Simply continue before it. + pos-=maxDec; + if(pos==0) { + return 0; // Reached the start of the string. + } + spanLength=0; // Match strings from before a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if(spanLength!=0 || pos==length) { + // The position is before an unlimited code point span (spanLength!=0), + // not before a string match. + // The only position where spanLength==0 before a span is pos==length. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if(offsets.isEmpty()) { + return pos; // No strings matched before a span. + } + // Match strings from before the next string match. + } else { + // The position is before a string match (or a single code point). + if(offsets.isEmpty()) { + // No more strings matched before a previous string match. + // Try another code point span from before the last string match. + int32_t oldPos=pos; + pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); + spanLength=oldPos-pos; + if( pos==0 || // Reached the start of the string, or + spanLength==0 // neither strings nor span progressed. + ) { + return pos; + } + continue; // spanLength>0: Match strings from before a span. + } else { + // Try to match only one code point from before a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength=spanOneBackUTF8(spanSet, s, pos); + if(spanLength>0) { + if(spanLength==pos) { + return 0; // Reached the start of the string. + } + // Match strings before this code point. + // There cannot be any decrements below it because UnicodeSet strings + // contain multiple code points. + pos-=spanLength; + offsets.shift(spanLength); + spanLength=0; + continue; // Match strings from before a single code point. + } + // Match strings from before the next string match. + } + } + pos-=offsets.popMinimum(); + spanLength=0; // Match strings from before a string match. + } +} + +/* + * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then return with the current position. + * + If a set string matches at the current position, then return with the current position. + * + * Optimized implementation: + * + * (Same assumption as for span() above.) + * + * Create and cache a spanNotSet which contains all of the single code points + * of the original set but none of its strings. + * For each set string add its initial code point to the spanNotSet. + * (Also add its final code point for spanNotBack().) + * + * - Loop: + * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). + * + If the current code point is in the original set, then + * return the current position. + * + If any set string matches at the current position, then + * return the current position. + * + If there is no match at the current position, neither for the code point there + * nor for any set string, then skip this code point and continue the loop. + * This happens for set-string-initial code points that were added to spanNotSet + * when there is not actually a match for such a set string. + */ + +int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { + int32_t pos=0, rest=length; + int32_t i, stringsLength=strings.size(); + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); + if(i==rest) { + return length; // Reached the end of the string. + } + pos+=i; + rest-=i; + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int32_t cpLength=spanOne(spanSet, s+pos, rest); + if(cpLength>0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for(i=0; i<stringsLength; ++i) { + if(spanLengths[i]==ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const UChar *s16=string.getBuffer(); + int32_t length16=string.length(); + if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos-=cpLength; + rest+=cpLength; + } while(rest!=0); + return length; // Reached the end of the string. +} + +int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { + int32_t pos=length; + int32_t i, stringsLength=strings.size(); + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); + if(pos==0) { + return 0; // Reached the start of the string. + } + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int32_t cpLength=spanOneBack(spanSet, s, pos); + if(cpLength>0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for(i=0; i<stringsLength; ++i) { + // Use spanLengths rather than a spanBackLengths pointer because + // it is easier and we only need to know whether the string is irrelevant + // which is the same in either array. + if(spanLengths[i]==ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const UChar *s16=string.getBuffer(); + int32_t length16=string.length(); + if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos+=cpLength; + } while(pos!=0); + return 0; // Reached the start of the string. +} + +int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { + int32_t pos=0, rest=length; + int32_t i, stringsLength=strings.size(); + uint8_t *spanUTF8Lengths=spanLengths; + if(all) { + spanUTF8Lengths+=2*stringsLength; + } + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); + if(i==rest) { + return length; // Reached the end of the string. + } + pos+=i; + rest-=i; + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); + if(cpLength>0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + const uint8_t *s8=utf8; + int32_t length8; + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + // ALL_CP_CONTAINED: Irrelevant string. + if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { + return pos; // There is a set element at pos. + } + s8+=length8; + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos-=cpLength; + rest+=cpLength; + } while(rest!=0); + return length; // Reached the end of the string. +} + +int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { + int32_t pos=length; + int32_t i, stringsLength=strings.size(); + uint8_t *spanBackUTF8Lengths=spanLengths; + if(all) { + spanBackUTF8Lengths+=3*stringsLength; + } + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); + if(pos==0) { + return 0; // Reached the start of the string. + } + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); + if(cpLength>0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + const uint8_t *s8=utf8; + int32_t length8; + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + // ALL_CP_CONTAINED: Irrelevant string. + if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { + return pos; // There is a set element at pos. + } + s8+=length8; + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos+=cpLength; + } while(pos!=0); + return 0; // Reached the start of the string. +} + +U_NAMESPACE_END |