diff options
author | mcheshkov <mcheshkov@yandex-team.ru> | 2022-02-10 16:46:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:15 +0300 |
commit | e9d19cec64684c9c1e6b0c98297e5b895cf904fe (patch) | |
tree | 2768b1223e96a8a0610a93d18425d9647c1123c8 /contrib/libs/icu/common/umutablecptrie.cpp | |
parent | 60040c91ffe701a84689b2c6310ff845e65cff42 (diff) | |
download | ydb-e9d19cec64684c9c1e6b0c98297e5b895cf904fe.tar.gz |
Restoring authorship annotation for <mcheshkov@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/common/umutablecptrie.cpp')
-rw-r--r-- | contrib/libs/icu/common/umutablecptrie.cpp | 3704 |
1 files changed, 1852 insertions, 1852 deletions
diff --git a/contrib/libs/icu/common/umutablecptrie.cpp b/contrib/libs/icu/common/umutablecptrie.cpp index cdbe27080b..6ef717a172 100644 --- a/contrib/libs/icu/common/umutablecptrie.cpp +++ b/contrib/libs/icu/common/umutablecptrie.cpp @@ -1,1852 +1,1852 @@ -// © 2017 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html - -// umutablecptrie.cpp (inspired by utrie2_builder.cpp) -// created: 2017dec29 Markus W. Scherer - -// #define UCPTRIE_DEBUG -#ifdef UCPTRIE_DEBUG -# include <stdio.h> -#endif - -#include "unicode/utypes.h" -#include "unicode/ucptrie.h" -#include "unicode/umutablecptrie.h" -#include "unicode/uobject.h" -#include "unicode/utf16.h" -#include "cmemory.h" -#include "uassert.h" -#include "ucptrie_impl.h" - -// ICU-20235 In case Microsoft math.h has defined this, undefine it. -#ifdef OVERFLOW -#undef OVERFLOW -#endif - -U_NAMESPACE_BEGIN - -namespace { - -constexpr int32_t MAX_UNICODE = 0x10ffff; - -constexpr int32_t UNICODE_LIMIT = 0x110000; -constexpr int32_t BMP_LIMIT = 0x10000; -constexpr int32_t ASCII_LIMIT = 0x80; - -constexpr int32_t I_LIMIT = UNICODE_LIMIT >> UCPTRIE_SHIFT_3; -constexpr int32_t BMP_I_LIMIT = BMP_LIMIT >> UCPTRIE_SHIFT_3; -constexpr int32_t ASCII_I_LIMIT = ASCII_LIMIT >> UCPTRIE_SHIFT_3; - -constexpr int32_t SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3)); - -// Flag values for data blocks. -constexpr uint8_t ALL_SAME = 0; -constexpr uint8_t MIXED = 1; -constexpr uint8_t SAME_AS = 2; - -/** Start with allocation of 16k data entries. */ -constexpr int32_t INITIAL_DATA_LENGTH = ((int32_t)1 << 14); - -/** Grow about 8x each time. */ -constexpr int32_t MEDIUM_DATA_LENGTH = ((int32_t)1 << 17); - -/** - * Maximum length of the build-time data array. - * One entry per 0x110000 code points. - */ -constexpr int32_t MAX_DATA_LENGTH = UNICODE_LIMIT; - -// Flag values for index-3 blocks while compacting/building. -constexpr uint8_t I3_NULL = 0; -constexpr uint8_t I3_BMP = 1; -constexpr uint8_t I3_16 = 2; -constexpr uint8_t I3_18 = 3; - -constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8; - -class AllSameBlocks; -class MixedBlocks; - -class MutableCodePointTrie : public UMemory { -public: - MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode); - MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode); - MutableCodePointTrie(const MutableCodePointTrie &other) = delete; - ~MutableCodePointTrie(); - - MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete; - - static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode); - static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode); - - uint32_t get(UChar32 c) const; - int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context, - uint32_t *pValue) const; - - void set(UChar32 c, uint32_t value, UErrorCode &errorCode); - void setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode); - - UCPTrie *build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode); - -private: - void clear(); - - bool ensureHighStart(UChar32 c); - int32_t allocDataBlock(int32_t blockLength); - int32_t getDataBlock(int32_t i); - - void maskValues(uint32_t mask); - UChar32 findHighStart() const; - int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks); - int32_t compactData( - int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, - int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode); - int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode); - int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode); - - uint32_t *index = nullptr; - int32_t indexCapacity = 0; - int32_t index3NullOffset = -1; - uint32_t *data = nullptr; - int32_t dataCapacity = 0; - int32_t dataLength = 0; - int32_t dataNullOffset = -1; - - uint32_t origInitialValue; - uint32_t initialValue; - uint32_t errorValue; - UChar32 highStart; - uint32_t highValue; -#ifdef UCPTRIE_DEBUG -public: - const char *name; -#endif -private: - /** Temporary array while building the final data. */ - uint16_t *index16 = nullptr; - uint8_t flags[UNICODE_LIMIT >> UCPTRIE_SHIFT_3]; -}; - -MutableCodePointTrie::MutableCodePointTrie(uint32_t iniValue, uint32_t errValue, UErrorCode &errorCode) : - origInitialValue(iniValue), initialValue(iniValue), errorValue(errValue), - highStart(0), highValue(initialValue) -#ifdef UCPTRIE_DEBUG - , name("open") -#endif - { - if (U_FAILURE(errorCode)) { return; } - index = (uint32_t *)uprv_malloc(BMP_I_LIMIT * 4); - data = (uint32_t *)uprv_malloc(INITIAL_DATA_LENGTH * 4); - if (index == nullptr || data == nullptr) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - indexCapacity = BMP_I_LIMIT; - dataCapacity = INITIAL_DATA_LENGTH; -} - -MutableCodePointTrie::MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode) : - index3NullOffset(other.index3NullOffset), - dataNullOffset(other.dataNullOffset), - origInitialValue(other.origInitialValue), initialValue(other.initialValue), - errorValue(other.errorValue), - highStart(other.highStart), highValue(other.highValue) -#ifdef UCPTRIE_DEBUG - , name("mutable clone") -#endif - { - if (U_FAILURE(errorCode)) { return; } - int32_t iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT; - index = (uint32_t *)uprv_malloc(iCapacity * 4); - data = (uint32_t *)uprv_malloc(other.dataCapacity * 4); - if (index == nullptr || data == nullptr) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - indexCapacity = iCapacity; - dataCapacity = other.dataCapacity; - - int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; - uprv_memcpy(flags, other.flags, iLimit); - uprv_memcpy(index, other.index, iLimit * 4); - uprv_memcpy(data, other.data, (size_t)other.dataLength * 4); - dataLength = other.dataLength; - U_ASSERT(other.index16 == nullptr); -} - -MutableCodePointTrie::~MutableCodePointTrie() { - uprv_free(index); - uprv_free(data); - uprv_free(index16); -} - -MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) { - // Use the highValue as the initialValue to reduce the highStart. - uint32_t errorValue = ucpmap_get(map, -1); - uint32_t initialValue = ucpmap_get(map, 0x10ffff); - LocalPointer<MutableCodePointTrie> mutableTrie( - new MutableCodePointTrie(initialValue, errorValue, errorCode), - errorCode); - if (U_FAILURE(errorCode)) { - return nullptr; - } - UChar32 start = 0, end; - uint32_t value; - while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, - nullptr, nullptr, &value)) >= 0) { - if (value != initialValue) { - if (start == end) { - mutableTrie->set(start, value, errorCode); - } else { - mutableTrie->setRange(start, end, value, errorCode); - } - } - start = end + 1; - } - if (U_SUCCESS(errorCode)) { - return mutableTrie.orphan(); - } else { - return nullptr; - } -} - -MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) { - // Use the highValue as the initialValue to reduce the highStart. - uint32_t errorValue; - uint32_t initialValue; - switch (trie->valueWidth) { - case UCPTRIE_VALUE_BITS_16: - errorValue = trie->data.ptr16[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET]; - initialValue = trie->data.ptr16[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET]; - break; - case UCPTRIE_VALUE_BITS_32: - errorValue = trie->data.ptr32[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET]; - initialValue = trie->data.ptr32[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET]; - break; - case UCPTRIE_VALUE_BITS_8: - errorValue = trie->data.ptr8[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET]; - initialValue = trie->data.ptr8[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET]; - break; - default: - // Unreachable if the trie is properly initialized. - errorCode = U_ILLEGAL_ARGUMENT_ERROR; - return nullptr; - } - LocalPointer<MutableCodePointTrie> mutableTrie( - new MutableCodePointTrie(initialValue, errorValue, errorCode), - errorCode); - if (U_FAILURE(errorCode)) { - return nullptr; - } - UChar32 start = 0, end; - uint32_t value; - while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, - nullptr, nullptr, &value)) >= 0) { - if (value != initialValue) { - if (start == end) { - mutableTrie->set(start, value, errorCode); - } else { - mutableTrie->setRange(start, end, value, errorCode); - } - } - start = end + 1; - } - if (U_SUCCESS(errorCode)) { - return mutableTrie.orphan(); - } else { - return nullptr; - } -} - -void MutableCodePointTrie::clear() { - index3NullOffset = dataNullOffset = -1; - dataLength = 0; - highValue = initialValue = origInitialValue; - highStart = 0; - uprv_free(index16); - index16 = nullptr; -} - -uint32_t MutableCodePointTrie::get(UChar32 c) const { - if ((uint32_t)c > MAX_UNICODE) { - return errorValue; - } - if (c >= highStart) { - return highValue; - } - int32_t i = c >> UCPTRIE_SHIFT_3; - if (flags[i] == ALL_SAME) { - return index[i]; - } else { - return data[index[i] + (c & UCPTRIE_SMALL_DATA_MASK)]; - } -} - -inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue, - UCPMapValueFilter *filter, const void *context) { - if (value == initialValue) { - value = nullValue; - } else if (filter != nullptr) { - value = filter(context, value); - } - return value; -} - -UChar32 MutableCodePointTrie::getRange( - UChar32 start, UCPMapValueFilter *filter, const void *context, - uint32_t *pValue) const { - if ((uint32_t)start > MAX_UNICODE) { - return U_SENTINEL; - } - if (start >= highStart) { - if (pValue != nullptr) { - uint32_t value = highValue; - if (filter != nullptr) { value = filter(context, value); } - *pValue = value; - } - return MAX_UNICODE; - } - uint32_t nullValue = initialValue; - if (filter != nullptr) { nullValue = filter(context, nullValue); } - UChar32 c = start; - uint32_t trieValue, value; - bool haveValue = false; - int32_t i = c >> UCPTRIE_SHIFT_3; - do { - if (flags[i] == ALL_SAME) { - uint32_t trieValue2 = index[i]; - if (haveValue) { - if (trieValue2 != trieValue) { - if (filter == nullptr || - maybeFilterValue(trieValue2, initialValue, nullValue, - filter, context) != value) { - return c - 1; - } - trieValue = trieValue2; // may or may not help - } - } else { - trieValue = trieValue2; - value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); - if (pValue != nullptr) { *pValue = value; } - haveValue = true; - } - c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK; - } else /* MIXED */ { - int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK); - uint32_t trieValue2 = data[di]; - if (haveValue) { - if (trieValue2 != trieValue) { - if (filter == nullptr || - maybeFilterValue(trieValue2, initialValue, nullValue, - filter, context) != value) { - return c - 1; - } - trieValue = trieValue2; // may or may not help - } - } else { - trieValue = trieValue2; - value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); - if (pValue != nullptr) { *pValue = value; } - haveValue = true; - } - while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) { - trieValue2 = data[++di]; - if (trieValue2 != trieValue) { - if (filter == nullptr || - maybeFilterValue(trieValue2, initialValue, nullValue, - filter, context) != value) { - return c - 1; - } - } - trieValue = trieValue2; // may or may not help - } - } - ++i; - } while (c < highStart); - U_ASSERT(haveValue); - if (maybeFilterValue(highValue, initialValue, nullValue, - filter, context) != value) { - return c - 1; - } else { - return MAX_UNICODE; - } -} - -void -writeBlock(uint32_t *block, uint32_t value) { - uint32_t *limit = block + UCPTRIE_SMALL_DATA_BLOCK_LENGTH; - while (block < limit) { - *block++ = value; - } -} - -bool MutableCodePointTrie::ensureHighStart(UChar32 c) { - if (c >= highStart) { - // Round up to a UCPTRIE_CP_PER_INDEX_2_ENTRY boundary to simplify compaction. - c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1); - int32_t i = highStart >> UCPTRIE_SHIFT_3; - int32_t iLimit = c >> UCPTRIE_SHIFT_3; - if (iLimit > indexCapacity) { - uint32_t *newIndex = (uint32_t *)uprv_malloc(I_LIMIT * 4); - if (newIndex == nullptr) { return false; } - uprv_memcpy(newIndex, index, i * 4); - uprv_free(index); - index = newIndex; - indexCapacity = I_LIMIT; - } - do { - flags[i] = ALL_SAME; - index[i] = initialValue; - } while(++i < iLimit); - highStart = c; - } - return true; -} - -int32_t MutableCodePointTrie::allocDataBlock(int32_t blockLength) { - int32_t newBlock = dataLength; - int32_t newTop = newBlock + blockLength; - if (newTop > dataCapacity) { - int32_t capacity; - if (dataCapacity < MEDIUM_DATA_LENGTH) { - capacity = MEDIUM_DATA_LENGTH; - } else if (dataCapacity < MAX_DATA_LENGTH) { - capacity = MAX_DATA_LENGTH; - } else { - // Should never occur. - // Either MAX_DATA_LENGTH is incorrect, - // or the code writes more values than should be possible. - return -1; - } - uint32_t *newData = (uint32_t *)uprv_malloc(capacity * 4); - if (newData == nullptr) { - return -1; - } - uprv_memcpy(newData, data, (size_t)dataLength * 4); - uprv_free(data); - data = newData; - dataCapacity = capacity; - } - dataLength = newTop; - return newBlock; -} - -/** - * No error checking for illegal arguments. - * - * @return -1 if no new data block available (out of memory in data array) - * @internal - */ -int32_t MutableCodePointTrie::getDataBlock(int32_t i) { - if (flags[i] == MIXED) { - return index[i]; - } - if (i < BMP_I_LIMIT) { - int32_t newBlock = allocDataBlock(UCPTRIE_FAST_DATA_BLOCK_LENGTH); - if (newBlock < 0) { return newBlock; } - int32_t iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1); - int32_t iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; - do { - U_ASSERT(flags[iStart] == ALL_SAME); - writeBlock(data + newBlock, index[iStart]); - flags[iStart] = MIXED; - index[iStart++] = newBlock; - newBlock += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; - } while (iStart < iLimit); - return index[i]; - } else { - int32_t newBlock = allocDataBlock(UCPTRIE_SMALL_DATA_BLOCK_LENGTH); - if (newBlock < 0) { return newBlock; } - writeBlock(data + newBlock, index[i]); - flags[i] = MIXED; - index[i] = newBlock; - return newBlock; - } -} - -void MutableCodePointTrie::set(UChar32 c, uint32_t value, UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { - return; - } - if ((uint32_t)c > MAX_UNICODE) { - errorCode = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - - int32_t block; - if (!ensureHighStart(c) || (block = getDataBlock(c >> UCPTRIE_SHIFT_3)) < 0) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - - data[block + (c & UCPTRIE_SMALL_DATA_MASK)] = value; -} - -void -fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value) { - uint32_t *pLimit = block + limit; - block += start; - while (block < pLimit) { - *block++ = value; - } -} - -void MutableCodePointTrie::setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { - return; - } - if ((uint32_t)start > MAX_UNICODE || (uint32_t)end > MAX_UNICODE || start > end) { - errorCode = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - if (!ensureHighStart(end)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - - UChar32 limit = end + 1; - if (start & UCPTRIE_SMALL_DATA_MASK) { - // Set partial block at [start..following block boundary[. - int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3); - if (block < 0) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - - UChar32 nextStart = (start + UCPTRIE_SMALL_DATA_MASK) & ~UCPTRIE_SMALL_DATA_MASK; - if (nextStart <= limit) { - fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, - value); - start = nextStart; - } else { - fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, limit & UCPTRIE_SMALL_DATA_MASK, - value); - return; - } - } - - // Number of positions in the last, partial block. - int32_t rest = limit & UCPTRIE_SMALL_DATA_MASK; - - // Round down limit to a block boundary. - limit &= ~UCPTRIE_SMALL_DATA_MASK; - - // Iterate over all-value blocks. - while (start < limit) { - int32_t i = start >> UCPTRIE_SHIFT_3; - if (flags[i] == ALL_SAME) { - index[i] = value; - } else /* MIXED */ { - fillBlock(data + index[i], 0, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, value); - } - start += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; - } - - if (rest > 0) { - // Set partial block at [last block boundary..limit[. - int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3); - if (block < 0) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return; - } - - fillBlock(data + block, 0, rest, value); - } -} - -/* compaction --------------------------------------------------------------- */ - -void MutableCodePointTrie::maskValues(uint32_t mask) { - initialValue &= mask; - errorValue &= mask; - highValue &= mask; - int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; - for (int32_t i = 0; i < iLimit; ++i) { - if (flags[i] == ALL_SAME) { - index[i] &= mask; - } - } - for (int32_t i = 0; i < dataLength; ++i) { - data[i] &= mask; - } -} - -template<typename UIntA, typename UIntB> -bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) { - while (length > 0 && *s == *t) { - ++s; - ++t; - --length; - } - return length == 0; -} - -bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) { - const uint32_t *pLimit = p + length; - while (p < pLimit && *p == value) { ++p; } - return p == pLimit; -} - -/** Search for an identical block. */ -int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, - const uint16_t *q, int32_t qStart, int32_t blockLength) { - // Ensure that we do not even partially get past length. - length -= blockLength; - - q += qStart; - while (pStart <= length) { - if (equalBlocks(p + pStart, q, blockLength)) { - return pStart; - } - ++pStart; - } - return -1; -} - -int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit, - uint32_t value, int32_t blockLength) { - // Ensure that we do not even partially get past limit. - limit -= blockLength; - - for (int32_t block = start; block <= limit; ++block) { - if (p[block] == value) { - for (int32_t i = 1;; ++i) { - if (i == blockLength) { - return block; - } - if (p[block + i] != value) { - block += i; - break; - } - } - } - } - return -1; -} - -/** - * Look for maximum overlap of the beginning of the other block - * with the previous, adjacent block. - */ -template<typename UIntA, typename UIntB> -int32_t getOverlap(const UIntA *p, int32_t length, - const UIntB *q, int32_t qStart, int32_t blockLength) { - int32_t overlap = blockLength - 1; - U_ASSERT(overlap <= length); - q += qStart; - while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { - --overlap; - } - return overlap; -} - -int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value, - int32_t blockLength) { - int32_t min = length - (blockLength - 1); - int32_t i = length; - while (min < i && p[i - 1] == value) { --i; } - return length - i; -} - -bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) { - for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { - if (index[i] == dataOffset) { - return true; - } - } - return false; -} - -/** - * Finds the start of the last range in the trie by enumerating backward. - * Indexes for code points higher than this will be omitted. - */ -UChar32 MutableCodePointTrie::findHighStart() const { - int32_t i = highStart >> UCPTRIE_SHIFT_3; - while (i > 0) { - bool match; - if (flags[--i] == ALL_SAME) { - match = index[i] == highValue; - } else /* MIXED */ { - const uint32_t *p = data + index[i]; - for (int32_t j = 0;; ++j) { - if (j == UCPTRIE_SMALL_DATA_BLOCK_LENGTH) { - match = true; - break; - } - if (p[j] != highValue) { - match = false; - break; - } - } - } - if (!match) { - return (i + 1) << UCPTRIE_SHIFT_3; - } - } - return 0; -} - -class AllSameBlocks { -public: - static constexpr int32_t NEW_UNIQUE = -1; - static constexpr int32_t OVERFLOW = -2; - - AllSameBlocks() : length(0), mostRecent(-1) {} - - int32_t findOrAdd(int32_t index, int32_t count, uint32_t value) { - if (mostRecent >= 0 && values[mostRecent] == value) { - refCounts[mostRecent] += count; - return indexes[mostRecent]; - } - for (int32_t i = 0; i < length; ++i) { - if (values[i] == value) { - mostRecent = i; - refCounts[i] += count; - return indexes[i]; - } - } - if (length == CAPACITY) { - return OVERFLOW; - } - mostRecent = length; - indexes[length] = index; - values[length] = value; - refCounts[length++] = count; - return NEW_UNIQUE; - } - - /** Replaces the block which has the lowest reference count. */ - void add(int32_t index, int32_t count, uint32_t value) { - U_ASSERT(length == CAPACITY); - int32_t least = -1; - int32_t leastCount = I_LIMIT; - for (int32_t i = 0; i < length; ++i) { - U_ASSERT(values[i] != value); - if (refCounts[i] < leastCount) { - least = i; - leastCount = refCounts[i]; - } - } - U_ASSERT(least >= 0); - mostRecent = least; - indexes[least] = index; - values[least] = value; - refCounts[least] = count; - } - - int32_t findMostUsed() const { - if (length == 0) { return -1; } - int32_t max = -1; - int32_t maxCount = 0; - for (int32_t i = 0; i < length; ++i) { - if (refCounts[i] > maxCount) { - max = i; - maxCount = refCounts[i]; - } - } - return indexes[max]; - } - -private: - static constexpr int32_t CAPACITY = 32; - - int32_t length; - int32_t mostRecent; - - int32_t indexes[CAPACITY]; - uint32_t values[CAPACITY]; - int32_t refCounts[CAPACITY]; -}; - -// Custom hash table for mixed-value blocks to be found anywhere in the -// compacted data or index so far. -class MixedBlocks { -public: - MixedBlocks() {} - ~MixedBlocks() { - uprv_free(table); - } - - bool init(int32_t maxLength, int32_t newBlockLength) { - // We store actual data indexes + 1 to reserve 0 for empty entries. - int32_t maxDataIndex = maxLength - newBlockLength + 1; - int32_t newLength; - if (maxDataIndex <= 0xfff) { // 4k - newLength = 6007; - shift = 12; - mask = 0xfff; - } else if (maxDataIndex <= 0x7fff) { // 32k - newLength = 50021; - shift = 15; - mask = 0x7fff; - } else if (maxDataIndex <= 0x1ffff) { // 128k - newLength = 200003; - shift = 17; - mask = 0x1ffff; - } else { - // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M - newLength = 1500007; - shift = 21; - mask = 0x1fffff; - } - if (newLength > capacity) { - uprv_free(table); - table = (uint32_t *)uprv_malloc(newLength * 4); - if (table == nullptr) { - return false; - } - capacity = newLength; - } - length = newLength; - uprv_memset(table, 0, length * 4); - - blockLength = newBlockLength; - return true; - } - - template<typename UInt> - void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) { - int32_t start = prevDataLength - blockLength; - if (start >= minStart) { - ++start; // Skip the last block that we added last time. - } else { - start = minStart; // Begin with the first full block. - } - for (int32_t end = newDataLength - blockLength; start <= end; ++start) { - uint32_t hashCode = makeHashCode(data, start); - addEntry(data, start, hashCode, start); - } - } - - template<typename UIntA, typename UIntB> - int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const { - uint32_t hashCode = makeHashCode(blockData, blockStart); - int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode); - if (entryIndex >= 0) { - return (table[entryIndex] & mask) - 1; - } else { - return -1; - } - } - - int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const { - uint32_t hashCode = makeHashCode(blockValue); - int32_t entryIndex = findEntry(data, blockValue, hashCode); - if (entryIndex >= 0) { - return (table[entryIndex] & mask) - 1; - } else { - return -1; - } - } - -private: - template<typename UInt> - uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const { - int32_t blockLimit = blockStart + blockLength; - uint32_t hashCode = blockData[blockStart++]; - do { - hashCode = 37 * hashCode + blockData[blockStart++]; - } while (blockStart < blockLimit); - return hashCode; - } - - uint32_t makeHashCode(uint32_t blockValue) const { - uint32_t hashCode = blockValue; - for (int32_t i = 1; i < blockLength; ++i) { - hashCode = 37 * hashCode + blockValue; - } - return hashCode; - } - - template<typename UInt> - void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) { - U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask); - int32_t entryIndex = findEntry(data, data, blockStart, hashCode); - if (entryIndex < 0) { - table[~entryIndex] = (hashCode << shift) | (dataIndex + 1); - } - } - - template<typename UIntA, typename UIntB> - int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart, - uint32_t hashCode) const { - uint32_t shiftedHashCode = hashCode << shift; - int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 - for (int32_t entryIndex = initialEntryIndex;;) { - uint32_t entry = table[entryIndex]; - if (entry == 0) { - return ~entryIndex; - } - if ((entry & ~mask) == shiftedHashCode) { - int32_t dataIndex = (entry & mask) - 1; - if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) { - return entryIndex; - } - } - entryIndex = nextIndex(initialEntryIndex, entryIndex); - } - } - - int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const { - uint32_t shiftedHashCode = hashCode << shift; - int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 - for (int32_t entryIndex = initialEntryIndex;;) { - uint32_t entry = table[entryIndex]; - if (entry == 0) { - return ~entryIndex; - } - if ((entry & ~mask) == shiftedHashCode) { - int32_t dataIndex = (entry & mask) - 1; - if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) { - return entryIndex; - } - } - entryIndex = nextIndex(initialEntryIndex, entryIndex); - } - } - - inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const { - // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length); - return (entryIndex + initialEntryIndex) % length; - } - - // Hash table. - // The length is a prime number, larger than the maximum data length. - // The "shift" lower bits store a data index + 1. - // The remaining upper bits store a partial hashCode of the block data values. - uint32_t *table = nullptr; - int32_t capacity = 0; - int32_t length = 0; - int32_t shift = 0; - uint32_t mask = 0; - - int32_t blockLength = 0; -}; - -int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) { -#ifdef UCPTRIE_DEBUG - bool overflow = false; -#endif - - // ASCII data will be stored as a linear table, even if the following code - // does not yet count it that way. - int32_t newDataCapacity = ASCII_LIMIT; - // Add room for a small data null block in case it would match the start of - // a fast data block where dataNullOffset must not be set in that case. - newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; - // Add room for special values (errorValue, highValue) and padding. - newDataCapacity += 4; - int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; - int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; - int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; - for (int32_t i = 0; i < iLimit; i += inc) { - if (i == fastILimit) { - blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; - inc = 1; - } - uint32_t value = index[i]; - if (flags[i] == MIXED) { - // Really mixed? - const uint32_t *p = data + value; - value = *p; - if (allValuesSameAs(p + 1, blockLength - 1, value)) { - flags[i] = ALL_SAME; - index[i] = value; - // Fall through to ALL_SAME handling. - } else { - newDataCapacity += blockLength; - continue; - } - } else { - U_ASSERT(flags[i] == ALL_SAME); - if (inc > 1) { - // Do all of the fast-range data block's ALL_SAME parts have the same value? - bool allSame = true; - int32_t next_i = i + inc; - for (int32_t j = i + 1; j < next_i; ++j) { - U_ASSERT(flags[j] == ALL_SAME); - if (index[j] != value) { - allSame = false; - break; - } - } - if (!allSame) { - // Turn it into a MIXED block. - if (getDataBlock(i) < 0) { - return -1; - } - newDataCapacity += blockLength; - continue; - } - } - } - // Is there another ALL_SAME block with the same value? - int32_t other = allSameBlocks.findOrAdd(i, inc, value); - if (other == AllSameBlocks::OVERFLOW) { - // The fixed-size array overflowed. Slow check for a duplicate block. -#ifdef UCPTRIE_DEBUG - if (!overflow) { - puts("UCPTrie AllSameBlocks overflow"); - overflow = true; - } -#endif - int32_t jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; - for (int32_t j = 0;; j += jInc) { - if (j == i) { - allSameBlocks.add(i, inc, value); - break; - } - if (j == fastILimit) { - jInc = 1; - } - if (flags[j] == ALL_SAME && index[j] == value) { - allSameBlocks.add(j, jInc + inc, value); - other = j; - break; - // We could keep counting blocks with the same value - // before we add the first one, which may improve compaction in rare cases, - // but it would make it slower. - } - } - } - if (other >= 0) { - flags[i] = SAME_AS; - index[i] = other; - } else { - // New unique same-value block. - newDataCapacity += blockLength; - } - } - return newDataCapacity; -} - -#ifdef UCPTRIE_DEBUG -# define DEBUG_DO(expr) expr -#else -# define DEBUG_DO(expr) -#endif - -#ifdef UCPTRIE_DEBUG -// Braille symbols: U+28xx = UTF-8 E2 A0 80..E2 A3 BF -int32_t appendValue(char s[], int32_t length, uint32_t value) { - value ^= value >> 16; - value ^= value >> 8; - s[length] = 0xE2; - s[length + 1] = (char)(0xA0 + ((value >> 6) & 3)); - s[length + 2] = (char)(0x80 + (value & 0x3F)); - return length + 3; -} - -void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value, - UChar32 start, int32_t overlap, uint32_t initialValue) { - char s[UCPTRIE_FAST_DATA_BLOCK_LENGTH * 3 + 3]; - int32_t length = 0; - int32_t i; - for (i = 0; i < overlap; ++i) { - length = appendValue(s, length, 0); // Braille blank - } - s[length++] = '|'; - for (; i < blockLength; ++i) { - if (block != nullptr) { - value = block[i]; - } - if (value == initialValue) { - value = 0x40; // Braille lower left dot - } - length = appendValue(s, length, value); - } - s[length] = 0; - start += overlap; - if (start <= 0xffff) { - printf(" %04lX %s|\n", (long)start, s); - } else if (start <= 0xfffff) { - printf(" %5lX %s|\n", (long)start, s); - } else { - printf(" %6lX %s|\n", (long)start, s); - } -} -#endif - -/** - * Compacts a build-time trie. - * - * The compaction - * - removes blocks that are identical with earlier ones - * - overlaps each new non-duplicate block as much as possible with the previously-written one - * - works with fast-range data blocks whose length is a multiple of that of - * higher-code-point data blocks - * - * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. - */ -int32_t MutableCodePointTrie::compactData( - int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, - int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) { -#ifdef UCPTRIE_DEBUG - int32_t countSame=0, sumOverlaps=0; - bool printData = dataLength == 29088 /* line.brk */ || - // dataLength == 30048 /* CanonIterData */ || - dataLength == 50400 /* zh.txt~stroke */; -#endif - - // The linear ASCII data has been copied into newData already. - int32_t newDataLength = 0; - for (int32_t i = 0; newDataLength < ASCII_LIMIT; - newDataLength += UCPTRIE_FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { - index[i] = newDataLength; -#ifdef UCPTRIE_DEBUG - if (printData) { - printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue); - } -#endif - } - - int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; - if (!mixedBlocks.init(newDataCapacity, blockLength)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - mixedBlocks.extend(newData, 0, 0, newDataLength); - - int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; - int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; - int32_t fastLength = 0; - for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) { - if (i == fastILimit) { - blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; - inc = 1; - fastLength = newDataLength; - if (!mixedBlocks.init(newDataCapacity, blockLength)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - mixedBlocks.extend(newData, 0, 0, newDataLength); - } - if (flags[i] == ALL_SAME) { - uint32_t value = index[i]; - // Find an earlier part of the data array of length blockLength - // that is filled with this value. - int32_t n = mixedBlocks.findAllSameBlock(newData, value); - // If we find a match, and the current block is the data null block, - // and it is not a fast block but matches the start of a fast block, - // then we need to continue looking. - // This is because this small block is shorter than the fast block, - // and not all of the rest of the fast block is filled with this value. - // Otherwise trie.getRange() would detect that the fast block starts at - // dataNullOffset and assume incorrectly that it is filled with the null value. - while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength && - isStartOfSomeFastBlock(n, index, fastILimit)) { - n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength); - } - if (n >= 0) { - DEBUG_DO(++countSame); - index[i] = n; - } else { - n = getAllSameOverlap(newData, newDataLength, value, blockLength); - DEBUG_DO(sumOverlaps += n); -#ifdef UCPTRIE_DEBUG - if (printData) { - printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue); - } -#endif - index[i] = newDataLength - n; - int32_t prevDataLength = newDataLength; - while (n < blockLength) { - newData[newDataLength++] = value; - ++n; - } - mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); - } - } else if (flags[i] == MIXED) { - const uint32_t *block = data + index[i]; - int32_t n = mixedBlocks.findBlock(newData, block, 0); - if (n >= 0) { - DEBUG_DO(++countSame); - index[i] = n; - } else { - n = getOverlap(newData, newDataLength, block, 0, blockLength); - DEBUG_DO(sumOverlaps += n); -#ifdef UCPTRIE_DEBUG - if (printData) { - printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue); - } -#endif - index[i] = newDataLength - n; - int32_t prevDataLength = newDataLength; - while (n < blockLength) { - newData[newDataLength++] = block[n++]; - } - mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); - } - } else /* SAME_AS */ { - uint32_t j = index[i]; - index[i] = index[j]; - } - } - -#ifdef UCPTRIE_DEBUG - /* we saved some space */ - printf("compacting UCPTrie: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n", - (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps); -#endif - return newDataLength; -} - -int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, - UErrorCode &errorCode) { - int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3); - if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) { - // Only the linear fast index, no multi-stage index tables. - index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET; - return fastIndexLength; - } - - // Condense the fast index table. - // Also, does it contain an index-3 block with all dataNullOffset? - uint16_t fastIndex[UCPTRIE_BMP_INDEX_LENGTH]; // fastIndexLength - int32_t i3FirstNull = -1; - for (int32_t i = 0, j = 0; i < fastILimit; ++j) { - uint32_t i3 = index[i]; - fastIndex[j] = (uint16_t)i3; - if (i3 == (uint32_t)dataNullOffset) { - if (i3FirstNull < 0) { - i3FirstNull = j; - } else if (index3NullOffset < 0 && - (j - i3FirstNull + 1) == UCPTRIE_INDEX_3_BLOCK_LENGTH) { - index3NullOffset = i3FirstNull; - } - } else { - i3FirstNull = -1; - } - // Set the index entries that compactData() skipped. - // Needed when the multi-stage index covers the fast index range as well. - int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; - while (++i < iNext) { - i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; - index[i] = i3; - } - } - - if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength); - - // Examine index-3 blocks. For each determine one of: - // - same as the index-3 null block - // - same as a fast-index block - // - 16-bit indexes - // - 18-bit indexes - // We store this in the first flags entry for the index-3 block. - // - // Also determine an upper limit for the index-3 table length. - int32_t index3Capacity = 0; - i3FirstNull = index3NullOffset; - bool hasLongI3Blocks = false; - // If the fast index covers the whole BMP, then - // the multi-stage index is only for supplementary code points. - // Otherwise, the multi-stage index covers all of Unicode. - int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT; - int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; - for (int32_t i = iStart; i < iLimit;) { - int32_t j = i; - int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; - uint32_t oredI3 = 0; - bool isNull = true; - do { - uint32_t i3 = index[j]; - oredI3 |= i3; - if (i3 != (uint32_t)dataNullOffset) { - isNull = false; - } - } while (++j < jLimit); - if (isNull) { - flags[i] = I3_NULL; - if (i3FirstNull < 0) { - if (oredI3 <= 0xffff) { - index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; - } else { - index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; - hasLongI3Blocks = true; - } - i3FirstNull = 0; - } - } else { - if (oredI3 <= 0xffff) { - int32_t n = mixedBlocks.findBlock(fastIndex, index, i); - if (n >= 0) { - flags[i] = I3_BMP; - index[i] = n; - } else { - flags[i] = I3_16; - index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; - } - } else { - flags[i] = I3_18; - index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; - hasLongI3Blocks = true; - } - } - i = j; - } - - int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3; - - // Length of the index-1 table, rounded up. - int32_t index1Length = (index2Capacity + UCPTRIE_INDEX_2_MASK) >> UCPTRIE_SHIFT_1_2; - - // Index table: Fast index, index-1, index-3, index-2. - // +1 for possible index table padding. - int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1; - index16 = (uint16_t *)uprv_malloc(index16Capacity * 2); - if (index16 == nullptr) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - uprv_memcpy(index16, fastIndex, fastIndexLength * 2); - - if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - MixedBlocks longI3Blocks; - if (hasLongI3Blocks) { - if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - } - - // Compact the index-3 table and write an uncompacted version of the index-2 table. - uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity - int32_t i2Length = 0; - i3FirstNull = index3NullOffset; - int32_t index3Start = fastIndexLength + index1Length; - int32_t indexLength = index3Start; - for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) { - int32_t i3; - uint8_t f = flags[i]; - if (f == I3_NULL && i3FirstNull < 0) { - // First index-3 null block. Write & overlap it like a normal block, then remember it. - f = dataNullOffset <= 0xffff ? I3_16 : I3_18; - i3FirstNull = 0; - } - if (f == I3_NULL) { - i3 = index3NullOffset; - } else if (f == I3_BMP) { - i3 = index[i]; - } else if (f == I3_16) { - int32_t n = mixedBlocks.findBlock(index16, index, i); - if (n >= 0) { - i3 = n; - } else { - if (indexLength == index3Start) { - // No overlap at the boundary between the index-1 and index-3 tables. - n = 0; - } else { - n = getOverlap(index16, indexLength, - index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); - } - i3 = indexLength - n; - int32_t prevIndexLength = indexLength; - while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) { - index16[indexLength++] = index[i + n++]; - } - mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); - if (hasLongI3Blocks) { - longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); - } - } - } else { - U_ASSERT(f == I3_18); - U_ASSERT(hasLongI3Blocks); - // Encode an index-3 block that contains one or more data indexes exceeding 16 bits. - int32_t j = i; - int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; - int32_t k = indexLength; - do { - ++k; - uint32_t v = index[j++]; - uint32_t upperBits = (v & 0x30000) >> 2; - index16[k++] = v; - v = index[j++]; - upperBits |= (v & 0x30000) >> 4; - index16[k++] = v; - v = index[j++]; - upperBits |= (v & 0x30000) >> 6; - index16[k++] = v; - v = index[j++]; - upperBits |= (v & 0x30000) >> 8; - index16[k++] = v; - v = index[j++]; - upperBits |= (v & 0x30000) >> 10; - index16[k++] = v; - v = index[j++]; - upperBits |= (v & 0x30000) >> 12; - index16[k++] = v; - v = index[j++]; - upperBits |= (v & 0x30000) >> 14; - index16[k++] = v; - v = index[j++]; - upperBits |= (v & 0x30000) >> 16; - index16[k++] = v; - index16[k - 9] = upperBits; - } while (j < jLimit); - int32_t n = longI3Blocks.findBlock(index16, index16, indexLength); - if (n >= 0) { - i3 = n | 0x8000; - } else { - if (indexLength == index3Start) { - // No overlap at the boundary between the index-1 and index-3 tables. - n = 0; - } else { - n = getOverlap(index16, indexLength, - index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); - } - i3 = (indexLength - n) | 0x8000; - int32_t prevIndexLength = indexLength; - if (n > 0) { - int32_t start = indexLength; - while (n < INDEX_3_18BIT_BLOCK_LENGTH) { - index16[indexLength++] = index16[start + n++]; - } - } else { - indexLength += INDEX_3_18BIT_BLOCK_LENGTH; - } - mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); - if (hasLongI3Blocks) { - longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); - } - } - } - if (index3NullOffset < 0 && i3FirstNull >= 0) { - index3NullOffset = i3; - } - // Set the index-2 table entry. - index2[i2Length++] = i3; - } - U_ASSERT(i2Length == index2Capacity); - U_ASSERT(indexLength <= index3Start + index3Capacity); - - if (index3NullOffset < 0) { - index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET; - } - if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) { - // The index-3 offsets exceed 15 bits, or - // the last one cannot be distinguished from the no-null-block value. - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - - // Compact the index-2 table and write the index-1 table. - static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH, - "must re-init mixedBlocks"); - int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH; - int32_t i1 = fastIndexLength; - for (int32_t i = 0; i < i2Length; i += blockLength) { - int32_t n; - if ((i2Length - i) >= blockLength) { - // normal block - U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH); - n = mixedBlocks.findBlock(index16, index2, i); - } else { - // highStart is inside the last index-2 block. Shorten it. - blockLength = i2Length - i; - n = findSameBlock(index16, index3Start, indexLength, - index2, i, blockLength); - } - int32_t i2; - if (n >= 0) { - i2 = n; - } else { - if (indexLength == index3Start) { - // No overlap at the boundary between the index-1 and index-3/2 tables. - n = 0; - } else { - n = getOverlap(index16, indexLength, index2, i, blockLength); - } - i2 = indexLength - n; - int32_t prevIndexLength = indexLength; - while (n < blockLength) { - index16[indexLength++] = index2[i + n++]; - } - mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); - } - // Set the index-1 table entry. - index16[i1++] = i2; - } - U_ASSERT(i1 == index3Start); - U_ASSERT(indexLength <= index16Capacity); - -#ifdef UCPTRIE_DEBUG - /* we saved some space */ - printf("compacting UCPTrie: count of 16-bit index words %lu->%lu\n", - (long)iLimit, (long)indexLength); -#endif - - return indexLength; -} - -int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorCode) { - // Find the real highStart and round it up. - U_ASSERT((highStart & (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) == 0); - highValue = get(MAX_UNICODE); - int32_t realHighStart = findHighStart(); - realHighStart = (realHighStart + (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) & - ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1); - if (realHighStart == UNICODE_LIMIT) { - highValue = initialValue; - } - -#ifdef UCPTRIE_DEBUG - printf("UCPTrie: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n", - (long)realHighStart, (long)highValue, (long)initialValue); -#endif - - // We always store indexes and data values for the fast range. - // Pin highStart to the top of that range while building. - UChar32 fastLimit = fastILimit << UCPTRIE_SHIFT_3; - if (realHighStart < fastLimit) { - for (int32_t i = (realHighStart >> UCPTRIE_SHIFT_3); i < fastILimit; ++i) { - flags[i] = ALL_SAME; - index[i] = highValue; - } - highStart = fastLimit; - } else { - highStart = realHighStart; - } - - uint32_t asciiData[ASCII_LIMIT]; - for (int32_t i = 0; i < ASCII_LIMIT; ++i) { - asciiData[i] = get(i); - } - - // First we look for which data blocks have the same value repeated over the whole block, - // deduplicate such blocks, find a good null data block (for faster enumeration), - // and get an upper bound for the necessary data array length. - AllSameBlocks allSameBlocks; - int32_t newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks); - if (newDataCapacity < 0) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - uint32_t *newData = (uint32_t *)uprv_malloc(newDataCapacity * 4); - if (newData == nullptr) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - uprv_memcpy(newData, asciiData, sizeof(asciiData)); - - int32_t dataNullIndex = allSameBlocks.findMostUsed(); - - MixedBlocks mixedBlocks; - int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity, - dataNullIndex, mixedBlocks, errorCode); - if (U_FAILURE(errorCode)) { return 0; } - U_ASSERT(newDataLength <= newDataCapacity); - uprv_free(data); - data = newData; - dataCapacity = newDataCapacity; - dataLength = newDataLength; - if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) { - // The offset of the last data block is too high to be stored in the index table. - errorCode = U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - - if (dataNullIndex >= 0) { - dataNullOffset = index[dataNullIndex]; -#ifdef UCPTRIE_DEBUG - if (data[dataNullOffset] != initialValue) { - printf("UCPTrie initialValue %lx -> more common nullValue %lx\n", - (long)initialValue, (long)data[dataNullOffset]); - } -#endif - initialValue = data[dataNullOffset]; - } else { - dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET; - } - - int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode); - highStart = realHighStart; - return indexLength; -} - -UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { - return nullptr; - } - if (type < UCPTRIE_TYPE_FAST || UCPTRIE_TYPE_SMALL < type || - valueWidth < UCPTRIE_VALUE_BITS_16 || UCPTRIE_VALUE_BITS_8 < valueWidth) { - errorCode = U_ILLEGAL_ARGUMENT_ERROR; - return nullptr; - } - - // The mutable trie always stores 32-bit values. - // When we build a UCPTrie for a smaller value width, we first mask off unused bits - // before compacting the data. - switch (valueWidth) { - case UCPTRIE_VALUE_BITS_32: - break; - case UCPTRIE_VALUE_BITS_16: - maskValues(0xffff); - break; - case UCPTRIE_VALUE_BITS_8: - maskValues(0xff); - break; - default: - break; - } - - UChar32 fastLimit = type == UCPTRIE_TYPE_FAST ? BMP_LIMIT : UCPTRIE_SMALL_LIMIT; - int32_t indexLength = compactTrie(fastLimit >> UCPTRIE_SHIFT_3, errorCode); - if (U_FAILURE(errorCode)) { - clear(); - return nullptr; - } - - // Ensure data table alignment: The index length must be even for uint32_t data. - if (valueWidth == UCPTRIE_VALUE_BITS_32 && (indexLength & 1) != 0) { - index16[indexLength++] = 0xffee; // arbitrary value - } - - // Make the total trie structure length a multiple of 4 bytes by padding the data table, - // and store special values as the last two data values. - int32_t length = indexLength * 2; - if (valueWidth == UCPTRIE_VALUE_BITS_16) { - if (((indexLength ^ dataLength) & 1) != 0) { - // padding - data[dataLength++] = errorValue; - } - if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { - data[dataLength++] = highValue; - data[dataLength++] = errorValue; - } - length += dataLength * 2; - } else if (valueWidth == UCPTRIE_VALUE_BITS_32) { - // 32-bit data words never need padding to a multiple of 4 bytes. - if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { - if (data[dataLength - 1] != highValue) { - data[dataLength++] = highValue; - } - data[dataLength++] = errorValue; - } - length += dataLength * 4; - } else { - int32_t and3 = (length + dataLength) & 3; - if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) { - // all set - } else if(and3 == 3 && data[dataLength - 1] == highValue) { - data[dataLength++] = errorValue; - } else { - while (and3 != 2) { - data[dataLength++] = highValue; - and3 = (and3 + 1) & 3; - } - data[dataLength++] = highValue; - data[dataLength++] = errorValue; - } - length += dataLength; - } - - // Calculate the total length of the UCPTrie as a single memory block. - length += sizeof(UCPTrie); - U_ASSERT((length & 3) == 0); - - uint8_t *bytes = (uint8_t *)uprv_malloc(length); - if (bytes == nullptr) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - clear(); - return nullptr; - } - UCPTrie *trie = reinterpret_cast<UCPTrie *>(bytes); - uprv_memset(trie, 0, sizeof(UCPTrie)); - trie->indexLength = indexLength; - trie->dataLength = dataLength; - - trie->highStart = highStart; - // Round up shifted12HighStart to a multiple of 0x1000 for easy testing from UTF-8 lead bytes. - // Runtime code needs to then test for the real highStart as well. - trie->shifted12HighStart = (highStart + 0xfff) >> 12; - trie->type = type; - trie->valueWidth = valueWidth; - - trie->index3NullOffset = index3NullOffset; - trie->dataNullOffset = dataNullOffset; - trie->nullValue = initialValue; - - bytes += sizeof(UCPTrie); - - // Fill the index and data arrays. - uint16_t *dest16 = (uint16_t *)bytes; - trie->index = dest16; - - if (highStart <= fastLimit) { - // Condense only the fast index from the mutable-trie index. - for (int32_t i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) { - *dest16++ = (uint16_t)index[i]; // dest16[j] - } - } else { - uprv_memcpy(dest16, index16, indexLength * 2); - dest16 += indexLength; - } - bytes += indexLength * 2; - - // Write the data array. - const uint32_t *p = data; - switch (valueWidth) { - case UCPTRIE_VALUE_BITS_16: - // Write 16-bit data values. - trie->data.ptr16 = dest16; - for (int32_t i = dataLength; i > 0; --i) { - *dest16++ = (uint16_t)*p++; - } - break; - case UCPTRIE_VALUE_BITS_32: - // Write 32-bit data values. - trie->data.ptr32 = (uint32_t *)bytes; - uprv_memcpy(bytes, p, (size_t)dataLength * 4); - break; - case UCPTRIE_VALUE_BITS_8: - // Write 8-bit data values. - trie->data.ptr8 = bytes; - for (int32_t i = dataLength; i > 0; --i) { - *bytes++ = (uint8_t)*p++; - } - break; - default: - // Will not occur, valueWidth checked at the beginning. - break; - } - -#ifdef UCPTRIE_DEBUG - trie->name = name; - - ucptrie_printLengths(trie, ""); -#endif - - clear(); - return trie; -} - -} // namespace - -U_NAMESPACE_END - -U_NAMESPACE_USE - -U_CAPI UMutableCPTrie * U_EXPORT2 -umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { - if (U_FAILURE(*pErrorCode)) { - return nullptr; - } - LocalPointer<MutableCodePointTrie> trie( - new MutableCodePointTrie(initialValue, errorValue, *pErrorCode), *pErrorCode); - if (U_FAILURE(*pErrorCode)) { - return nullptr; - } - return reinterpret_cast<UMutableCPTrie *>(trie.orphan()); -} - -U_CAPI UMutableCPTrie * U_EXPORT2 -umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode) { - if (U_FAILURE(*pErrorCode)) { - return nullptr; - } - if (other == nullptr) { - return nullptr; - } - LocalPointer<MutableCodePointTrie> clone( - new MutableCodePointTrie(*reinterpret_cast<const MutableCodePointTrie *>(other), *pErrorCode), *pErrorCode); - if (U_FAILURE(*pErrorCode)) { - return nullptr; - } - return reinterpret_cast<UMutableCPTrie *>(clone.orphan()); -} - -U_CAPI void U_EXPORT2 -umutablecptrie_close(UMutableCPTrie *trie) { - delete reinterpret_cast<MutableCodePointTrie *>(trie); -} - -U_CAPI UMutableCPTrie * U_EXPORT2 -umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) { - if (U_FAILURE(*pErrorCode)) { - return nullptr; - } - if (map == nullptr) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return nullptr; - } - return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPMap(map, *pErrorCode)); -} - -U_CAPI UMutableCPTrie * U_EXPORT2 -umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) { - if (U_FAILURE(*pErrorCode)) { - return nullptr; - } - if (trie == nullptr) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return nullptr; - } - return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPTrie(trie, *pErrorCode)); -} - -U_CAPI uint32_t U_EXPORT2 -umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) { - return reinterpret_cast<const MutableCodePointTrie *>(trie)->get(c); -} - -namespace { - -UChar32 getRange(const void *trie, UChar32 start, - UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { - return reinterpret_cast<const MutableCodePointTrie *>(trie)-> - getRange(start, filter, context, pValue); -} - -} // namespace - -U_CAPI UChar32 U_EXPORT2 -umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start, - UCPMapRangeOption option, uint32_t surrogateValue, - UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { - return ucptrie_internalGetRange(getRange, trie, start, - option, surrogateValue, - filter, context, pValue); -} - -U_CAPI void U_EXPORT2 -umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { - if (U_FAILURE(*pErrorCode)) { - return; - } - reinterpret_cast<MutableCodePointTrie *>(trie)->set(c, value, *pErrorCode); -} - -U_CAPI void U_EXPORT2 -umutablecptrie_setRange(UMutableCPTrie *trie, UChar32 start, UChar32 end, - uint32_t value, UErrorCode *pErrorCode) { - if (U_FAILURE(*pErrorCode)) { - return; - } - reinterpret_cast<MutableCodePointTrie *>(trie)->setRange(start, end, value, *pErrorCode); -} - -/* Compact and internally serialize the trie. */ -U_CAPI UCPTrie * U_EXPORT2 -umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth, - UErrorCode *pErrorCode) { - if (U_FAILURE(*pErrorCode)) { - return nullptr; - } - return reinterpret_cast<MutableCodePointTrie *>(trie)->build(type, valueWidth, *pErrorCode); -} - -#ifdef UCPTRIE_DEBUG -U_CFUNC void umutablecptrie_setName(UMutableCPTrie *trie, const char *name) { - reinterpret_cast<MutableCodePointTrie *>(trie)->name = name; -} -#endif +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// umutablecptrie.cpp (inspired by utrie2_builder.cpp) +// created: 2017dec29 Markus W. Scherer + +// #define UCPTRIE_DEBUG +#ifdef UCPTRIE_DEBUG +# include <stdio.h> +#endif + +#include "unicode/utypes.h" +#include "unicode/ucptrie.h" +#include "unicode/umutablecptrie.h" +#include "unicode/uobject.h" +#include "unicode/utf16.h" +#include "cmemory.h" +#include "uassert.h" +#include "ucptrie_impl.h" + +// ICU-20235 In case Microsoft math.h has defined this, undefine it. +#ifdef OVERFLOW +#undef OVERFLOW +#endif + +U_NAMESPACE_BEGIN + +namespace { + +constexpr int32_t MAX_UNICODE = 0x10ffff; + +constexpr int32_t UNICODE_LIMIT = 0x110000; +constexpr int32_t BMP_LIMIT = 0x10000; +constexpr int32_t ASCII_LIMIT = 0x80; + +constexpr int32_t I_LIMIT = UNICODE_LIMIT >> UCPTRIE_SHIFT_3; +constexpr int32_t BMP_I_LIMIT = BMP_LIMIT >> UCPTRIE_SHIFT_3; +constexpr int32_t ASCII_I_LIMIT = ASCII_LIMIT >> UCPTRIE_SHIFT_3; + +constexpr int32_t SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3)); + +// Flag values for data blocks. +constexpr uint8_t ALL_SAME = 0; +constexpr uint8_t MIXED = 1; +constexpr uint8_t SAME_AS = 2; + +/** Start with allocation of 16k data entries. */ +constexpr int32_t INITIAL_DATA_LENGTH = ((int32_t)1 << 14); + +/** Grow about 8x each time. */ +constexpr int32_t MEDIUM_DATA_LENGTH = ((int32_t)1 << 17); + +/** + * Maximum length of the build-time data array. + * One entry per 0x110000 code points. + */ +constexpr int32_t MAX_DATA_LENGTH = UNICODE_LIMIT; + +// Flag values for index-3 blocks while compacting/building. +constexpr uint8_t I3_NULL = 0; +constexpr uint8_t I3_BMP = 1; +constexpr uint8_t I3_16 = 2; +constexpr uint8_t I3_18 = 3; + +constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8; + +class AllSameBlocks; +class MixedBlocks; + +class MutableCodePointTrie : public UMemory { +public: + MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode); + MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode); + MutableCodePointTrie(const MutableCodePointTrie &other) = delete; + ~MutableCodePointTrie(); + + MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete; + + static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode); + static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode); + + uint32_t get(UChar32 c) const; + int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context, + uint32_t *pValue) const; + + void set(UChar32 c, uint32_t value, UErrorCode &errorCode); + void setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode); + + UCPTrie *build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode); + +private: + void clear(); + + bool ensureHighStart(UChar32 c); + int32_t allocDataBlock(int32_t blockLength); + int32_t getDataBlock(int32_t i); + + void maskValues(uint32_t mask); + UChar32 findHighStart() const; + int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks); + int32_t compactData( + int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, + int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode); + int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode); + int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode); + + uint32_t *index = nullptr; + int32_t indexCapacity = 0; + int32_t index3NullOffset = -1; + uint32_t *data = nullptr; + int32_t dataCapacity = 0; + int32_t dataLength = 0; + int32_t dataNullOffset = -1; + + uint32_t origInitialValue; + uint32_t initialValue; + uint32_t errorValue; + UChar32 highStart; + uint32_t highValue; +#ifdef UCPTRIE_DEBUG +public: + const char *name; +#endif +private: + /** Temporary array while building the final data. */ + uint16_t *index16 = nullptr; + uint8_t flags[UNICODE_LIMIT >> UCPTRIE_SHIFT_3]; +}; + +MutableCodePointTrie::MutableCodePointTrie(uint32_t iniValue, uint32_t errValue, UErrorCode &errorCode) : + origInitialValue(iniValue), initialValue(iniValue), errorValue(errValue), + highStart(0), highValue(initialValue) +#ifdef UCPTRIE_DEBUG + , name("open") +#endif + { + if (U_FAILURE(errorCode)) { return; } + index = (uint32_t *)uprv_malloc(BMP_I_LIMIT * 4); + data = (uint32_t *)uprv_malloc(INITIAL_DATA_LENGTH * 4); + if (index == nullptr || data == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + indexCapacity = BMP_I_LIMIT; + dataCapacity = INITIAL_DATA_LENGTH; +} + +MutableCodePointTrie::MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode) : + index3NullOffset(other.index3NullOffset), + dataNullOffset(other.dataNullOffset), + origInitialValue(other.origInitialValue), initialValue(other.initialValue), + errorValue(other.errorValue), + highStart(other.highStart), highValue(other.highValue) +#ifdef UCPTRIE_DEBUG + , name("mutable clone") +#endif + { + if (U_FAILURE(errorCode)) { return; } + int32_t iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT; + index = (uint32_t *)uprv_malloc(iCapacity * 4); + data = (uint32_t *)uprv_malloc(other.dataCapacity * 4); + if (index == nullptr || data == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + indexCapacity = iCapacity; + dataCapacity = other.dataCapacity; + + int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; + uprv_memcpy(flags, other.flags, iLimit); + uprv_memcpy(index, other.index, iLimit * 4); + uprv_memcpy(data, other.data, (size_t)other.dataLength * 4); + dataLength = other.dataLength; + U_ASSERT(other.index16 == nullptr); +} + +MutableCodePointTrie::~MutableCodePointTrie() { + uprv_free(index); + uprv_free(data); + uprv_free(index16); +} + +MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) { + // Use the highValue as the initialValue to reduce the highStart. + uint32_t errorValue = ucpmap_get(map, -1); + uint32_t initialValue = ucpmap_get(map, 0x10ffff); + LocalPointer<MutableCodePointTrie> mutableTrie( + new MutableCodePointTrie(initialValue, errorValue, errorCode), + errorCode); + if (U_FAILURE(errorCode)) { + return nullptr; + } + UChar32 start = 0, end; + uint32_t value; + while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, + nullptr, nullptr, &value)) >= 0) { + if (value != initialValue) { + if (start == end) { + mutableTrie->set(start, value, errorCode); + } else { + mutableTrie->setRange(start, end, value, errorCode); + } + } + start = end + 1; + } + if (U_SUCCESS(errorCode)) { + return mutableTrie.orphan(); + } else { + return nullptr; + } +} + +MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) { + // Use the highValue as the initialValue to reduce the highStart. + uint32_t errorValue; + uint32_t initialValue; + switch (trie->valueWidth) { + case UCPTRIE_VALUE_BITS_16: + errorValue = trie->data.ptr16[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET]; + initialValue = trie->data.ptr16[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET]; + break; + case UCPTRIE_VALUE_BITS_32: + errorValue = trie->data.ptr32[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET]; + initialValue = trie->data.ptr32[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET]; + break; + case UCPTRIE_VALUE_BITS_8: + errorValue = trie->data.ptr8[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET]; + initialValue = trie->data.ptr8[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET]; + break; + default: + // Unreachable if the trie is properly initialized. + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + LocalPointer<MutableCodePointTrie> mutableTrie( + new MutableCodePointTrie(initialValue, errorValue, errorCode), + errorCode); + if (U_FAILURE(errorCode)) { + return nullptr; + } + UChar32 start = 0, end; + uint32_t value; + while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, + nullptr, nullptr, &value)) >= 0) { + if (value != initialValue) { + if (start == end) { + mutableTrie->set(start, value, errorCode); + } else { + mutableTrie->setRange(start, end, value, errorCode); + } + } + start = end + 1; + } + if (U_SUCCESS(errorCode)) { + return mutableTrie.orphan(); + } else { + return nullptr; + } +} + +void MutableCodePointTrie::clear() { + index3NullOffset = dataNullOffset = -1; + dataLength = 0; + highValue = initialValue = origInitialValue; + highStart = 0; + uprv_free(index16); + index16 = nullptr; +} + +uint32_t MutableCodePointTrie::get(UChar32 c) const { + if ((uint32_t)c > MAX_UNICODE) { + return errorValue; + } + if (c >= highStart) { + return highValue; + } + int32_t i = c >> UCPTRIE_SHIFT_3; + if (flags[i] == ALL_SAME) { + return index[i]; + } else { + return data[index[i] + (c & UCPTRIE_SMALL_DATA_MASK)]; + } +} + +inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue, + UCPMapValueFilter *filter, const void *context) { + if (value == initialValue) { + value = nullValue; + } else if (filter != nullptr) { + value = filter(context, value); + } + return value; +} + +UChar32 MutableCodePointTrie::getRange( + UChar32 start, UCPMapValueFilter *filter, const void *context, + uint32_t *pValue) const { + if ((uint32_t)start > MAX_UNICODE) { + return U_SENTINEL; + } + if (start >= highStart) { + if (pValue != nullptr) { + uint32_t value = highValue; + if (filter != nullptr) { value = filter(context, value); } + *pValue = value; + } + return MAX_UNICODE; + } + uint32_t nullValue = initialValue; + if (filter != nullptr) { nullValue = filter(context, nullValue); } + UChar32 c = start; + uint32_t trieValue, value; + bool haveValue = false; + int32_t i = c >> UCPTRIE_SHIFT_3; + do { + if (flags[i] == ALL_SAME) { + uint32_t trieValue2 = index[i]; + if (haveValue) { + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help + } + } else { + trieValue = trieValue2; + value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); + if (pValue != nullptr) { *pValue = value; } + haveValue = true; + } + c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK; + } else /* MIXED */ { + int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK); + uint32_t trieValue2 = data[di]; + if (haveValue) { + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help + } + } else { + trieValue = trieValue2; + value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); + if (pValue != nullptr) { *pValue = value; } + haveValue = true; + } + while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) { + trieValue2 = data[++di]; + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } + } + trieValue = trieValue2; // may or may not help + } + } + ++i; + } while (c < highStart); + U_ASSERT(haveValue); + if (maybeFilterValue(highValue, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } else { + return MAX_UNICODE; + } +} + +void +writeBlock(uint32_t *block, uint32_t value) { + uint32_t *limit = block + UCPTRIE_SMALL_DATA_BLOCK_LENGTH; + while (block < limit) { + *block++ = value; + } +} + +bool MutableCodePointTrie::ensureHighStart(UChar32 c) { + if (c >= highStart) { + // Round up to a UCPTRIE_CP_PER_INDEX_2_ENTRY boundary to simplify compaction. + c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1); + int32_t i = highStart >> UCPTRIE_SHIFT_3; + int32_t iLimit = c >> UCPTRIE_SHIFT_3; + if (iLimit > indexCapacity) { + uint32_t *newIndex = (uint32_t *)uprv_malloc(I_LIMIT * 4); + if (newIndex == nullptr) { return false; } + uprv_memcpy(newIndex, index, i * 4); + uprv_free(index); + index = newIndex; + indexCapacity = I_LIMIT; + } + do { + flags[i] = ALL_SAME; + index[i] = initialValue; + } while(++i < iLimit); + highStart = c; + } + return true; +} + +int32_t MutableCodePointTrie::allocDataBlock(int32_t blockLength) { + int32_t newBlock = dataLength; + int32_t newTop = newBlock + blockLength; + if (newTop > dataCapacity) { + int32_t capacity; + if (dataCapacity < MEDIUM_DATA_LENGTH) { + capacity = MEDIUM_DATA_LENGTH; + } else if (dataCapacity < MAX_DATA_LENGTH) { + capacity = MAX_DATA_LENGTH; + } else { + // Should never occur. + // Either MAX_DATA_LENGTH is incorrect, + // or the code writes more values than should be possible. + return -1; + } + uint32_t *newData = (uint32_t *)uprv_malloc(capacity * 4); + if (newData == nullptr) { + return -1; + } + uprv_memcpy(newData, data, (size_t)dataLength * 4); + uprv_free(data); + data = newData; + dataCapacity = capacity; + } + dataLength = newTop; + return newBlock; +} + +/** + * No error checking for illegal arguments. + * + * @return -1 if no new data block available (out of memory in data array) + * @internal + */ +int32_t MutableCodePointTrie::getDataBlock(int32_t i) { + if (flags[i] == MIXED) { + return index[i]; + } + if (i < BMP_I_LIMIT) { + int32_t newBlock = allocDataBlock(UCPTRIE_FAST_DATA_BLOCK_LENGTH); + if (newBlock < 0) { return newBlock; } + int32_t iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1); + int32_t iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; + do { + U_ASSERT(flags[iStart] == ALL_SAME); + writeBlock(data + newBlock, index[iStart]); + flags[iStart] = MIXED; + index[iStart++] = newBlock; + newBlock += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; + } while (iStart < iLimit); + return index[i]; + } else { + int32_t newBlock = allocDataBlock(UCPTRIE_SMALL_DATA_BLOCK_LENGTH); + if (newBlock < 0) { return newBlock; } + writeBlock(data + newBlock, index[i]); + flags[i] = MIXED; + index[i] = newBlock; + return newBlock; + } +} + +void MutableCodePointTrie::set(UChar32 c, uint32_t value, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return; + } + if ((uint32_t)c > MAX_UNICODE) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + int32_t block; + if (!ensureHighStart(c) || (block = getDataBlock(c >> UCPTRIE_SHIFT_3)) < 0) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + + data[block + (c & UCPTRIE_SMALL_DATA_MASK)] = value; +} + +void +fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value) { + uint32_t *pLimit = block + limit; + block += start; + while (block < pLimit) { + *block++ = value; + } +} + +void MutableCodePointTrie::setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return; + } + if ((uint32_t)start > MAX_UNICODE || (uint32_t)end > MAX_UNICODE || start > end) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + if (!ensureHighStart(end)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + + UChar32 limit = end + 1; + if (start & UCPTRIE_SMALL_DATA_MASK) { + // Set partial block at [start..following block boundary[. + int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3); + if (block < 0) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + + UChar32 nextStart = (start + UCPTRIE_SMALL_DATA_MASK) & ~UCPTRIE_SMALL_DATA_MASK; + if (nextStart <= limit) { + fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, + value); + start = nextStart; + } else { + fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, limit & UCPTRIE_SMALL_DATA_MASK, + value); + return; + } + } + + // Number of positions in the last, partial block. + int32_t rest = limit & UCPTRIE_SMALL_DATA_MASK; + + // Round down limit to a block boundary. + limit &= ~UCPTRIE_SMALL_DATA_MASK; + + // Iterate over all-value blocks. + while (start < limit) { + int32_t i = start >> UCPTRIE_SHIFT_3; + if (flags[i] == ALL_SAME) { + index[i] = value; + } else /* MIXED */ { + fillBlock(data + index[i], 0, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, value); + } + start += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; + } + + if (rest > 0) { + // Set partial block at [last block boundary..limit[. + int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3); + if (block < 0) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + + fillBlock(data + block, 0, rest, value); + } +} + +/* compaction --------------------------------------------------------------- */ + +void MutableCodePointTrie::maskValues(uint32_t mask) { + initialValue &= mask; + errorValue &= mask; + highValue &= mask; + int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; + for (int32_t i = 0; i < iLimit; ++i) { + if (flags[i] == ALL_SAME) { + index[i] &= mask; + } + } + for (int32_t i = 0; i < dataLength; ++i) { + data[i] &= mask; + } +} + +template<typename UIntA, typename UIntB> +bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) { + while (length > 0 && *s == *t) { + ++s; + ++t; + --length; + } + return length == 0; +} + +bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) { + const uint32_t *pLimit = p + length; + while (p < pLimit && *p == value) { ++p; } + return p == pLimit; +} + +/** Search for an identical block. */ +int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, + const uint16_t *q, int32_t qStart, int32_t blockLength) { + // Ensure that we do not even partially get past length. + length -= blockLength; + + q += qStart; + while (pStart <= length) { + if (equalBlocks(p + pStart, q, blockLength)) { + return pStart; + } + ++pStart; + } + return -1; +} + +int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit, + uint32_t value, int32_t blockLength) { + // Ensure that we do not even partially get past limit. + limit -= blockLength; + + for (int32_t block = start; block <= limit; ++block) { + if (p[block] == value) { + for (int32_t i = 1;; ++i) { + if (i == blockLength) { + return block; + } + if (p[block + i] != value) { + block += i; + break; + } + } + } + } + return -1; +} + +/** + * Look for maximum overlap of the beginning of the other block + * with the previous, adjacent block. + */ +template<typename UIntA, typename UIntB> +int32_t getOverlap(const UIntA *p, int32_t length, + const UIntB *q, int32_t qStart, int32_t blockLength) { + int32_t overlap = blockLength - 1; + U_ASSERT(overlap <= length); + q += qStart; + while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { + --overlap; + } + return overlap; +} + +int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value, + int32_t blockLength) { + int32_t min = length - (blockLength - 1); + int32_t i = length; + while (min < i && p[i - 1] == value) { --i; } + return length - i; +} + +bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) { + for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { + if (index[i] == dataOffset) { + return true; + } + } + return false; +} + +/** + * Finds the start of the last range in the trie by enumerating backward. + * Indexes for code points higher than this will be omitted. + */ +UChar32 MutableCodePointTrie::findHighStart() const { + int32_t i = highStart >> UCPTRIE_SHIFT_3; + while (i > 0) { + bool match; + if (flags[--i] == ALL_SAME) { + match = index[i] == highValue; + } else /* MIXED */ { + const uint32_t *p = data + index[i]; + for (int32_t j = 0;; ++j) { + if (j == UCPTRIE_SMALL_DATA_BLOCK_LENGTH) { + match = true; + break; + } + if (p[j] != highValue) { + match = false; + break; + } + } + } + if (!match) { + return (i + 1) << UCPTRIE_SHIFT_3; + } + } + return 0; +} + +class AllSameBlocks { +public: + static constexpr int32_t NEW_UNIQUE = -1; + static constexpr int32_t OVERFLOW = -2; + + AllSameBlocks() : length(0), mostRecent(-1) {} + + int32_t findOrAdd(int32_t index, int32_t count, uint32_t value) { + if (mostRecent >= 0 && values[mostRecent] == value) { + refCounts[mostRecent] += count; + return indexes[mostRecent]; + } + for (int32_t i = 0; i < length; ++i) { + if (values[i] == value) { + mostRecent = i; + refCounts[i] += count; + return indexes[i]; + } + } + if (length == CAPACITY) { + return OVERFLOW; + } + mostRecent = length; + indexes[length] = index; + values[length] = value; + refCounts[length++] = count; + return NEW_UNIQUE; + } + + /** Replaces the block which has the lowest reference count. */ + void add(int32_t index, int32_t count, uint32_t value) { + U_ASSERT(length == CAPACITY); + int32_t least = -1; + int32_t leastCount = I_LIMIT; + for (int32_t i = 0; i < length; ++i) { + U_ASSERT(values[i] != value); + if (refCounts[i] < leastCount) { + least = i; + leastCount = refCounts[i]; + } + } + U_ASSERT(least >= 0); + mostRecent = least; + indexes[least] = index; + values[least] = value; + refCounts[least] = count; + } + + int32_t findMostUsed() const { + if (length == 0) { return -1; } + int32_t max = -1; + int32_t maxCount = 0; + for (int32_t i = 0; i < length; ++i) { + if (refCounts[i] > maxCount) { + max = i; + maxCount = refCounts[i]; + } + } + return indexes[max]; + } + +private: + static constexpr int32_t CAPACITY = 32; + + int32_t length; + int32_t mostRecent; + + int32_t indexes[CAPACITY]; + uint32_t values[CAPACITY]; + int32_t refCounts[CAPACITY]; +}; + +// Custom hash table for mixed-value blocks to be found anywhere in the +// compacted data or index so far. +class MixedBlocks { +public: + MixedBlocks() {} + ~MixedBlocks() { + uprv_free(table); + } + + bool init(int32_t maxLength, int32_t newBlockLength) { + // We store actual data indexes + 1 to reserve 0 for empty entries. + int32_t maxDataIndex = maxLength - newBlockLength + 1; + int32_t newLength; + if (maxDataIndex <= 0xfff) { // 4k + newLength = 6007; + shift = 12; + mask = 0xfff; + } else if (maxDataIndex <= 0x7fff) { // 32k + newLength = 50021; + shift = 15; + mask = 0x7fff; + } else if (maxDataIndex <= 0x1ffff) { // 128k + newLength = 200003; + shift = 17; + mask = 0x1ffff; + } else { + // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M + newLength = 1500007; + shift = 21; + mask = 0x1fffff; + } + if (newLength > capacity) { + uprv_free(table); + table = (uint32_t *)uprv_malloc(newLength * 4); + if (table == nullptr) { + return false; + } + capacity = newLength; + } + length = newLength; + uprv_memset(table, 0, length * 4); + + blockLength = newBlockLength; + return true; + } + + template<typename UInt> + void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) { + int32_t start = prevDataLength - blockLength; + if (start >= minStart) { + ++start; // Skip the last block that we added last time. + } else { + start = minStart; // Begin with the first full block. + } + for (int32_t end = newDataLength - blockLength; start <= end; ++start) { + uint32_t hashCode = makeHashCode(data, start); + addEntry(data, start, hashCode, start); + } + } + + template<typename UIntA, typename UIntB> + int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const { + uint32_t hashCode = makeHashCode(blockData, blockStart); + int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + + int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const { + uint32_t hashCode = makeHashCode(blockValue); + int32_t entryIndex = findEntry(data, blockValue, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + +private: + template<typename UInt> + uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const { + int32_t blockLimit = blockStart + blockLength; + uint32_t hashCode = blockData[blockStart++]; + do { + hashCode = 37 * hashCode + blockData[blockStart++]; + } while (blockStart < blockLimit); + return hashCode; + } + + uint32_t makeHashCode(uint32_t blockValue) const { + uint32_t hashCode = blockValue; + for (int32_t i = 1; i < blockLength; ++i) { + hashCode = 37 * hashCode + blockValue; + } + return hashCode; + } + + template<typename UInt> + void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) { + U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask); + int32_t entryIndex = findEntry(data, data, blockStart, hashCode); + if (entryIndex < 0) { + table[~entryIndex] = (hashCode << shift) | (dataIndex + 1); + } + } + + template<typename UIntA, typename UIntB> + int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart, + uint32_t hashCode) const { + uint32_t shiftedHashCode = hashCode << shift; + int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 + for (int32_t entryIndex = initialEntryIndex;;) { + uint32_t entry = table[entryIndex]; + if (entry == 0) { + return ~entryIndex; + } + if ((entry & ~mask) == shiftedHashCode) { + int32_t dataIndex = (entry & mask) - 1; + if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) { + return entryIndex; + } + } + entryIndex = nextIndex(initialEntryIndex, entryIndex); + } + } + + int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const { + uint32_t shiftedHashCode = hashCode << shift; + int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 + for (int32_t entryIndex = initialEntryIndex;;) { + uint32_t entry = table[entryIndex]; + if (entry == 0) { + return ~entryIndex; + } + if ((entry & ~mask) == shiftedHashCode) { + int32_t dataIndex = (entry & mask) - 1; + if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) { + return entryIndex; + } + } + entryIndex = nextIndex(initialEntryIndex, entryIndex); + } + } + + inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const { + // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length); + return (entryIndex + initialEntryIndex) % length; + } + + // Hash table. + // The length is a prime number, larger than the maximum data length. + // The "shift" lower bits store a data index + 1. + // The remaining upper bits store a partial hashCode of the block data values. + uint32_t *table = nullptr; + int32_t capacity = 0; + int32_t length = 0; + int32_t shift = 0; + uint32_t mask = 0; + + int32_t blockLength = 0; +}; + +int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) { +#ifdef UCPTRIE_DEBUG + bool overflow = false; +#endif + + // ASCII data will be stored as a linear table, even if the following code + // does not yet count it that way. + int32_t newDataCapacity = ASCII_LIMIT; + // Add room for a small data null block in case it would match the start of + // a fast data block where dataNullOffset must not be set in that case. + newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; + // Add room for special values (errorValue, highValue) and padding. + newDataCapacity += 4; + int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; + int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; + int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; + for (int32_t i = 0; i < iLimit; i += inc) { + if (i == fastILimit) { + blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; + inc = 1; + } + uint32_t value = index[i]; + if (flags[i] == MIXED) { + // Really mixed? + const uint32_t *p = data + value; + value = *p; + if (allValuesSameAs(p + 1, blockLength - 1, value)) { + flags[i] = ALL_SAME; + index[i] = value; + // Fall through to ALL_SAME handling. + } else { + newDataCapacity += blockLength; + continue; + } + } else { + U_ASSERT(flags[i] == ALL_SAME); + if (inc > 1) { + // Do all of the fast-range data block's ALL_SAME parts have the same value? + bool allSame = true; + int32_t next_i = i + inc; + for (int32_t j = i + 1; j < next_i; ++j) { + U_ASSERT(flags[j] == ALL_SAME); + if (index[j] != value) { + allSame = false; + break; + } + } + if (!allSame) { + // Turn it into a MIXED block. + if (getDataBlock(i) < 0) { + return -1; + } + newDataCapacity += blockLength; + continue; + } + } + } + // Is there another ALL_SAME block with the same value? + int32_t other = allSameBlocks.findOrAdd(i, inc, value); + if (other == AllSameBlocks::OVERFLOW) { + // The fixed-size array overflowed. Slow check for a duplicate block. +#ifdef UCPTRIE_DEBUG + if (!overflow) { + puts("UCPTrie AllSameBlocks overflow"); + overflow = true; + } +#endif + int32_t jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; + for (int32_t j = 0;; j += jInc) { + if (j == i) { + allSameBlocks.add(i, inc, value); + break; + } + if (j == fastILimit) { + jInc = 1; + } + if (flags[j] == ALL_SAME && index[j] == value) { + allSameBlocks.add(j, jInc + inc, value); + other = j; + break; + // We could keep counting blocks with the same value + // before we add the first one, which may improve compaction in rare cases, + // but it would make it slower. + } + } + } + if (other >= 0) { + flags[i] = SAME_AS; + index[i] = other; + } else { + // New unique same-value block. + newDataCapacity += blockLength; + } + } + return newDataCapacity; +} + +#ifdef UCPTRIE_DEBUG +# define DEBUG_DO(expr) expr +#else +# define DEBUG_DO(expr) +#endif + +#ifdef UCPTRIE_DEBUG +// Braille symbols: U+28xx = UTF-8 E2 A0 80..E2 A3 BF +int32_t appendValue(char s[], int32_t length, uint32_t value) { + value ^= value >> 16; + value ^= value >> 8; + s[length] = 0xE2; + s[length + 1] = (char)(0xA0 + ((value >> 6) & 3)); + s[length + 2] = (char)(0x80 + (value & 0x3F)); + return length + 3; +} + +void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value, + UChar32 start, int32_t overlap, uint32_t initialValue) { + char s[UCPTRIE_FAST_DATA_BLOCK_LENGTH * 3 + 3]; + int32_t length = 0; + int32_t i; + for (i = 0; i < overlap; ++i) { + length = appendValue(s, length, 0); // Braille blank + } + s[length++] = '|'; + for (; i < blockLength; ++i) { + if (block != nullptr) { + value = block[i]; + } + if (value == initialValue) { + value = 0x40; // Braille lower left dot + } + length = appendValue(s, length, value); + } + s[length] = 0; + start += overlap; + if (start <= 0xffff) { + printf(" %04lX %s|\n", (long)start, s); + } else if (start <= 0xfffff) { + printf(" %5lX %s|\n", (long)start, s); + } else { + printf(" %6lX %s|\n", (long)start, s); + } +} +#endif + +/** + * Compacts a build-time trie. + * + * The compaction + * - removes blocks that are identical with earlier ones + * - overlaps each new non-duplicate block as much as possible with the previously-written one + * - works with fast-range data blocks whose length is a multiple of that of + * higher-code-point data blocks + * + * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. + */ +int32_t MutableCodePointTrie::compactData( + int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, + int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) { +#ifdef UCPTRIE_DEBUG + int32_t countSame=0, sumOverlaps=0; + bool printData = dataLength == 29088 /* line.brk */ || + // dataLength == 30048 /* CanonIterData */ || + dataLength == 50400 /* zh.txt~stroke */; +#endif + + // The linear ASCII data has been copied into newData already. + int32_t newDataLength = 0; + for (int32_t i = 0; newDataLength < ASCII_LIMIT; + newDataLength += UCPTRIE_FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { + index[i] = newDataLength; +#ifdef UCPTRIE_DEBUG + if (printData) { + printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue); + } +#endif + } + + int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; + if (!mixedBlocks.init(newDataCapacity, blockLength)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + mixedBlocks.extend(newData, 0, 0, newDataLength); + + int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; + int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; + int32_t fastLength = 0; + for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) { + if (i == fastILimit) { + blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; + inc = 1; + fastLength = newDataLength; + if (!mixedBlocks.init(newDataCapacity, blockLength)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + mixedBlocks.extend(newData, 0, 0, newDataLength); + } + if (flags[i] == ALL_SAME) { + uint32_t value = index[i]; + // Find an earlier part of the data array of length blockLength + // that is filled with this value. + int32_t n = mixedBlocks.findAllSameBlock(newData, value); + // If we find a match, and the current block is the data null block, + // and it is not a fast block but matches the start of a fast block, + // then we need to continue looking. + // This is because this small block is shorter than the fast block, + // and not all of the rest of the fast block is filled with this value. + // Otherwise trie.getRange() would detect that the fast block starts at + // dataNullOffset and assume incorrectly that it is filled with the null value. + while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength && + isStartOfSomeFastBlock(n, index, fastILimit)) { + n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength); + } + if (n >= 0) { + DEBUG_DO(++countSame); + index[i] = n; + } else { + n = getAllSameOverlap(newData, newDataLength, value, blockLength); + DEBUG_DO(sumOverlaps += n); +#ifdef UCPTRIE_DEBUG + if (printData) { + printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue); + } +#endif + index[i] = newDataLength - n; + int32_t prevDataLength = newDataLength; + while (n < blockLength) { + newData[newDataLength++] = value; + ++n; + } + mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); + } + } else if (flags[i] == MIXED) { + const uint32_t *block = data + index[i]; + int32_t n = mixedBlocks.findBlock(newData, block, 0); + if (n >= 0) { + DEBUG_DO(++countSame); + index[i] = n; + } else { + n = getOverlap(newData, newDataLength, block, 0, blockLength); + DEBUG_DO(sumOverlaps += n); +#ifdef UCPTRIE_DEBUG + if (printData) { + printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue); + } +#endif + index[i] = newDataLength - n; + int32_t prevDataLength = newDataLength; + while (n < blockLength) { + newData[newDataLength++] = block[n++]; + } + mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); + } + } else /* SAME_AS */ { + uint32_t j = index[i]; + index[i] = index[j]; + } + } + +#ifdef UCPTRIE_DEBUG + /* we saved some space */ + printf("compacting UCPTrie: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n", + (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps); +#endif + return newDataLength; +} + +int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, + UErrorCode &errorCode) { + int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3); + if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) { + // Only the linear fast index, no multi-stage index tables. + index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET; + return fastIndexLength; + } + + // Condense the fast index table. + // Also, does it contain an index-3 block with all dataNullOffset? + uint16_t fastIndex[UCPTRIE_BMP_INDEX_LENGTH]; // fastIndexLength + int32_t i3FirstNull = -1; + for (int32_t i = 0, j = 0; i < fastILimit; ++j) { + uint32_t i3 = index[i]; + fastIndex[j] = (uint16_t)i3; + if (i3 == (uint32_t)dataNullOffset) { + if (i3FirstNull < 0) { + i3FirstNull = j; + } else if (index3NullOffset < 0 && + (j - i3FirstNull + 1) == UCPTRIE_INDEX_3_BLOCK_LENGTH) { + index3NullOffset = i3FirstNull; + } + } else { + i3FirstNull = -1; + } + // Set the index entries that compactData() skipped. + // Needed when the multi-stage index covers the fast index range as well. + int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; + while (++i < iNext) { + i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; + index[i] = i3; + } + } + + if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength); + + // Examine index-3 blocks. For each determine one of: + // - same as the index-3 null block + // - same as a fast-index block + // - 16-bit indexes + // - 18-bit indexes + // We store this in the first flags entry for the index-3 block. + // + // Also determine an upper limit for the index-3 table length. + int32_t index3Capacity = 0; + i3FirstNull = index3NullOffset; + bool hasLongI3Blocks = false; + // If the fast index covers the whole BMP, then + // the multi-stage index is only for supplementary code points. + // Otherwise, the multi-stage index covers all of Unicode. + int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT; + int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; + for (int32_t i = iStart; i < iLimit;) { + int32_t j = i; + int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; + uint32_t oredI3 = 0; + bool isNull = true; + do { + uint32_t i3 = index[j]; + oredI3 |= i3; + if (i3 != (uint32_t)dataNullOffset) { + isNull = false; + } + } while (++j < jLimit); + if (isNull) { + flags[i] = I3_NULL; + if (i3FirstNull < 0) { + if (oredI3 <= 0xffff) { + index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; + } else { + index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; + hasLongI3Blocks = true; + } + i3FirstNull = 0; + } + } else { + if (oredI3 <= 0xffff) { + int32_t n = mixedBlocks.findBlock(fastIndex, index, i); + if (n >= 0) { + flags[i] = I3_BMP; + index[i] = n; + } else { + flags[i] = I3_16; + index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; + } + } else { + flags[i] = I3_18; + index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; + hasLongI3Blocks = true; + } + } + i = j; + } + + int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3; + + // Length of the index-1 table, rounded up. + int32_t index1Length = (index2Capacity + UCPTRIE_INDEX_2_MASK) >> UCPTRIE_SHIFT_1_2; + + // Index table: Fast index, index-1, index-3, index-2. + // +1 for possible index table padding. + int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1; + index16 = (uint16_t *)uprv_malloc(index16Capacity * 2); + if (index16 == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + uprv_memcpy(index16, fastIndex, fastIndexLength * 2); + + if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + MixedBlocks longI3Blocks; + if (hasLongI3Blocks) { + if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + } + + // Compact the index-3 table and write an uncompacted version of the index-2 table. + uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity + int32_t i2Length = 0; + i3FirstNull = index3NullOffset; + int32_t index3Start = fastIndexLength + index1Length; + int32_t indexLength = index3Start; + for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) { + int32_t i3; + uint8_t f = flags[i]; + if (f == I3_NULL && i3FirstNull < 0) { + // First index-3 null block. Write & overlap it like a normal block, then remember it. + f = dataNullOffset <= 0xffff ? I3_16 : I3_18; + i3FirstNull = 0; + } + if (f == I3_NULL) { + i3 = index3NullOffset; + } else if (f == I3_BMP) { + i3 = index[i]; + } else if (f == I3_16) { + int32_t n = mixedBlocks.findBlock(index16, index, i); + if (n >= 0) { + i3 = n; + } else { + if (indexLength == index3Start) { + // No overlap at the boundary between the index-1 and index-3 tables. + n = 0; + } else { + n = getOverlap(index16, indexLength, + index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); + } + i3 = indexLength - n; + int32_t prevIndexLength = indexLength; + while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) { + index16[indexLength++] = index[i + n++]; + } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); + if (hasLongI3Blocks) { + longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); + } + } + } else { + U_ASSERT(f == I3_18); + U_ASSERT(hasLongI3Blocks); + // Encode an index-3 block that contains one or more data indexes exceeding 16 bits. + int32_t j = i; + int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; + int32_t k = indexLength; + do { + ++k; + uint32_t v = index[j++]; + uint32_t upperBits = (v & 0x30000) >> 2; + index16[k++] = v; + v = index[j++]; + upperBits |= (v & 0x30000) >> 4; + index16[k++] = v; + v = index[j++]; + upperBits |= (v & 0x30000) >> 6; + index16[k++] = v; + v = index[j++]; + upperBits |= (v & 0x30000) >> 8; + index16[k++] = v; + v = index[j++]; + upperBits |= (v & 0x30000) >> 10; + index16[k++] = v; + v = index[j++]; + upperBits |= (v & 0x30000) >> 12; + index16[k++] = v; + v = index[j++]; + upperBits |= (v & 0x30000) >> 14; + index16[k++] = v; + v = index[j++]; + upperBits |= (v & 0x30000) >> 16; + index16[k++] = v; + index16[k - 9] = upperBits; + } while (j < jLimit); + int32_t n = longI3Blocks.findBlock(index16, index16, indexLength); + if (n >= 0) { + i3 = n | 0x8000; + } else { + if (indexLength == index3Start) { + // No overlap at the boundary between the index-1 and index-3 tables. + n = 0; + } else { + n = getOverlap(index16, indexLength, + index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); + } + i3 = (indexLength - n) | 0x8000; + int32_t prevIndexLength = indexLength; + if (n > 0) { + int32_t start = indexLength; + while (n < INDEX_3_18BIT_BLOCK_LENGTH) { + index16[indexLength++] = index16[start + n++]; + } + } else { + indexLength += INDEX_3_18BIT_BLOCK_LENGTH; + } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); + if (hasLongI3Blocks) { + longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); + } + } + } + if (index3NullOffset < 0 && i3FirstNull >= 0) { + index3NullOffset = i3; + } + // Set the index-2 table entry. + index2[i2Length++] = i3; + } + U_ASSERT(i2Length == index2Capacity); + U_ASSERT(indexLength <= index3Start + index3Capacity); + + if (index3NullOffset < 0) { + index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET; + } + if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) { + // The index-3 offsets exceed 15 bits, or + // the last one cannot be distinguished from the no-null-block value. + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + // Compact the index-2 table and write the index-1 table. + static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH, + "must re-init mixedBlocks"); + int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH; + int32_t i1 = fastIndexLength; + for (int32_t i = 0; i < i2Length; i += blockLength) { + int32_t n; + if ((i2Length - i) >= blockLength) { + // normal block + U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH); + n = mixedBlocks.findBlock(index16, index2, i); + } else { + // highStart is inside the last index-2 block. Shorten it. + blockLength = i2Length - i; + n = findSameBlock(index16, index3Start, indexLength, + index2, i, blockLength); + } + int32_t i2; + if (n >= 0) { + i2 = n; + } else { + if (indexLength == index3Start) { + // No overlap at the boundary between the index-1 and index-3/2 tables. + n = 0; + } else { + n = getOverlap(index16, indexLength, index2, i, blockLength); + } + i2 = indexLength - n; + int32_t prevIndexLength = indexLength; + while (n < blockLength) { + index16[indexLength++] = index2[i + n++]; + } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); + } + // Set the index-1 table entry. + index16[i1++] = i2; + } + U_ASSERT(i1 == index3Start); + U_ASSERT(indexLength <= index16Capacity); + +#ifdef UCPTRIE_DEBUG + /* we saved some space */ + printf("compacting UCPTrie: count of 16-bit index words %lu->%lu\n", + (long)iLimit, (long)indexLength); +#endif + + return indexLength; +} + +int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorCode) { + // Find the real highStart and round it up. + U_ASSERT((highStart & (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) == 0); + highValue = get(MAX_UNICODE); + int32_t realHighStart = findHighStart(); + realHighStart = (realHighStart + (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) & + ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1); + if (realHighStart == UNICODE_LIMIT) { + highValue = initialValue; + } + +#ifdef UCPTRIE_DEBUG + printf("UCPTrie: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n", + (long)realHighStart, (long)highValue, (long)initialValue); +#endif + + // We always store indexes and data values for the fast range. + // Pin highStart to the top of that range while building. + UChar32 fastLimit = fastILimit << UCPTRIE_SHIFT_3; + if (realHighStart < fastLimit) { + for (int32_t i = (realHighStart >> UCPTRIE_SHIFT_3); i < fastILimit; ++i) { + flags[i] = ALL_SAME; + index[i] = highValue; + } + highStart = fastLimit; + } else { + highStart = realHighStart; + } + + uint32_t asciiData[ASCII_LIMIT]; + for (int32_t i = 0; i < ASCII_LIMIT; ++i) { + asciiData[i] = get(i); + } + + // First we look for which data blocks have the same value repeated over the whole block, + // deduplicate such blocks, find a good null data block (for faster enumeration), + // and get an upper bound for the necessary data array length. + AllSameBlocks allSameBlocks; + int32_t newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks); + if (newDataCapacity < 0) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + uint32_t *newData = (uint32_t *)uprv_malloc(newDataCapacity * 4); + if (newData == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + uprv_memcpy(newData, asciiData, sizeof(asciiData)); + + int32_t dataNullIndex = allSameBlocks.findMostUsed(); + + MixedBlocks mixedBlocks; + int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity, + dataNullIndex, mixedBlocks, errorCode); + if (U_FAILURE(errorCode)) { return 0; } + U_ASSERT(newDataLength <= newDataCapacity); + uprv_free(data); + data = newData; + dataCapacity = newDataCapacity; + dataLength = newDataLength; + if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) { + // The offset of the last data block is too high to be stored in the index table. + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + if (dataNullIndex >= 0) { + dataNullOffset = index[dataNullIndex]; +#ifdef UCPTRIE_DEBUG + if (data[dataNullOffset] != initialValue) { + printf("UCPTrie initialValue %lx -> more common nullValue %lx\n", + (long)initialValue, (long)data[dataNullOffset]); + } +#endif + initialValue = data[dataNullOffset]; + } else { + dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET; + } + + int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode); + highStart = realHighStart; + return indexLength; +} + +UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return nullptr; + } + if (type < UCPTRIE_TYPE_FAST || UCPTRIE_TYPE_SMALL < type || + valueWidth < UCPTRIE_VALUE_BITS_16 || UCPTRIE_VALUE_BITS_8 < valueWidth) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + + // The mutable trie always stores 32-bit values. + // When we build a UCPTrie for a smaller value width, we first mask off unused bits + // before compacting the data. + switch (valueWidth) { + case UCPTRIE_VALUE_BITS_32: + break; + case UCPTRIE_VALUE_BITS_16: + maskValues(0xffff); + break; + case UCPTRIE_VALUE_BITS_8: + maskValues(0xff); + break; + default: + break; + } + + UChar32 fastLimit = type == UCPTRIE_TYPE_FAST ? BMP_LIMIT : UCPTRIE_SMALL_LIMIT; + int32_t indexLength = compactTrie(fastLimit >> UCPTRIE_SHIFT_3, errorCode); + if (U_FAILURE(errorCode)) { + clear(); + return nullptr; + } + + // Ensure data table alignment: The index length must be even for uint32_t data. + if (valueWidth == UCPTRIE_VALUE_BITS_32 && (indexLength & 1) != 0) { + index16[indexLength++] = 0xffee; // arbitrary value + } + + // Make the total trie structure length a multiple of 4 bytes by padding the data table, + // and store special values as the last two data values. + int32_t length = indexLength * 2; + if (valueWidth == UCPTRIE_VALUE_BITS_16) { + if (((indexLength ^ dataLength) & 1) != 0) { + // padding + data[dataLength++] = errorValue; + } + if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { + data[dataLength++] = highValue; + data[dataLength++] = errorValue; + } + length += dataLength * 2; + } else if (valueWidth == UCPTRIE_VALUE_BITS_32) { + // 32-bit data words never need padding to a multiple of 4 bytes. + if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { + if (data[dataLength - 1] != highValue) { + data[dataLength++] = highValue; + } + data[dataLength++] = errorValue; + } + length += dataLength * 4; + } else { + int32_t and3 = (length + dataLength) & 3; + if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) { + // all set + } else if(and3 == 3 && data[dataLength - 1] == highValue) { + data[dataLength++] = errorValue; + } else { + while (and3 != 2) { + data[dataLength++] = highValue; + and3 = (and3 + 1) & 3; + } + data[dataLength++] = highValue; + data[dataLength++] = errorValue; + } + length += dataLength; + } + + // Calculate the total length of the UCPTrie as a single memory block. + length += sizeof(UCPTrie); + U_ASSERT((length & 3) == 0); + + uint8_t *bytes = (uint8_t *)uprv_malloc(length); + if (bytes == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + clear(); + return nullptr; + } + UCPTrie *trie = reinterpret_cast<UCPTrie *>(bytes); + uprv_memset(trie, 0, sizeof(UCPTrie)); + trie->indexLength = indexLength; + trie->dataLength = dataLength; + + trie->highStart = highStart; + // Round up shifted12HighStart to a multiple of 0x1000 for easy testing from UTF-8 lead bytes. + // Runtime code needs to then test for the real highStart as well. + trie->shifted12HighStart = (highStart + 0xfff) >> 12; + trie->type = type; + trie->valueWidth = valueWidth; + + trie->index3NullOffset = index3NullOffset; + trie->dataNullOffset = dataNullOffset; + trie->nullValue = initialValue; + + bytes += sizeof(UCPTrie); + + // Fill the index and data arrays. + uint16_t *dest16 = (uint16_t *)bytes; + trie->index = dest16; + + if (highStart <= fastLimit) { + // Condense only the fast index from the mutable-trie index. + for (int32_t i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) { + *dest16++ = (uint16_t)index[i]; // dest16[j] + } + } else { + uprv_memcpy(dest16, index16, indexLength * 2); + dest16 += indexLength; + } + bytes += indexLength * 2; + + // Write the data array. + const uint32_t *p = data; + switch (valueWidth) { + case UCPTRIE_VALUE_BITS_16: + // Write 16-bit data values. + trie->data.ptr16 = dest16; + for (int32_t i = dataLength; i > 0; --i) { + *dest16++ = (uint16_t)*p++; + } + break; + case UCPTRIE_VALUE_BITS_32: + // Write 32-bit data values. + trie->data.ptr32 = (uint32_t *)bytes; + uprv_memcpy(bytes, p, (size_t)dataLength * 4); + break; + case UCPTRIE_VALUE_BITS_8: + // Write 8-bit data values. + trie->data.ptr8 = bytes; + for (int32_t i = dataLength; i > 0; --i) { + *bytes++ = (uint8_t)*p++; + } + break; + default: + // Will not occur, valueWidth checked at the beginning. + break; + } + +#ifdef UCPTRIE_DEBUG + trie->name = name; + + ucptrie_printLengths(trie, ""); +#endif + + clear(); + return trie; +} + +} // namespace + +U_NAMESPACE_END + +U_NAMESPACE_USE + +U_CAPI UMutableCPTrie * U_EXPORT2 +umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { + return nullptr; + } + LocalPointer<MutableCodePointTrie> trie( + new MutableCodePointTrie(initialValue, errorValue, *pErrorCode), *pErrorCode); + if (U_FAILURE(*pErrorCode)) { + return nullptr; + } + return reinterpret_cast<UMutableCPTrie *>(trie.orphan()); +} + +U_CAPI UMutableCPTrie * U_EXPORT2 +umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { + return nullptr; + } + if (other == nullptr) { + return nullptr; + } + LocalPointer<MutableCodePointTrie> clone( + new MutableCodePointTrie(*reinterpret_cast<const MutableCodePointTrie *>(other), *pErrorCode), *pErrorCode); + if (U_FAILURE(*pErrorCode)) { + return nullptr; + } + return reinterpret_cast<UMutableCPTrie *>(clone.orphan()); +} + +U_CAPI void U_EXPORT2 +umutablecptrie_close(UMutableCPTrie *trie) { + delete reinterpret_cast<MutableCodePointTrie *>(trie); +} + +U_CAPI UMutableCPTrie * U_EXPORT2 +umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { + return nullptr; + } + if (map == nullptr) { + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPMap(map, *pErrorCode)); +} + +U_CAPI UMutableCPTrie * U_EXPORT2 +umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { + return nullptr; + } + if (trie == nullptr) { + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPTrie(trie, *pErrorCode)); +} + +U_CAPI uint32_t U_EXPORT2 +umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) { + return reinterpret_cast<const MutableCodePointTrie *>(trie)->get(c); +} + +namespace { + +UChar32 getRange(const void *trie, UChar32 start, + UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { + return reinterpret_cast<const MutableCodePointTrie *>(trie)-> + getRange(start, filter, context, pValue); +} + +} // namespace + +U_CAPI UChar32 U_EXPORT2 +umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start, + UCPMapRangeOption option, uint32_t surrogateValue, + UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { + return ucptrie_internalGetRange(getRange, trie, start, + option, surrogateValue, + filter, context, pValue); +} + +U_CAPI void U_EXPORT2 +umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { + return; + } + reinterpret_cast<MutableCodePointTrie *>(trie)->set(c, value, *pErrorCode); +} + +U_CAPI void U_EXPORT2 +umutablecptrie_setRange(UMutableCPTrie *trie, UChar32 start, UChar32 end, + uint32_t value, UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { + return; + } + reinterpret_cast<MutableCodePointTrie *>(trie)->setRange(start, end, value, *pErrorCode); +} + +/* Compact and internally serialize the trie. */ +U_CAPI UCPTrie * U_EXPORT2 +umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth, + UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { + return nullptr; + } + return reinterpret_cast<MutableCodePointTrie *>(trie)->build(type, valueWidth, *pErrorCode); +} + +#ifdef UCPTRIE_DEBUG +U_CFUNC void umutablecptrie_setName(UMutableCPTrie *trie, const char *name) { + reinterpret_cast<MutableCodePointTrie *>(trie)->name = name; +} +#endif |