aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/icu/common/umutablecptrie.cpp
diff options
context:
space:
mode:
authormcheshkov <mcheshkov@yandex-team.ru>2022-02-10 16:46:15 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:46:15 +0300
commite9d19cec64684c9c1e6b0c98297e5b895cf904fe (patch)
tree2768b1223e96a8a0610a93d18425d9647c1123c8 /contrib/libs/icu/common/umutablecptrie.cpp
parent60040c91ffe701a84689b2c6310ff845e65cff42 (diff)
downloadydb-e9d19cec64684c9c1e6b0c98297e5b895cf904fe.tar.gz
Restoring authorship annotation for <mcheshkov@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/common/umutablecptrie.cpp')
-rw-r--r--contrib/libs/icu/common/umutablecptrie.cpp3704
1 files changed, 1852 insertions, 1852 deletions
diff --git a/contrib/libs/icu/common/umutablecptrie.cpp b/contrib/libs/icu/common/umutablecptrie.cpp
index cdbe27080b..6ef717a172 100644
--- a/contrib/libs/icu/common/umutablecptrie.cpp
+++ b/contrib/libs/icu/common/umutablecptrie.cpp
@@ -1,1852 +1,1852 @@
-// © 2017 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-
-// umutablecptrie.cpp (inspired by utrie2_builder.cpp)
-// created: 2017dec29 Markus W. Scherer
-
-// #define UCPTRIE_DEBUG
-#ifdef UCPTRIE_DEBUG
-# include <stdio.h>
-#endif
-
-#include "unicode/utypes.h"
-#include "unicode/ucptrie.h"
-#include "unicode/umutablecptrie.h"
-#include "unicode/uobject.h"
-#include "unicode/utf16.h"
-#include "cmemory.h"
-#include "uassert.h"
-#include "ucptrie_impl.h"
-
-// ICU-20235 In case Microsoft math.h has defined this, undefine it.
-#ifdef OVERFLOW
-#undef OVERFLOW
-#endif
-
-U_NAMESPACE_BEGIN
-
-namespace {
-
-constexpr int32_t MAX_UNICODE = 0x10ffff;
-
-constexpr int32_t UNICODE_LIMIT = 0x110000;
-constexpr int32_t BMP_LIMIT = 0x10000;
-constexpr int32_t ASCII_LIMIT = 0x80;
-
-constexpr int32_t I_LIMIT = UNICODE_LIMIT >> UCPTRIE_SHIFT_3;
-constexpr int32_t BMP_I_LIMIT = BMP_LIMIT >> UCPTRIE_SHIFT_3;
-constexpr int32_t ASCII_I_LIMIT = ASCII_LIMIT >> UCPTRIE_SHIFT_3;
-
-constexpr int32_t SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3));
-
-// Flag values for data blocks.
-constexpr uint8_t ALL_SAME = 0;
-constexpr uint8_t MIXED = 1;
-constexpr uint8_t SAME_AS = 2;
-
-/** Start with allocation of 16k data entries. */
-constexpr int32_t INITIAL_DATA_LENGTH = ((int32_t)1 << 14);
-
-/** Grow about 8x each time. */
-constexpr int32_t MEDIUM_DATA_LENGTH = ((int32_t)1 << 17);
-
-/**
- * Maximum length of the build-time data array.
- * One entry per 0x110000 code points.
- */
-constexpr int32_t MAX_DATA_LENGTH = UNICODE_LIMIT;
-
-// Flag values for index-3 blocks while compacting/building.
-constexpr uint8_t I3_NULL = 0;
-constexpr uint8_t I3_BMP = 1;
-constexpr uint8_t I3_16 = 2;
-constexpr uint8_t I3_18 = 3;
-
-constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8;
-
-class AllSameBlocks;
-class MixedBlocks;
-
-class MutableCodePointTrie : public UMemory {
-public:
- MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode);
- MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode);
- MutableCodePointTrie(const MutableCodePointTrie &other) = delete;
- ~MutableCodePointTrie();
-
- MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete;
-
- static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode);
- static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode);
-
- uint32_t get(UChar32 c) const;
- int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context,
- uint32_t *pValue) const;
-
- void set(UChar32 c, uint32_t value, UErrorCode &errorCode);
- void setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode);
-
- UCPTrie *build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode);
-
-private:
- void clear();
-
- bool ensureHighStart(UChar32 c);
- int32_t allocDataBlock(int32_t blockLength);
- int32_t getDataBlock(int32_t i);
-
- void maskValues(uint32_t mask);
- UChar32 findHighStart() const;
- int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks);
- int32_t compactData(
- int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
- int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
- int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
- int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode);
-
- uint32_t *index = nullptr;
- int32_t indexCapacity = 0;
- int32_t index3NullOffset = -1;
- uint32_t *data = nullptr;
- int32_t dataCapacity = 0;
- int32_t dataLength = 0;
- int32_t dataNullOffset = -1;
-
- uint32_t origInitialValue;
- uint32_t initialValue;
- uint32_t errorValue;
- UChar32 highStart;
- uint32_t highValue;
-#ifdef UCPTRIE_DEBUG
-public:
- const char *name;
-#endif
-private:
- /** Temporary array while building the final data. */
- uint16_t *index16 = nullptr;
- uint8_t flags[UNICODE_LIMIT >> UCPTRIE_SHIFT_3];
-};
-
-MutableCodePointTrie::MutableCodePointTrie(uint32_t iniValue, uint32_t errValue, UErrorCode &errorCode) :
- origInitialValue(iniValue), initialValue(iniValue), errorValue(errValue),
- highStart(0), highValue(initialValue)
-#ifdef UCPTRIE_DEBUG
- , name("open")
-#endif
- {
- if (U_FAILURE(errorCode)) { return; }
- index = (uint32_t *)uprv_malloc(BMP_I_LIMIT * 4);
- data = (uint32_t *)uprv_malloc(INITIAL_DATA_LENGTH * 4);
- if (index == nullptr || data == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- indexCapacity = BMP_I_LIMIT;
- dataCapacity = INITIAL_DATA_LENGTH;
-}
-
-MutableCodePointTrie::MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode) :
- index3NullOffset(other.index3NullOffset),
- dataNullOffset(other.dataNullOffset),
- origInitialValue(other.origInitialValue), initialValue(other.initialValue),
- errorValue(other.errorValue),
- highStart(other.highStart), highValue(other.highValue)
-#ifdef UCPTRIE_DEBUG
- , name("mutable clone")
-#endif
- {
- if (U_FAILURE(errorCode)) { return; }
- int32_t iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT;
- index = (uint32_t *)uprv_malloc(iCapacity * 4);
- data = (uint32_t *)uprv_malloc(other.dataCapacity * 4);
- if (index == nullptr || data == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- indexCapacity = iCapacity;
- dataCapacity = other.dataCapacity;
-
- int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
- uprv_memcpy(flags, other.flags, iLimit);
- uprv_memcpy(index, other.index, iLimit * 4);
- uprv_memcpy(data, other.data, (size_t)other.dataLength * 4);
- dataLength = other.dataLength;
- U_ASSERT(other.index16 == nullptr);
-}
-
-MutableCodePointTrie::~MutableCodePointTrie() {
- uprv_free(index);
- uprv_free(data);
- uprv_free(index16);
-}
-
-MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) {
- // Use the highValue as the initialValue to reduce the highStart.
- uint32_t errorValue = ucpmap_get(map, -1);
- uint32_t initialValue = ucpmap_get(map, 0x10ffff);
- LocalPointer<MutableCodePointTrie> mutableTrie(
- new MutableCodePointTrie(initialValue, errorValue, errorCode),
- errorCode);
- if (U_FAILURE(errorCode)) {
- return nullptr;
- }
- UChar32 start = 0, end;
- uint32_t value;
- while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
- nullptr, nullptr, &value)) >= 0) {
- if (value != initialValue) {
- if (start == end) {
- mutableTrie->set(start, value, errorCode);
- } else {
- mutableTrie->setRange(start, end, value, errorCode);
- }
- }
- start = end + 1;
- }
- if (U_SUCCESS(errorCode)) {
- return mutableTrie.orphan();
- } else {
- return nullptr;
- }
-}
-
-MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) {
- // Use the highValue as the initialValue to reduce the highStart.
- uint32_t errorValue;
- uint32_t initialValue;
- switch (trie->valueWidth) {
- case UCPTRIE_VALUE_BITS_16:
- errorValue = trie->data.ptr16[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
- initialValue = trie->data.ptr16[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
- break;
- case UCPTRIE_VALUE_BITS_32:
- errorValue = trie->data.ptr32[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
- initialValue = trie->data.ptr32[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
- break;
- case UCPTRIE_VALUE_BITS_8:
- errorValue = trie->data.ptr8[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
- initialValue = trie->data.ptr8[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
- break;
- default:
- // Unreachable if the trie is properly initialized.
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- LocalPointer<MutableCodePointTrie> mutableTrie(
- new MutableCodePointTrie(initialValue, errorValue, errorCode),
- errorCode);
- if (U_FAILURE(errorCode)) {
- return nullptr;
- }
- UChar32 start = 0, end;
- uint32_t value;
- while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
- nullptr, nullptr, &value)) >= 0) {
- if (value != initialValue) {
- if (start == end) {
- mutableTrie->set(start, value, errorCode);
- } else {
- mutableTrie->setRange(start, end, value, errorCode);
- }
- }
- start = end + 1;
- }
- if (U_SUCCESS(errorCode)) {
- return mutableTrie.orphan();
- } else {
- return nullptr;
- }
-}
-
-void MutableCodePointTrie::clear() {
- index3NullOffset = dataNullOffset = -1;
- dataLength = 0;
- highValue = initialValue = origInitialValue;
- highStart = 0;
- uprv_free(index16);
- index16 = nullptr;
-}
-
-uint32_t MutableCodePointTrie::get(UChar32 c) const {
- if ((uint32_t)c > MAX_UNICODE) {
- return errorValue;
- }
- if (c >= highStart) {
- return highValue;
- }
- int32_t i = c >> UCPTRIE_SHIFT_3;
- if (flags[i] == ALL_SAME) {
- return index[i];
- } else {
- return data[index[i] + (c & UCPTRIE_SMALL_DATA_MASK)];
- }
-}
-
-inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue,
- UCPMapValueFilter *filter, const void *context) {
- if (value == initialValue) {
- value = nullValue;
- } else if (filter != nullptr) {
- value = filter(context, value);
- }
- return value;
-}
-
-UChar32 MutableCodePointTrie::getRange(
- UChar32 start, UCPMapValueFilter *filter, const void *context,
- uint32_t *pValue) const {
- if ((uint32_t)start > MAX_UNICODE) {
- return U_SENTINEL;
- }
- if (start >= highStart) {
- if (pValue != nullptr) {
- uint32_t value = highValue;
- if (filter != nullptr) { value = filter(context, value); }
- *pValue = value;
- }
- return MAX_UNICODE;
- }
- uint32_t nullValue = initialValue;
- if (filter != nullptr) { nullValue = filter(context, nullValue); }
- UChar32 c = start;
- uint32_t trieValue, value;
- bool haveValue = false;
- int32_t i = c >> UCPTRIE_SHIFT_3;
- do {
- if (flags[i] == ALL_SAME) {
- uint32_t trieValue2 = index[i];
- if (haveValue) {
- if (trieValue2 != trieValue) {
- if (filter == nullptr ||
- maybeFilterValue(trieValue2, initialValue, nullValue,
- filter, context) != value) {
- return c - 1;
- }
- trieValue = trieValue2; // may or may not help
- }
- } else {
- trieValue = trieValue2;
- value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
- if (pValue != nullptr) { *pValue = value; }
- haveValue = true;
- }
- c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK;
- } else /* MIXED */ {
- int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK);
- uint32_t trieValue2 = data[di];
- if (haveValue) {
- if (trieValue2 != trieValue) {
- if (filter == nullptr ||
- maybeFilterValue(trieValue2, initialValue, nullValue,
- filter, context) != value) {
- return c - 1;
- }
- trieValue = trieValue2; // may or may not help
- }
- } else {
- trieValue = trieValue2;
- value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
- if (pValue != nullptr) { *pValue = value; }
- haveValue = true;
- }
- while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) {
- trieValue2 = data[++di];
- if (trieValue2 != trieValue) {
- if (filter == nullptr ||
- maybeFilterValue(trieValue2, initialValue, nullValue,
- filter, context) != value) {
- return c - 1;
- }
- }
- trieValue = trieValue2; // may or may not help
- }
- }
- ++i;
- } while (c < highStart);
- U_ASSERT(haveValue);
- if (maybeFilterValue(highValue, initialValue, nullValue,
- filter, context) != value) {
- return c - 1;
- } else {
- return MAX_UNICODE;
- }
-}
-
-void
-writeBlock(uint32_t *block, uint32_t value) {
- uint32_t *limit = block + UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
- while (block < limit) {
- *block++ = value;
- }
-}
-
-bool MutableCodePointTrie::ensureHighStart(UChar32 c) {
- if (c >= highStart) {
- // Round up to a UCPTRIE_CP_PER_INDEX_2_ENTRY boundary to simplify compaction.
- c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
- int32_t i = highStart >> UCPTRIE_SHIFT_3;
- int32_t iLimit = c >> UCPTRIE_SHIFT_3;
- if (iLimit > indexCapacity) {
- uint32_t *newIndex = (uint32_t *)uprv_malloc(I_LIMIT * 4);
- if (newIndex == nullptr) { return false; }
- uprv_memcpy(newIndex, index, i * 4);
- uprv_free(index);
- index = newIndex;
- indexCapacity = I_LIMIT;
- }
- do {
- flags[i] = ALL_SAME;
- index[i] = initialValue;
- } while(++i < iLimit);
- highStart = c;
- }
- return true;
-}
-
-int32_t MutableCodePointTrie::allocDataBlock(int32_t blockLength) {
- int32_t newBlock = dataLength;
- int32_t newTop = newBlock + blockLength;
- if (newTop > dataCapacity) {
- int32_t capacity;
- if (dataCapacity < MEDIUM_DATA_LENGTH) {
- capacity = MEDIUM_DATA_LENGTH;
- } else if (dataCapacity < MAX_DATA_LENGTH) {
- capacity = MAX_DATA_LENGTH;
- } else {
- // Should never occur.
- // Either MAX_DATA_LENGTH is incorrect,
- // or the code writes more values than should be possible.
- return -1;
- }
- uint32_t *newData = (uint32_t *)uprv_malloc(capacity * 4);
- if (newData == nullptr) {
- return -1;
- }
- uprv_memcpy(newData, data, (size_t)dataLength * 4);
- uprv_free(data);
- data = newData;
- dataCapacity = capacity;
- }
- dataLength = newTop;
- return newBlock;
-}
-
-/**
- * No error checking for illegal arguments.
- *
- * @return -1 if no new data block available (out of memory in data array)
- * @internal
- */
-int32_t MutableCodePointTrie::getDataBlock(int32_t i) {
- if (flags[i] == MIXED) {
- return index[i];
- }
- if (i < BMP_I_LIMIT) {
- int32_t newBlock = allocDataBlock(UCPTRIE_FAST_DATA_BLOCK_LENGTH);
- if (newBlock < 0) { return newBlock; }
- int32_t iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1);
- int32_t iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
- do {
- U_ASSERT(flags[iStart] == ALL_SAME);
- writeBlock(data + newBlock, index[iStart]);
- flags[iStart] = MIXED;
- index[iStart++] = newBlock;
- newBlock += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
- } while (iStart < iLimit);
- return index[i];
- } else {
- int32_t newBlock = allocDataBlock(UCPTRIE_SMALL_DATA_BLOCK_LENGTH);
- if (newBlock < 0) { return newBlock; }
- writeBlock(data + newBlock, index[i]);
- flags[i] = MIXED;
- index[i] = newBlock;
- return newBlock;
- }
-}
-
-void MutableCodePointTrie::set(UChar32 c, uint32_t value, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) {
- return;
- }
- if ((uint32_t)c > MAX_UNICODE) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
-
- int32_t block;
- if (!ensureHighStart(c) || (block = getDataBlock(c >> UCPTRIE_SHIFT_3)) < 0) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
-
- data[block + (c & UCPTRIE_SMALL_DATA_MASK)] = value;
-}
-
-void
-fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value) {
- uint32_t *pLimit = block + limit;
- block += start;
- while (block < pLimit) {
- *block++ = value;
- }
-}
-
-void MutableCodePointTrie::setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) {
- return;
- }
- if ((uint32_t)start > MAX_UNICODE || (uint32_t)end > MAX_UNICODE || start > end) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- if (!ensureHighStart(end)) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
-
- UChar32 limit = end + 1;
- if (start & UCPTRIE_SMALL_DATA_MASK) {
- // Set partial block at [start..following block boundary[.
- int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
- if (block < 0) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
-
- UChar32 nextStart = (start + UCPTRIE_SMALL_DATA_MASK) & ~UCPTRIE_SMALL_DATA_MASK;
- if (nextStart <= limit) {
- fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, UCPTRIE_SMALL_DATA_BLOCK_LENGTH,
- value);
- start = nextStart;
- } else {
- fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, limit & UCPTRIE_SMALL_DATA_MASK,
- value);
- return;
- }
- }
-
- // Number of positions in the last, partial block.
- int32_t rest = limit & UCPTRIE_SMALL_DATA_MASK;
-
- // Round down limit to a block boundary.
- limit &= ~UCPTRIE_SMALL_DATA_MASK;
-
- // Iterate over all-value blocks.
- while (start < limit) {
- int32_t i = start >> UCPTRIE_SHIFT_3;
- if (flags[i] == ALL_SAME) {
- index[i] = value;
- } else /* MIXED */ {
- fillBlock(data + index[i], 0, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, value);
- }
- start += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
- }
-
- if (rest > 0) {
- // Set partial block at [last block boundary..limit[.
- int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
- if (block < 0) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
-
- fillBlock(data + block, 0, rest, value);
- }
-}
-
-/* compaction --------------------------------------------------------------- */
-
-void MutableCodePointTrie::maskValues(uint32_t mask) {
- initialValue &= mask;
- errorValue &= mask;
- highValue &= mask;
- int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
- for (int32_t i = 0; i < iLimit; ++i) {
- if (flags[i] == ALL_SAME) {
- index[i] &= mask;
- }
- }
- for (int32_t i = 0; i < dataLength; ++i) {
- data[i] &= mask;
- }
-}
-
-template<typename UIntA, typename UIntB>
-bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) {
- while (length > 0 && *s == *t) {
- ++s;
- ++t;
- --length;
- }
- return length == 0;
-}
-
-bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) {
- const uint32_t *pLimit = p + length;
- while (p < pLimit && *p == value) { ++p; }
- return p == pLimit;
-}
-
-/** Search for an identical block. */
-int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
- const uint16_t *q, int32_t qStart, int32_t blockLength) {
- // Ensure that we do not even partially get past length.
- length -= blockLength;
-
- q += qStart;
- while (pStart <= length) {
- if (equalBlocks(p + pStart, q, blockLength)) {
- return pStart;
- }
- ++pStart;
- }
- return -1;
-}
-
-int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit,
- uint32_t value, int32_t blockLength) {
- // Ensure that we do not even partially get past limit.
- limit -= blockLength;
-
- for (int32_t block = start; block <= limit; ++block) {
- if (p[block] == value) {
- for (int32_t i = 1;; ++i) {
- if (i == blockLength) {
- return block;
- }
- if (p[block + i] != value) {
- block += i;
- break;
- }
- }
- }
- }
- return -1;
-}
-
-/**
- * Look for maximum overlap of the beginning of the other block
- * with the previous, adjacent block.
- */
-template<typename UIntA, typename UIntB>
-int32_t getOverlap(const UIntA *p, int32_t length,
- const UIntB *q, int32_t qStart, int32_t blockLength) {
- int32_t overlap = blockLength - 1;
- U_ASSERT(overlap <= length);
- q += qStart;
- while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
- --overlap;
- }
- return overlap;
-}
-
-int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value,
- int32_t blockLength) {
- int32_t min = length - (blockLength - 1);
- int32_t i = length;
- while (min < i && p[i - 1] == value) { --i; }
- return length - i;
-}
-
-bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) {
- for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
- if (index[i] == dataOffset) {
- return true;
- }
- }
- return false;
-}
-
-/**
- * Finds the start of the last range in the trie by enumerating backward.
- * Indexes for code points higher than this will be omitted.
- */
-UChar32 MutableCodePointTrie::findHighStart() const {
- int32_t i = highStart >> UCPTRIE_SHIFT_3;
- while (i > 0) {
- bool match;
- if (flags[--i] == ALL_SAME) {
- match = index[i] == highValue;
- } else /* MIXED */ {
- const uint32_t *p = data + index[i];
- for (int32_t j = 0;; ++j) {
- if (j == UCPTRIE_SMALL_DATA_BLOCK_LENGTH) {
- match = true;
- break;
- }
- if (p[j] != highValue) {
- match = false;
- break;
- }
- }
- }
- if (!match) {
- return (i + 1) << UCPTRIE_SHIFT_3;
- }
- }
- return 0;
-}
-
-class AllSameBlocks {
-public:
- static constexpr int32_t NEW_UNIQUE = -1;
- static constexpr int32_t OVERFLOW = -2;
-
- AllSameBlocks() : length(0), mostRecent(-1) {}
-
- int32_t findOrAdd(int32_t index, int32_t count, uint32_t value) {
- if (mostRecent >= 0 && values[mostRecent] == value) {
- refCounts[mostRecent] += count;
- return indexes[mostRecent];
- }
- for (int32_t i = 0; i < length; ++i) {
- if (values[i] == value) {
- mostRecent = i;
- refCounts[i] += count;
- return indexes[i];
- }
- }
- if (length == CAPACITY) {
- return OVERFLOW;
- }
- mostRecent = length;
- indexes[length] = index;
- values[length] = value;
- refCounts[length++] = count;
- return NEW_UNIQUE;
- }
-
- /** Replaces the block which has the lowest reference count. */
- void add(int32_t index, int32_t count, uint32_t value) {
- U_ASSERT(length == CAPACITY);
- int32_t least = -1;
- int32_t leastCount = I_LIMIT;
- for (int32_t i = 0; i < length; ++i) {
- U_ASSERT(values[i] != value);
- if (refCounts[i] < leastCount) {
- least = i;
- leastCount = refCounts[i];
- }
- }
- U_ASSERT(least >= 0);
- mostRecent = least;
- indexes[least] = index;
- values[least] = value;
- refCounts[least] = count;
- }
-
- int32_t findMostUsed() const {
- if (length == 0) { return -1; }
- int32_t max = -1;
- int32_t maxCount = 0;
- for (int32_t i = 0; i < length; ++i) {
- if (refCounts[i] > maxCount) {
- max = i;
- maxCount = refCounts[i];
- }
- }
- return indexes[max];
- }
-
-private:
- static constexpr int32_t CAPACITY = 32;
-
- int32_t length;
- int32_t mostRecent;
-
- int32_t indexes[CAPACITY];
- uint32_t values[CAPACITY];
- int32_t refCounts[CAPACITY];
-};
-
-// Custom hash table for mixed-value blocks to be found anywhere in the
-// compacted data or index so far.
-class MixedBlocks {
-public:
- MixedBlocks() {}
- ~MixedBlocks() {
- uprv_free(table);
- }
-
- bool init(int32_t maxLength, int32_t newBlockLength) {
- // We store actual data indexes + 1 to reserve 0 for empty entries.
- int32_t maxDataIndex = maxLength - newBlockLength + 1;
- int32_t newLength;
- if (maxDataIndex <= 0xfff) { // 4k
- newLength = 6007;
- shift = 12;
- mask = 0xfff;
- } else if (maxDataIndex <= 0x7fff) { // 32k
- newLength = 50021;
- shift = 15;
- mask = 0x7fff;
- } else if (maxDataIndex <= 0x1ffff) { // 128k
- newLength = 200003;
- shift = 17;
- mask = 0x1ffff;
- } else {
- // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M
- newLength = 1500007;
- shift = 21;
- mask = 0x1fffff;
- }
- if (newLength > capacity) {
- uprv_free(table);
- table = (uint32_t *)uprv_malloc(newLength * 4);
- if (table == nullptr) {
- return false;
- }
- capacity = newLength;
- }
- length = newLength;
- uprv_memset(table, 0, length * 4);
-
- blockLength = newBlockLength;
- return true;
- }
-
- template<typename UInt>
- void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) {
- int32_t start = prevDataLength - blockLength;
- if (start >= minStart) {
- ++start; // Skip the last block that we added last time.
- } else {
- start = minStart; // Begin with the first full block.
- }
- for (int32_t end = newDataLength - blockLength; start <= end; ++start) {
- uint32_t hashCode = makeHashCode(data, start);
- addEntry(data, start, hashCode, start);
- }
- }
-
- template<typename UIntA, typename UIntB>
- int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const {
- uint32_t hashCode = makeHashCode(blockData, blockStart);
- int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode);
- if (entryIndex >= 0) {
- return (table[entryIndex] & mask) - 1;
- } else {
- return -1;
- }
- }
-
- int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const {
- uint32_t hashCode = makeHashCode(blockValue);
- int32_t entryIndex = findEntry(data, blockValue, hashCode);
- if (entryIndex >= 0) {
- return (table[entryIndex] & mask) - 1;
- } else {
- return -1;
- }
- }
-
-private:
- template<typename UInt>
- uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const {
- int32_t blockLimit = blockStart + blockLength;
- uint32_t hashCode = blockData[blockStart++];
- do {
- hashCode = 37 * hashCode + blockData[blockStart++];
- } while (blockStart < blockLimit);
- return hashCode;
- }
-
- uint32_t makeHashCode(uint32_t blockValue) const {
- uint32_t hashCode = blockValue;
- for (int32_t i = 1; i < blockLength; ++i) {
- hashCode = 37 * hashCode + blockValue;
- }
- return hashCode;
- }
-
- template<typename UInt>
- void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) {
- U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask);
- int32_t entryIndex = findEntry(data, data, blockStart, hashCode);
- if (entryIndex < 0) {
- table[~entryIndex] = (hashCode << shift) | (dataIndex + 1);
- }
- }
-
- template<typename UIntA, typename UIntB>
- int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart,
- uint32_t hashCode) const {
- uint32_t shiftedHashCode = hashCode << shift;
- int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1
- for (int32_t entryIndex = initialEntryIndex;;) {
- uint32_t entry = table[entryIndex];
- if (entry == 0) {
- return ~entryIndex;
- }
- if ((entry & ~mask) == shiftedHashCode) {
- int32_t dataIndex = (entry & mask) - 1;
- if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) {
- return entryIndex;
- }
- }
- entryIndex = nextIndex(initialEntryIndex, entryIndex);
- }
- }
-
- int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const {
- uint32_t shiftedHashCode = hashCode << shift;
- int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1
- for (int32_t entryIndex = initialEntryIndex;;) {
- uint32_t entry = table[entryIndex];
- if (entry == 0) {
- return ~entryIndex;
- }
- if ((entry & ~mask) == shiftedHashCode) {
- int32_t dataIndex = (entry & mask) - 1;
- if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) {
- return entryIndex;
- }
- }
- entryIndex = nextIndex(initialEntryIndex, entryIndex);
- }
- }
-
- inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const {
- // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length);
- return (entryIndex + initialEntryIndex) % length;
- }
-
- // Hash table.
- // The length is a prime number, larger than the maximum data length.
- // The "shift" lower bits store a data index + 1.
- // The remaining upper bits store a partial hashCode of the block data values.
- uint32_t *table = nullptr;
- int32_t capacity = 0;
- int32_t length = 0;
- int32_t shift = 0;
- uint32_t mask = 0;
-
- int32_t blockLength = 0;
-};
-
-int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) {
-#ifdef UCPTRIE_DEBUG
- bool overflow = false;
-#endif
-
- // ASCII data will be stored as a linear table, even if the following code
- // does not yet count it that way.
- int32_t newDataCapacity = ASCII_LIMIT;
- // Add room for a small data null block in case it would match the start of
- // a fast data block where dataNullOffset must not be set in that case.
- newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
- // Add room for special values (errorValue, highValue) and padding.
- newDataCapacity += 4;
- int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
- int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
- int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
- for (int32_t i = 0; i < iLimit; i += inc) {
- if (i == fastILimit) {
- blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
- inc = 1;
- }
- uint32_t value = index[i];
- if (flags[i] == MIXED) {
- // Really mixed?
- const uint32_t *p = data + value;
- value = *p;
- if (allValuesSameAs(p + 1, blockLength - 1, value)) {
- flags[i] = ALL_SAME;
- index[i] = value;
- // Fall through to ALL_SAME handling.
- } else {
- newDataCapacity += blockLength;
- continue;
- }
- } else {
- U_ASSERT(flags[i] == ALL_SAME);
- if (inc > 1) {
- // Do all of the fast-range data block's ALL_SAME parts have the same value?
- bool allSame = true;
- int32_t next_i = i + inc;
- for (int32_t j = i + 1; j < next_i; ++j) {
- U_ASSERT(flags[j] == ALL_SAME);
- if (index[j] != value) {
- allSame = false;
- break;
- }
- }
- if (!allSame) {
- // Turn it into a MIXED block.
- if (getDataBlock(i) < 0) {
- return -1;
- }
- newDataCapacity += blockLength;
- continue;
- }
- }
- }
- // Is there another ALL_SAME block with the same value?
- int32_t other = allSameBlocks.findOrAdd(i, inc, value);
- if (other == AllSameBlocks::OVERFLOW) {
- // The fixed-size array overflowed. Slow check for a duplicate block.
-#ifdef UCPTRIE_DEBUG
- if (!overflow) {
- puts("UCPTrie AllSameBlocks overflow");
- overflow = true;
- }
-#endif
- int32_t jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
- for (int32_t j = 0;; j += jInc) {
- if (j == i) {
- allSameBlocks.add(i, inc, value);
- break;
- }
- if (j == fastILimit) {
- jInc = 1;
- }
- if (flags[j] == ALL_SAME && index[j] == value) {
- allSameBlocks.add(j, jInc + inc, value);
- other = j;
- break;
- // We could keep counting blocks with the same value
- // before we add the first one, which may improve compaction in rare cases,
- // but it would make it slower.
- }
- }
- }
- if (other >= 0) {
- flags[i] = SAME_AS;
- index[i] = other;
- } else {
- // New unique same-value block.
- newDataCapacity += blockLength;
- }
- }
- return newDataCapacity;
-}
-
-#ifdef UCPTRIE_DEBUG
-# define DEBUG_DO(expr) expr
-#else
-# define DEBUG_DO(expr)
-#endif
-
-#ifdef UCPTRIE_DEBUG
-// Braille symbols: U+28xx = UTF-8 E2 A0 80..E2 A3 BF
-int32_t appendValue(char s[], int32_t length, uint32_t value) {
- value ^= value >> 16;
- value ^= value >> 8;
- s[length] = 0xE2;
- s[length + 1] = (char)(0xA0 + ((value >> 6) & 3));
- s[length + 2] = (char)(0x80 + (value & 0x3F));
- return length + 3;
-}
-
-void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value,
- UChar32 start, int32_t overlap, uint32_t initialValue) {
- char s[UCPTRIE_FAST_DATA_BLOCK_LENGTH * 3 + 3];
- int32_t length = 0;
- int32_t i;
- for (i = 0; i < overlap; ++i) {
- length = appendValue(s, length, 0); // Braille blank
- }
- s[length++] = '|';
- for (; i < blockLength; ++i) {
- if (block != nullptr) {
- value = block[i];
- }
- if (value == initialValue) {
- value = 0x40; // Braille lower left dot
- }
- length = appendValue(s, length, value);
- }
- s[length] = 0;
- start += overlap;
- if (start <= 0xffff) {
- printf(" %04lX %s|\n", (long)start, s);
- } else if (start <= 0xfffff) {
- printf(" %5lX %s|\n", (long)start, s);
- } else {
- printf(" %6lX %s|\n", (long)start, s);
- }
-}
-#endif
-
-/**
- * Compacts a build-time trie.
- *
- * The compaction
- * - removes blocks that are identical with earlier ones
- * - overlaps each new non-duplicate block as much as possible with the previously-written one
- * - works with fast-range data blocks whose length is a multiple of that of
- * higher-code-point data blocks
- *
- * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
- */
-int32_t MutableCodePointTrie::compactData(
- int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
- int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) {
-#ifdef UCPTRIE_DEBUG
- int32_t countSame=0, sumOverlaps=0;
- bool printData = dataLength == 29088 /* line.brk */ ||
- // dataLength == 30048 /* CanonIterData */ ||
- dataLength == 50400 /* zh.txt~stroke */;
-#endif
-
- // The linear ASCII data has been copied into newData already.
- int32_t newDataLength = 0;
- for (int32_t i = 0; newDataLength < ASCII_LIMIT;
- newDataLength += UCPTRIE_FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
- index[i] = newDataLength;
-#ifdef UCPTRIE_DEBUG
- if (printData) {
- printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue);
- }
-#endif
- }
-
- int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
- if (!mixedBlocks.init(newDataCapacity, blockLength)) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- mixedBlocks.extend(newData, 0, 0, newDataLength);
-
- int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
- int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
- int32_t fastLength = 0;
- for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) {
- if (i == fastILimit) {
- blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
- inc = 1;
- fastLength = newDataLength;
- if (!mixedBlocks.init(newDataCapacity, blockLength)) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- mixedBlocks.extend(newData, 0, 0, newDataLength);
- }
- if (flags[i] == ALL_SAME) {
- uint32_t value = index[i];
- // Find an earlier part of the data array of length blockLength
- // that is filled with this value.
- int32_t n = mixedBlocks.findAllSameBlock(newData, value);
- // If we find a match, and the current block is the data null block,
- // and it is not a fast block but matches the start of a fast block,
- // then we need to continue looking.
- // This is because this small block is shorter than the fast block,
- // and not all of the rest of the fast block is filled with this value.
- // Otherwise trie.getRange() would detect that the fast block starts at
- // dataNullOffset and assume incorrectly that it is filled with the null value.
- while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength &&
- isStartOfSomeFastBlock(n, index, fastILimit)) {
- n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength);
- }
- if (n >= 0) {
- DEBUG_DO(++countSame);
- index[i] = n;
- } else {
- n = getAllSameOverlap(newData, newDataLength, value, blockLength);
- DEBUG_DO(sumOverlaps += n);
-#ifdef UCPTRIE_DEBUG
- if (printData) {
- printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue);
- }
-#endif
- index[i] = newDataLength - n;
- int32_t prevDataLength = newDataLength;
- while (n < blockLength) {
- newData[newDataLength++] = value;
- ++n;
- }
- mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
- }
- } else if (flags[i] == MIXED) {
- const uint32_t *block = data + index[i];
- int32_t n = mixedBlocks.findBlock(newData, block, 0);
- if (n >= 0) {
- DEBUG_DO(++countSame);
- index[i] = n;
- } else {
- n = getOverlap(newData, newDataLength, block, 0, blockLength);
- DEBUG_DO(sumOverlaps += n);
-#ifdef UCPTRIE_DEBUG
- if (printData) {
- printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue);
- }
-#endif
- index[i] = newDataLength - n;
- int32_t prevDataLength = newDataLength;
- while (n < blockLength) {
- newData[newDataLength++] = block[n++];
- }
- mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
- }
- } else /* SAME_AS */ {
- uint32_t j = index[i];
- index[i] = index[j];
- }
- }
-
-#ifdef UCPTRIE_DEBUG
- /* we saved some space */
- printf("compacting UCPTrie: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n",
- (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps);
-#endif
- return newDataLength;
-}
-
-int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks,
- UErrorCode &errorCode) {
- int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3);
- if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) {
- // Only the linear fast index, no multi-stage index tables.
- index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
- return fastIndexLength;
- }
-
- // Condense the fast index table.
- // Also, does it contain an index-3 block with all dataNullOffset?
- uint16_t fastIndex[UCPTRIE_BMP_INDEX_LENGTH]; // fastIndexLength
- int32_t i3FirstNull = -1;
- for (int32_t i = 0, j = 0; i < fastILimit; ++j) {
- uint32_t i3 = index[i];
- fastIndex[j] = (uint16_t)i3;
- if (i3 == (uint32_t)dataNullOffset) {
- if (i3FirstNull < 0) {
- i3FirstNull = j;
- } else if (index3NullOffset < 0 &&
- (j - i3FirstNull + 1) == UCPTRIE_INDEX_3_BLOCK_LENGTH) {
- index3NullOffset = i3FirstNull;
- }
- } else {
- i3FirstNull = -1;
- }
- // Set the index entries that compactData() skipped.
- // Needed when the multi-stage index covers the fast index range as well.
- int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
- while (++i < iNext) {
- i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
- index[i] = i3;
- }
- }
-
- if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength);
-
- // Examine index-3 blocks. For each determine one of:
- // - same as the index-3 null block
- // - same as a fast-index block
- // - 16-bit indexes
- // - 18-bit indexes
- // We store this in the first flags entry for the index-3 block.
- //
- // Also determine an upper limit for the index-3 table length.
- int32_t index3Capacity = 0;
- i3FirstNull = index3NullOffset;
- bool hasLongI3Blocks = false;
- // If the fast index covers the whole BMP, then
- // the multi-stage index is only for supplementary code points.
- // Otherwise, the multi-stage index covers all of Unicode.
- int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT;
- int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
- for (int32_t i = iStart; i < iLimit;) {
- int32_t j = i;
- int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
- uint32_t oredI3 = 0;
- bool isNull = true;
- do {
- uint32_t i3 = index[j];
- oredI3 |= i3;
- if (i3 != (uint32_t)dataNullOffset) {
- isNull = false;
- }
- } while (++j < jLimit);
- if (isNull) {
- flags[i] = I3_NULL;
- if (i3FirstNull < 0) {
- if (oredI3 <= 0xffff) {
- index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
- } else {
- index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
- hasLongI3Blocks = true;
- }
- i3FirstNull = 0;
- }
- } else {
- if (oredI3 <= 0xffff) {
- int32_t n = mixedBlocks.findBlock(fastIndex, index, i);
- if (n >= 0) {
- flags[i] = I3_BMP;
- index[i] = n;
- } else {
- flags[i] = I3_16;
- index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
- }
- } else {
- flags[i] = I3_18;
- index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
- hasLongI3Blocks = true;
- }
- }
- i = j;
- }
-
- int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3;
-
- // Length of the index-1 table, rounded up.
- int32_t index1Length = (index2Capacity + UCPTRIE_INDEX_2_MASK) >> UCPTRIE_SHIFT_1_2;
-
- // Index table: Fast index, index-1, index-3, index-2.
- // +1 for possible index table padding.
- int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1;
- index16 = (uint16_t *)uprv_malloc(index16Capacity * 2);
- if (index16 == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- uprv_memcpy(index16, fastIndex, fastIndexLength * 2);
-
- if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- MixedBlocks longI3Blocks;
- if (hasLongI3Blocks) {
- if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- }
-
- // Compact the index-3 table and write an uncompacted version of the index-2 table.
- uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity
- int32_t i2Length = 0;
- i3FirstNull = index3NullOffset;
- int32_t index3Start = fastIndexLength + index1Length;
- int32_t indexLength = index3Start;
- for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) {
- int32_t i3;
- uint8_t f = flags[i];
- if (f == I3_NULL && i3FirstNull < 0) {
- // First index-3 null block. Write & overlap it like a normal block, then remember it.
- f = dataNullOffset <= 0xffff ? I3_16 : I3_18;
- i3FirstNull = 0;
- }
- if (f == I3_NULL) {
- i3 = index3NullOffset;
- } else if (f == I3_BMP) {
- i3 = index[i];
- } else if (f == I3_16) {
- int32_t n = mixedBlocks.findBlock(index16, index, i);
- if (n >= 0) {
- i3 = n;
- } else {
- if (indexLength == index3Start) {
- // No overlap at the boundary between the index-1 and index-3 tables.
- n = 0;
- } else {
- n = getOverlap(index16, indexLength,
- index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
- }
- i3 = indexLength - n;
- int32_t prevIndexLength = indexLength;
- while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) {
- index16[indexLength++] = index[i + n++];
- }
- mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
- if (hasLongI3Blocks) {
- longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
- }
- }
- } else {
- U_ASSERT(f == I3_18);
- U_ASSERT(hasLongI3Blocks);
- // Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
- int32_t j = i;
- int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
- int32_t k = indexLength;
- do {
- ++k;
- uint32_t v = index[j++];
- uint32_t upperBits = (v & 0x30000) >> 2;
- index16[k++] = v;
- v = index[j++];
- upperBits |= (v & 0x30000) >> 4;
- index16[k++] = v;
- v = index[j++];
- upperBits |= (v & 0x30000) >> 6;
- index16[k++] = v;
- v = index[j++];
- upperBits |= (v & 0x30000) >> 8;
- index16[k++] = v;
- v = index[j++];
- upperBits |= (v & 0x30000) >> 10;
- index16[k++] = v;
- v = index[j++];
- upperBits |= (v & 0x30000) >> 12;
- index16[k++] = v;
- v = index[j++];
- upperBits |= (v & 0x30000) >> 14;
- index16[k++] = v;
- v = index[j++];
- upperBits |= (v & 0x30000) >> 16;
- index16[k++] = v;
- index16[k - 9] = upperBits;
- } while (j < jLimit);
- int32_t n = longI3Blocks.findBlock(index16, index16, indexLength);
- if (n >= 0) {
- i3 = n | 0x8000;
- } else {
- if (indexLength == index3Start) {
- // No overlap at the boundary between the index-1 and index-3 tables.
- n = 0;
- } else {
- n = getOverlap(index16, indexLength,
- index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
- }
- i3 = (indexLength - n) | 0x8000;
- int32_t prevIndexLength = indexLength;
- if (n > 0) {
- int32_t start = indexLength;
- while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
- index16[indexLength++] = index16[start + n++];
- }
- } else {
- indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
- }
- mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
- if (hasLongI3Blocks) {
- longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
- }
- }
- }
- if (index3NullOffset < 0 && i3FirstNull >= 0) {
- index3NullOffset = i3;
- }
- // Set the index-2 table entry.
- index2[i2Length++] = i3;
- }
- U_ASSERT(i2Length == index2Capacity);
- U_ASSERT(indexLength <= index3Start + index3Capacity);
-
- if (index3NullOffset < 0) {
- index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
- }
- if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
- // The index-3 offsets exceed 15 bits, or
- // the last one cannot be distinguished from the no-null-block value.
- errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
-
- // Compact the index-2 table and write the index-1 table.
- static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH,
- "must re-init mixedBlocks");
- int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH;
- int32_t i1 = fastIndexLength;
- for (int32_t i = 0; i < i2Length; i += blockLength) {
- int32_t n;
- if ((i2Length - i) >= blockLength) {
- // normal block
- U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH);
- n = mixedBlocks.findBlock(index16, index2, i);
- } else {
- // highStart is inside the last index-2 block. Shorten it.
- blockLength = i2Length - i;
- n = findSameBlock(index16, index3Start, indexLength,
- index2, i, blockLength);
- }
- int32_t i2;
- if (n >= 0) {
- i2 = n;
- } else {
- if (indexLength == index3Start) {
- // No overlap at the boundary between the index-1 and index-3/2 tables.
- n = 0;
- } else {
- n = getOverlap(index16, indexLength, index2, i, blockLength);
- }
- i2 = indexLength - n;
- int32_t prevIndexLength = indexLength;
- while (n < blockLength) {
- index16[indexLength++] = index2[i + n++];
- }
- mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
- }
- // Set the index-1 table entry.
- index16[i1++] = i2;
- }
- U_ASSERT(i1 == index3Start);
- U_ASSERT(indexLength <= index16Capacity);
-
-#ifdef UCPTRIE_DEBUG
- /* we saved some space */
- printf("compacting UCPTrie: count of 16-bit index words %lu->%lu\n",
- (long)iLimit, (long)indexLength);
-#endif
-
- return indexLength;
-}
-
-int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorCode) {
- // Find the real highStart and round it up.
- U_ASSERT((highStart & (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) == 0);
- highValue = get(MAX_UNICODE);
- int32_t realHighStart = findHighStart();
- realHighStart = (realHighStart + (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) &
- ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
- if (realHighStart == UNICODE_LIMIT) {
- highValue = initialValue;
- }
-
-#ifdef UCPTRIE_DEBUG
- printf("UCPTrie: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n",
- (long)realHighStart, (long)highValue, (long)initialValue);
-#endif
-
- // We always store indexes and data values for the fast range.
- // Pin highStart to the top of that range while building.
- UChar32 fastLimit = fastILimit << UCPTRIE_SHIFT_3;
- if (realHighStart < fastLimit) {
- for (int32_t i = (realHighStart >> UCPTRIE_SHIFT_3); i < fastILimit; ++i) {
- flags[i] = ALL_SAME;
- index[i] = highValue;
- }
- highStart = fastLimit;
- } else {
- highStart = realHighStart;
- }
-
- uint32_t asciiData[ASCII_LIMIT];
- for (int32_t i = 0; i < ASCII_LIMIT; ++i) {
- asciiData[i] = get(i);
- }
-
- // First we look for which data blocks have the same value repeated over the whole block,
- // deduplicate such blocks, find a good null data block (for faster enumeration),
- // and get an upper bound for the necessary data array length.
- AllSameBlocks allSameBlocks;
- int32_t newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks);
- if (newDataCapacity < 0) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- uint32_t *newData = (uint32_t *)uprv_malloc(newDataCapacity * 4);
- if (newData == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- uprv_memcpy(newData, asciiData, sizeof(asciiData));
-
- int32_t dataNullIndex = allSameBlocks.findMostUsed();
-
- MixedBlocks mixedBlocks;
- int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity,
- dataNullIndex, mixedBlocks, errorCode);
- if (U_FAILURE(errorCode)) { return 0; }
- U_ASSERT(newDataLength <= newDataCapacity);
- uprv_free(data);
- data = newData;
- dataCapacity = newDataCapacity;
- dataLength = newDataLength;
- if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) {
- // The offset of the last data block is too high to be stored in the index table.
- errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
-
- if (dataNullIndex >= 0) {
- dataNullOffset = index[dataNullIndex];
-#ifdef UCPTRIE_DEBUG
- if (data[dataNullOffset] != initialValue) {
- printf("UCPTrie initialValue %lx -> more common nullValue %lx\n",
- (long)initialValue, (long)data[dataNullOffset]);
- }
-#endif
- initialValue = data[dataNullOffset];
- } else {
- dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET;
- }
-
- int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode);
- highStart = realHighStart;
- return indexLength;
-}
-
-UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) {
- return nullptr;
- }
- if (type < UCPTRIE_TYPE_FAST || UCPTRIE_TYPE_SMALL < type ||
- valueWidth < UCPTRIE_VALUE_BITS_16 || UCPTRIE_VALUE_BITS_8 < valueWidth) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
-
- // The mutable trie always stores 32-bit values.
- // When we build a UCPTrie for a smaller value width, we first mask off unused bits
- // before compacting the data.
- switch (valueWidth) {
- case UCPTRIE_VALUE_BITS_32:
- break;
- case UCPTRIE_VALUE_BITS_16:
- maskValues(0xffff);
- break;
- case UCPTRIE_VALUE_BITS_8:
- maskValues(0xff);
- break;
- default:
- break;
- }
-
- UChar32 fastLimit = type == UCPTRIE_TYPE_FAST ? BMP_LIMIT : UCPTRIE_SMALL_LIMIT;
- int32_t indexLength = compactTrie(fastLimit >> UCPTRIE_SHIFT_3, errorCode);
- if (U_FAILURE(errorCode)) {
- clear();
- return nullptr;
- }
-
- // Ensure data table alignment: The index length must be even for uint32_t data.
- if (valueWidth == UCPTRIE_VALUE_BITS_32 && (indexLength & 1) != 0) {
- index16[indexLength++] = 0xffee; // arbitrary value
- }
-
- // Make the total trie structure length a multiple of 4 bytes by padding the data table,
- // and store special values as the last two data values.
- int32_t length = indexLength * 2;
- if (valueWidth == UCPTRIE_VALUE_BITS_16) {
- if (((indexLength ^ dataLength) & 1) != 0) {
- // padding
- data[dataLength++] = errorValue;
- }
- if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
- data[dataLength++] = highValue;
- data[dataLength++] = errorValue;
- }
- length += dataLength * 2;
- } else if (valueWidth == UCPTRIE_VALUE_BITS_32) {
- // 32-bit data words never need padding to a multiple of 4 bytes.
- if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
- if (data[dataLength - 1] != highValue) {
- data[dataLength++] = highValue;
- }
- data[dataLength++] = errorValue;
- }
- length += dataLength * 4;
- } else {
- int32_t and3 = (length + dataLength) & 3;
- if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) {
- // all set
- } else if(and3 == 3 && data[dataLength - 1] == highValue) {
- data[dataLength++] = errorValue;
- } else {
- while (and3 != 2) {
- data[dataLength++] = highValue;
- and3 = (and3 + 1) & 3;
- }
- data[dataLength++] = highValue;
- data[dataLength++] = errorValue;
- }
- length += dataLength;
- }
-
- // Calculate the total length of the UCPTrie as a single memory block.
- length += sizeof(UCPTrie);
- U_ASSERT((length & 3) == 0);
-
- uint8_t *bytes = (uint8_t *)uprv_malloc(length);
- if (bytes == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- clear();
- return nullptr;
- }
- UCPTrie *trie = reinterpret_cast<UCPTrie *>(bytes);
- uprv_memset(trie, 0, sizeof(UCPTrie));
- trie->indexLength = indexLength;
- trie->dataLength = dataLength;
-
- trie->highStart = highStart;
- // Round up shifted12HighStart to a multiple of 0x1000 for easy testing from UTF-8 lead bytes.
- // Runtime code needs to then test for the real highStart as well.
- trie->shifted12HighStart = (highStart + 0xfff) >> 12;
- trie->type = type;
- trie->valueWidth = valueWidth;
-
- trie->index3NullOffset = index3NullOffset;
- trie->dataNullOffset = dataNullOffset;
- trie->nullValue = initialValue;
-
- bytes += sizeof(UCPTrie);
-
- // Fill the index and data arrays.
- uint16_t *dest16 = (uint16_t *)bytes;
- trie->index = dest16;
-
- if (highStart <= fastLimit) {
- // Condense only the fast index from the mutable-trie index.
- for (int32_t i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) {
- *dest16++ = (uint16_t)index[i]; // dest16[j]
- }
- } else {
- uprv_memcpy(dest16, index16, indexLength * 2);
- dest16 += indexLength;
- }
- bytes += indexLength * 2;
-
- // Write the data array.
- const uint32_t *p = data;
- switch (valueWidth) {
- case UCPTRIE_VALUE_BITS_16:
- // Write 16-bit data values.
- trie->data.ptr16 = dest16;
- for (int32_t i = dataLength; i > 0; --i) {
- *dest16++ = (uint16_t)*p++;
- }
- break;
- case UCPTRIE_VALUE_BITS_32:
- // Write 32-bit data values.
- trie->data.ptr32 = (uint32_t *)bytes;
- uprv_memcpy(bytes, p, (size_t)dataLength * 4);
- break;
- case UCPTRIE_VALUE_BITS_8:
- // Write 8-bit data values.
- trie->data.ptr8 = bytes;
- for (int32_t i = dataLength; i > 0; --i) {
- *bytes++ = (uint8_t)*p++;
- }
- break;
- default:
- // Will not occur, valueWidth checked at the beginning.
- break;
- }
-
-#ifdef UCPTRIE_DEBUG
- trie->name = name;
-
- ucptrie_printLengths(trie, "");
-#endif
-
- clear();
- return trie;
-}
-
-} // namespace
-
-U_NAMESPACE_END
-
-U_NAMESPACE_USE
-
-U_CAPI UMutableCPTrie * U_EXPORT2
-umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
- if (U_FAILURE(*pErrorCode)) {
- return nullptr;
- }
- LocalPointer<MutableCodePointTrie> trie(
- new MutableCodePointTrie(initialValue, errorValue, *pErrorCode), *pErrorCode);
- if (U_FAILURE(*pErrorCode)) {
- return nullptr;
- }
- return reinterpret_cast<UMutableCPTrie *>(trie.orphan());
-}
-
-U_CAPI UMutableCPTrie * U_EXPORT2
-umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode) {
- if (U_FAILURE(*pErrorCode)) {
- return nullptr;
- }
- if (other == nullptr) {
- return nullptr;
- }
- LocalPointer<MutableCodePointTrie> clone(
- new MutableCodePointTrie(*reinterpret_cast<const MutableCodePointTrie *>(other), *pErrorCode), *pErrorCode);
- if (U_FAILURE(*pErrorCode)) {
- return nullptr;
- }
- return reinterpret_cast<UMutableCPTrie *>(clone.orphan());
-}
-
-U_CAPI void U_EXPORT2
-umutablecptrie_close(UMutableCPTrie *trie) {
- delete reinterpret_cast<MutableCodePointTrie *>(trie);
-}
-
-U_CAPI UMutableCPTrie * U_EXPORT2
-umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) {
- if (U_FAILURE(*pErrorCode)) {
- return nullptr;
- }
- if (map == nullptr) {
- *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPMap(map, *pErrorCode));
-}
-
-U_CAPI UMutableCPTrie * U_EXPORT2
-umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) {
- if (U_FAILURE(*pErrorCode)) {
- return nullptr;
- }
- if (trie == nullptr) {
- *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPTrie(trie, *pErrorCode));
-}
-
-U_CAPI uint32_t U_EXPORT2
-umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) {
- return reinterpret_cast<const MutableCodePointTrie *>(trie)->get(c);
-}
-
-namespace {
-
-UChar32 getRange(const void *trie, UChar32 start,
- UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
- return reinterpret_cast<const MutableCodePointTrie *>(trie)->
- getRange(start, filter, context, pValue);
-}
-
-} // namespace
-
-U_CAPI UChar32 U_EXPORT2
-umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
- UCPMapRangeOption option, uint32_t surrogateValue,
- UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
- return ucptrie_internalGetRange(getRange, trie, start,
- option, surrogateValue,
- filter, context, pValue);
-}
-
-U_CAPI void U_EXPORT2
-umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
- if (U_FAILURE(*pErrorCode)) {
- return;
- }
- reinterpret_cast<MutableCodePointTrie *>(trie)->set(c, value, *pErrorCode);
-}
-
-U_CAPI void U_EXPORT2
-umutablecptrie_setRange(UMutableCPTrie *trie, UChar32 start, UChar32 end,
- uint32_t value, UErrorCode *pErrorCode) {
- if (U_FAILURE(*pErrorCode)) {
- return;
- }
- reinterpret_cast<MutableCodePointTrie *>(trie)->setRange(start, end, value, *pErrorCode);
-}
-
-/* Compact and internally serialize the trie. */
-U_CAPI UCPTrie * U_EXPORT2
-umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth,
- UErrorCode *pErrorCode) {
- if (U_FAILURE(*pErrorCode)) {
- return nullptr;
- }
- return reinterpret_cast<MutableCodePointTrie *>(trie)->build(type, valueWidth, *pErrorCode);
-}
-
-#ifdef UCPTRIE_DEBUG
-U_CFUNC void umutablecptrie_setName(UMutableCPTrie *trie, const char *name) {
- reinterpret_cast<MutableCodePointTrie *>(trie)->name = name;
-}
-#endif
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// umutablecptrie.cpp (inspired by utrie2_builder.cpp)
+// created: 2017dec29 Markus W. Scherer
+
+// #define UCPTRIE_DEBUG
+#ifdef UCPTRIE_DEBUG
+# include <stdio.h>
+#endif
+
+#include "unicode/utypes.h"
+#include "unicode/ucptrie.h"
+#include "unicode/umutablecptrie.h"
+#include "unicode/uobject.h"
+#include "unicode/utf16.h"
+#include "cmemory.h"
+#include "uassert.h"
+#include "ucptrie_impl.h"
+
+// ICU-20235 In case Microsoft math.h has defined this, undefine it.
+#ifdef OVERFLOW
+#undef OVERFLOW
+#endif
+
+U_NAMESPACE_BEGIN
+
+namespace {
+
+constexpr int32_t MAX_UNICODE = 0x10ffff;
+
+constexpr int32_t UNICODE_LIMIT = 0x110000;
+constexpr int32_t BMP_LIMIT = 0x10000;
+constexpr int32_t ASCII_LIMIT = 0x80;
+
+constexpr int32_t I_LIMIT = UNICODE_LIMIT >> UCPTRIE_SHIFT_3;
+constexpr int32_t BMP_I_LIMIT = BMP_LIMIT >> UCPTRIE_SHIFT_3;
+constexpr int32_t ASCII_I_LIMIT = ASCII_LIMIT >> UCPTRIE_SHIFT_3;
+
+constexpr int32_t SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3));
+
+// Flag values for data blocks.
+constexpr uint8_t ALL_SAME = 0;
+constexpr uint8_t MIXED = 1;
+constexpr uint8_t SAME_AS = 2;
+
+/** Start with allocation of 16k data entries. */
+constexpr int32_t INITIAL_DATA_LENGTH = ((int32_t)1 << 14);
+
+/** Grow about 8x each time. */
+constexpr int32_t MEDIUM_DATA_LENGTH = ((int32_t)1 << 17);
+
+/**
+ * Maximum length of the build-time data array.
+ * One entry per 0x110000 code points.
+ */
+constexpr int32_t MAX_DATA_LENGTH = UNICODE_LIMIT;
+
+// Flag values for index-3 blocks while compacting/building.
+constexpr uint8_t I3_NULL = 0;
+constexpr uint8_t I3_BMP = 1;
+constexpr uint8_t I3_16 = 2;
+constexpr uint8_t I3_18 = 3;
+
+constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8;
+
+class AllSameBlocks;
+class MixedBlocks;
+
+class MutableCodePointTrie : public UMemory {
+public:
+ MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode);
+ MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode);
+ MutableCodePointTrie(const MutableCodePointTrie &other) = delete;
+ ~MutableCodePointTrie();
+
+ MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete;
+
+ static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode);
+ static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode);
+
+ uint32_t get(UChar32 c) const;
+ int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context,
+ uint32_t *pValue) const;
+
+ void set(UChar32 c, uint32_t value, UErrorCode &errorCode);
+ void setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode);
+
+ UCPTrie *build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode);
+
+private:
+ void clear();
+
+ bool ensureHighStart(UChar32 c);
+ int32_t allocDataBlock(int32_t blockLength);
+ int32_t getDataBlock(int32_t i);
+
+ void maskValues(uint32_t mask);
+ UChar32 findHighStart() const;
+ int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks);
+ int32_t compactData(
+ int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
+ int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
+ int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
+ int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode);
+
+ uint32_t *index = nullptr;
+ int32_t indexCapacity = 0;
+ int32_t index3NullOffset = -1;
+ uint32_t *data = nullptr;
+ int32_t dataCapacity = 0;
+ int32_t dataLength = 0;
+ int32_t dataNullOffset = -1;
+
+ uint32_t origInitialValue;
+ uint32_t initialValue;
+ uint32_t errorValue;
+ UChar32 highStart;
+ uint32_t highValue;
+#ifdef UCPTRIE_DEBUG
+public:
+ const char *name;
+#endif
+private:
+ /** Temporary array while building the final data. */
+ uint16_t *index16 = nullptr;
+ uint8_t flags[UNICODE_LIMIT >> UCPTRIE_SHIFT_3];
+};
+
+MutableCodePointTrie::MutableCodePointTrie(uint32_t iniValue, uint32_t errValue, UErrorCode &errorCode) :
+ origInitialValue(iniValue), initialValue(iniValue), errorValue(errValue),
+ highStart(0), highValue(initialValue)
+#ifdef UCPTRIE_DEBUG
+ , name("open")
+#endif
+ {
+ if (U_FAILURE(errorCode)) { return; }
+ index = (uint32_t *)uprv_malloc(BMP_I_LIMIT * 4);
+ data = (uint32_t *)uprv_malloc(INITIAL_DATA_LENGTH * 4);
+ if (index == nullptr || data == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ indexCapacity = BMP_I_LIMIT;
+ dataCapacity = INITIAL_DATA_LENGTH;
+}
+
+MutableCodePointTrie::MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode) :
+ index3NullOffset(other.index3NullOffset),
+ dataNullOffset(other.dataNullOffset),
+ origInitialValue(other.origInitialValue), initialValue(other.initialValue),
+ errorValue(other.errorValue),
+ highStart(other.highStart), highValue(other.highValue)
+#ifdef UCPTRIE_DEBUG
+ , name("mutable clone")
+#endif
+ {
+ if (U_FAILURE(errorCode)) { return; }
+ int32_t iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT;
+ index = (uint32_t *)uprv_malloc(iCapacity * 4);
+ data = (uint32_t *)uprv_malloc(other.dataCapacity * 4);
+ if (index == nullptr || data == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ indexCapacity = iCapacity;
+ dataCapacity = other.dataCapacity;
+
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ uprv_memcpy(flags, other.flags, iLimit);
+ uprv_memcpy(index, other.index, iLimit * 4);
+ uprv_memcpy(data, other.data, (size_t)other.dataLength * 4);
+ dataLength = other.dataLength;
+ U_ASSERT(other.index16 == nullptr);
+}
+
+MutableCodePointTrie::~MutableCodePointTrie() {
+ uprv_free(index);
+ uprv_free(data);
+ uprv_free(index16);
+}
+
+MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) {
+ // Use the highValue as the initialValue to reduce the highStart.
+ uint32_t errorValue = ucpmap_get(map, -1);
+ uint32_t initialValue = ucpmap_get(map, 0x10ffff);
+ LocalPointer<MutableCodePointTrie> mutableTrie(
+ new MutableCodePointTrie(initialValue, errorValue, errorCode),
+ errorCode);
+ if (U_FAILURE(errorCode)) {
+ return nullptr;
+ }
+ UChar32 start = 0, end;
+ uint32_t value;
+ while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
+ nullptr, nullptr, &value)) >= 0) {
+ if (value != initialValue) {
+ if (start == end) {
+ mutableTrie->set(start, value, errorCode);
+ } else {
+ mutableTrie->setRange(start, end, value, errorCode);
+ }
+ }
+ start = end + 1;
+ }
+ if (U_SUCCESS(errorCode)) {
+ return mutableTrie.orphan();
+ } else {
+ return nullptr;
+ }
+}
+
+MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) {
+ // Use the highValue as the initialValue to reduce the highStart.
+ uint32_t errorValue;
+ uint32_t initialValue;
+ switch (trie->valueWidth) {
+ case UCPTRIE_VALUE_BITS_16:
+ errorValue = trie->data.ptr16[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
+ initialValue = trie->data.ptr16[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
+ break;
+ case UCPTRIE_VALUE_BITS_32:
+ errorValue = trie->data.ptr32[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
+ initialValue = trie->data.ptr32[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
+ break;
+ case UCPTRIE_VALUE_BITS_8:
+ errorValue = trie->data.ptr8[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
+ initialValue = trie->data.ptr8[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
+ break;
+ default:
+ // Unreachable if the trie is properly initialized.
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ LocalPointer<MutableCodePointTrie> mutableTrie(
+ new MutableCodePointTrie(initialValue, errorValue, errorCode),
+ errorCode);
+ if (U_FAILURE(errorCode)) {
+ return nullptr;
+ }
+ UChar32 start = 0, end;
+ uint32_t value;
+ while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
+ nullptr, nullptr, &value)) >= 0) {
+ if (value != initialValue) {
+ if (start == end) {
+ mutableTrie->set(start, value, errorCode);
+ } else {
+ mutableTrie->setRange(start, end, value, errorCode);
+ }
+ }
+ start = end + 1;
+ }
+ if (U_SUCCESS(errorCode)) {
+ return mutableTrie.orphan();
+ } else {
+ return nullptr;
+ }
+}
+
+void MutableCodePointTrie::clear() {
+ index3NullOffset = dataNullOffset = -1;
+ dataLength = 0;
+ highValue = initialValue = origInitialValue;
+ highStart = 0;
+ uprv_free(index16);
+ index16 = nullptr;
+}
+
+uint32_t MutableCodePointTrie::get(UChar32 c) const {
+ if ((uint32_t)c > MAX_UNICODE) {
+ return errorValue;
+ }
+ if (c >= highStart) {
+ return highValue;
+ }
+ int32_t i = c >> UCPTRIE_SHIFT_3;
+ if (flags[i] == ALL_SAME) {
+ return index[i];
+ } else {
+ return data[index[i] + (c & UCPTRIE_SMALL_DATA_MASK)];
+ }
+}
+
+inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue,
+ UCPMapValueFilter *filter, const void *context) {
+ if (value == initialValue) {
+ value = nullValue;
+ } else if (filter != nullptr) {
+ value = filter(context, value);
+ }
+ return value;
+}
+
+UChar32 MutableCodePointTrie::getRange(
+ UChar32 start, UCPMapValueFilter *filter, const void *context,
+ uint32_t *pValue) const {
+ if ((uint32_t)start > MAX_UNICODE) {
+ return U_SENTINEL;
+ }
+ if (start >= highStart) {
+ if (pValue != nullptr) {
+ uint32_t value = highValue;
+ if (filter != nullptr) { value = filter(context, value); }
+ *pValue = value;
+ }
+ return MAX_UNICODE;
+ }
+ uint32_t nullValue = initialValue;
+ if (filter != nullptr) { nullValue = filter(context, nullValue); }
+ UChar32 c = start;
+ uint32_t trieValue, value;
+ bool haveValue = false;
+ int32_t i = c >> UCPTRIE_SHIFT_3;
+ do {
+ if (flags[i] == ALL_SAME) {
+ uint32_t trieValue2 = index[i];
+ if (haveValue) {
+ if (trieValue2 != trieValue) {
+ if (filter == nullptr ||
+ maybeFilterValue(trieValue2, initialValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ }
+ trieValue = trieValue2; // may or may not help
+ }
+ } else {
+ trieValue = trieValue2;
+ value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
+ if (pValue != nullptr) { *pValue = value; }
+ haveValue = true;
+ }
+ c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK;
+ } else /* MIXED */ {
+ int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK);
+ uint32_t trieValue2 = data[di];
+ if (haveValue) {
+ if (trieValue2 != trieValue) {
+ if (filter == nullptr ||
+ maybeFilterValue(trieValue2, initialValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ }
+ trieValue = trieValue2; // may or may not help
+ }
+ } else {
+ trieValue = trieValue2;
+ value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
+ if (pValue != nullptr) { *pValue = value; }
+ haveValue = true;
+ }
+ while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) {
+ trieValue2 = data[++di];
+ if (trieValue2 != trieValue) {
+ if (filter == nullptr ||
+ maybeFilterValue(trieValue2, initialValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ }
+ }
+ trieValue = trieValue2; // may or may not help
+ }
+ }
+ ++i;
+ } while (c < highStart);
+ U_ASSERT(haveValue);
+ if (maybeFilterValue(highValue, initialValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ } else {
+ return MAX_UNICODE;
+ }
+}
+
+void
+writeBlock(uint32_t *block, uint32_t value) {
+ uint32_t *limit = block + UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ while (block < limit) {
+ *block++ = value;
+ }
+}
+
+bool MutableCodePointTrie::ensureHighStart(UChar32 c) {
+ if (c >= highStart) {
+ // Round up to a UCPTRIE_CP_PER_INDEX_2_ENTRY boundary to simplify compaction.
+ c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
+ int32_t i = highStart >> UCPTRIE_SHIFT_3;
+ int32_t iLimit = c >> UCPTRIE_SHIFT_3;
+ if (iLimit > indexCapacity) {
+ uint32_t *newIndex = (uint32_t *)uprv_malloc(I_LIMIT * 4);
+ if (newIndex == nullptr) { return false; }
+ uprv_memcpy(newIndex, index, i * 4);
+ uprv_free(index);
+ index = newIndex;
+ indexCapacity = I_LIMIT;
+ }
+ do {
+ flags[i] = ALL_SAME;
+ index[i] = initialValue;
+ } while(++i < iLimit);
+ highStart = c;
+ }
+ return true;
+}
+
+int32_t MutableCodePointTrie::allocDataBlock(int32_t blockLength) {
+ int32_t newBlock = dataLength;
+ int32_t newTop = newBlock + blockLength;
+ if (newTop > dataCapacity) {
+ int32_t capacity;
+ if (dataCapacity < MEDIUM_DATA_LENGTH) {
+ capacity = MEDIUM_DATA_LENGTH;
+ } else if (dataCapacity < MAX_DATA_LENGTH) {
+ capacity = MAX_DATA_LENGTH;
+ } else {
+ // Should never occur.
+ // Either MAX_DATA_LENGTH is incorrect,
+ // or the code writes more values than should be possible.
+ return -1;
+ }
+ uint32_t *newData = (uint32_t *)uprv_malloc(capacity * 4);
+ if (newData == nullptr) {
+ return -1;
+ }
+ uprv_memcpy(newData, data, (size_t)dataLength * 4);
+ uprv_free(data);
+ data = newData;
+ dataCapacity = capacity;
+ }
+ dataLength = newTop;
+ return newBlock;
+}
+
+/**
+ * No error checking for illegal arguments.
+ *
+ * @return -1 if no new data block available (out of memory in data array)
+ * @internal
+ */
+int32_t MutableCodePointTrie::getDataBlock(int32_t i) {
+ if (flags[i] == MIXED) {
+ return index[i];
+ }
+ if (i < BMP_I_LIMIT) {
+ int32_t newBlock = allocDataBlock(UCPTRIE_FAST_DATA_BLOCK_LENGTH);
+ if (newBlock < 0) { return newBlock; }
+ int32_t iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1);
+ int32_t iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ do {
+ U_ASSERT(flags[iStart] == ALL_SAME);
+ writeBlock(data + newBlock, index[iStart]);
+ flags[iStart] = MIXED;
+ index[iStart++] = newBlock;
+ newBlock += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ } while (iStart < iLimit);
+ return index[i];
+ } else {
+ int32_t newBlock = allocDataBlock(UCPTRIE_SMALL_DATA_BLOCK_LENGTH);
+ if (newBlock < 0) { return newBlock; }
+ writeBlock(data + newBlock, index[i]);
+ flags[i] = MIXED;
+ index[i] = newBlock;
+ return newBlock;
+ }
+}
+
+void MutableCodePointTrie::set(UChar32 c, uint32_t value, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) {
+ return;
+ }
+ if ((uint32_t)c > MAX_UNICODE) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+
+ int32_t block;
+ if (!ensureHighStart(c) || (block = getDataBlock(c >> UCPTRIE_SHIFT_3)) < 0) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ data[block + (c & UCPTRIE_SMALL_DATA_MASK)] = value;
+}
+
+void
+fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value) {
+ uint32_t *pLimit = block + limit;
+ block += start;
+ while (block < pLimit) {
+ *block++ = value;
+ }
+}
+
+void MutableCodePointTrie::setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) {
+ return;
+ }
+ if ((uint32_t)start > MAX_UNICODE || (uint32_t)end > MAX_UNICODE || start > end) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+ if (!ensureHighStart(end)) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ UChar32 limit = end + 1;
+ if (start & UCPTRIE_SMALL_DATA_MASK) {
+ // Set partial block at [start..following block boundary[.
+ int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
+ if (block < 0) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ UChar32 nextStart = (start + UCPTRIE_SMALL_DATA_MASK) & ~UCPTRIE_SMALL_DATA_MASK;
+ if (nextStart <= limit) {
+ fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, UCPTRIE_SMALL_DATA_BLOCK_LENGTH,
+ value);
+ start = nextStart;
+ } else {
+ fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, limit & UCPTRIE_SMALL_DATA_MASK,
+ value);
+ return;
+ }
+ }
+
+ // Number of positions in the last, partial block.
+ int32_t rest = limit & UCPTRIE_SMALL_DATA_MASK;
+
+ // Round down limit to a block boundary.
+ limit &= ~UCPTRIE_SMALL_DATA_MASK;
+
+ // Iterate over all-value blocks.
+ while (start < limit) {
+ int32_t i = start >> UCPTRIE_SHIFT_3;
+ if (flags[i] == ALL_SAME) {
+ index[i] = value;
+ } else /* MIXED */ {
+ fillBlock(data + index[i], 0, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, value);
+ }
+ start += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ }
+
+ if (rest > 0) {
+ // Set partial block at [last block boundary..limit[.
+ int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
+ if (block < 0) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ fillBlock(data + block, 0, rest, value);
+ }
+}
+
+/* compaction --------------------------------------------------------------- */
+
+void MutableCodePointTrie::maskValues(uint32_t mask) {
+ initialValue &= mask;
+ errorValue &= mask;
+ highValue &= mask;
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ for (int32_t i = 0; i < iLimit; ++i) {
+ if (flags[i] == ALL_SAME) {
+ index[i] &= mask;
+ }
+ }
+ for (int32_t i = 0; i < dataLength; ++i) {
+ data[i] &= mask;
+ }
+}
+
+template<typename UIntA, typename UIntB>
+bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) {
+ while (length > 0 && *s == *t) {
+ ++s;
+ ++t;
+ --length;
+ }
+ return length == 0;
+}
+
+bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) {
+ const uint32_t *pLimit = p + length;
+ while (p < pLimit && *p == value) { ++p; }
+ return p == pLimit;
+}
+
+/** Search for an identical block. */
+int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
+ const uint16_t *q, int32_t qStart, int32_t blockLength) {
+ // Ensure that we do not even partially get past length.
+ length -= blockLength;
+
+ q += qStart;
+ while (pStart <= length) {
+ if (equalBlocks(p + pStart, q, blockLength)) {
+ return pStart;
+ }
+ ++pStart;
+ }
+ return -1;
+}
+
+int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit,
+ uint32_t value, int32_t blockLength) {
+ // Ensure that we do not even partially get past limit.
+ limit -= blockLength;
+
+ for (int32_t block = start; block <= limit; ++block) {
+ if (p[block] == value) {
+ for (int32_t i = 1;; ++i) {
+ if (i == blockLength) {
+ return block;
+ }
+ if (p[block + i] != value) {
+ block += i;
+ break;
+ }
+ }
+ }
+ }
+ return -1;
+}
+
+/**
+ * Look for maximum overlap of the beginning of the other block
+ * with the previous, adjacent block.
+ */
+template<typename UIntA, typename UIntB>
+int32_t getOverlap(const UIntA *p, int32_t length,
+ const UIntB *q, int32_t qStart, int32_t blockLength) {
+ int32_t overlap = blockLength - 1;
+ U_ASSERT(overlap <= length);
+ q += qStart;
+ while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
+ --overlap;
+ }
+ return overlap;
+}
+
+int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value,
+ int32_t blockLength) {
+ int32_t min = length - (blockLength - 1);
+ int32_t i = length;
+ while (min < i && p[i - 1] == value) { --i; }
+ return length - i;
+}
+
+bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) {
+ for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
+ if (index[i] == dataOffset) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/**
+ * Finds the start of the last range in the trie by enumerating backward.
+ * Indexes for code points higher than this will be omitted.
+ */
+UChar32 MutableCodePointTrie::findHighStart() const {
+ int32_t i = highStart >> UCPTRIE_SHIFT_3;
+ while (i > 0) {
+ bool match;
+ if (flags[--i] == ALL_SAME) {
+ match = index[i] == highValue;
+ } else /* MIXED */ {
+ const uint32_t *p = data + index[i];
+ for (int32_t j = 0;; ++j) {
+ if (j == UCPTRIE_SMALL_DATA_BLOCK_LENGTH) {
+ match = true;
+ break;
+ }
+ if (p[j] != highValue) {
+ match = false;
+ break;
+ }
+ }
+ }
+ if (!match) {
+ return (i + 1) << UCPTRIE_SHIFT_3;
+ }
+ }
+ return 0;
+}
+
+class AllSameBlocks {
+public:
+ static constexpr int32_t NEW_UNIQUE = -1;
+ static constexpr int32_t OVERFLOW = -2;
+
+ AllSameBlocks() : length(0), mostRecent(-1) {}
+
+ int32_t findOrAdd(int32_t index, int32_t count, uint32_t value) {
+ if (mostRecent >= 0 && values[mostRecent] == value) {
+ refCounts[mostRecent] += count;
+ return indexes[mostRecent];
+ }
+ for (int32_t i = 0; i < length; ++i) {
+ if (values[i] == value) {
+ mostRecent = i;
+ refCounts[i] += count;
+ return indexes[i];
+ }
+ }
+ if (length == CAPACITY) {
+ return OVERFLOW;
+ }
+ mostRecent = length;
+ indexes[length] = index;
+ values[length] = value;
+ refCounts[length++] = count;
+ return NEW_UNIQUE;
+ }
+
+ /** Replaces the block which has the lowest reference count. */
+ void add(int32_t index, int32_t count, uint32_t value) {
+ U_ASSERT(length == CAPACITY);
+ int32_t least = -1;
+ int32_t leastCount = I_LIMIT;
+ for (int32_t i = 0; i < length; ++i) {
+ U_ASSERT(values[i] != value);
+ if (refCounts[i] < leastCount) {
+ least = i;
+ leastCount = refCounts[i];
+ }
+ }
+ U_ASSERT(least >= 0);
+ mostRecent = least;
+ indexes[least] = index;
+ values[least] = value;
+ refCounts[least] = count;
+ }
+
+ int32_t findMostUsed() const {
+ if (length == 0) { return -1; }
+ int32_t max = -1;
+ int32_t maxCount = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ if (refCounts[i] > maxCount) {
+ max = i;
+ maxCount = refCounts[i];
+ }
+ }
+ return indexes[max];
+ }
+
+private:
+ static constexpr int32_t CAPACITY = 32;
+
+ int32_t length;
+ int32_t mostRecent;
+
+ int32_t indexes[CAPACITY];
+ uint32_t values[CAPACITY];
+ int32_t refCounts[CAPACITY];
+};
+
+// Custom hash table for mixed-value blocks to be found anywhere in the
+// compacted data or index so far.
+class MixedBlocks {
+public:
+ MixedBlocks() {}
+ ~MixedBlocks() {
+ uprv_free(table);
+ }
+
+ bool init(int32_t maxLength, int32_t newBlockLength) {
+ // We store actual data indexes + 1 to reserve 0 for empty entries.
+ int32_t maxDataIndex = maxLength - newBlockLength + 1;
+ int32_t newLength;
+ if (maxDataIndex <= 0xfff) { // 4k
+ newLength = 6007;
+ shift = 12;
+ mask = 0xfff;
+ } else if (maxDataIndex <= 0x7fff) { // 32k
+ newLength = 50021;
+ shift = 15;
+ mask = 0x7fff;
+ } else if (maxDataIndex <= 0x1ffff) { // 128k
+ newLength = 200003;
+ shift = 17;
+ mask = 0x1ffff;
+ } else {
+ // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M
+ newLength = 1500007;
+ shift = 21;
+ mask = 0x1fffff;
+ }
+ if (newLength > capacity) {
+ uprv_free(table);
+ table = (uint32_t *)uprv_malloc(newLength * 4);
+ if (table == nullptr) {
+ return false;
+ }
+ capacity = newLength;
+ }
+ length = newLength;
+ uprv_memset(table, 0, length * 4);
+
+ blockLength = newBlockLength;
+ return true;
+ }
+
+ template<typename UInt>
+ void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) {
+ int32_t start = prevDataLength - blockLength;
+ if (start >= minStart) {
+ ++start; // Skip the last block that we added last time.
+ } else {
+ start = minStart; // Begin with the first full block.
+ }
+ for (int32_t end = newDataLength - blockLength; start <= end; ++start) {
+ uint32_t hashCode = makeHashCode(data, start);
+ addEntry(data, start, hashCode, start);
+ }
+ }
+
+ template<typename UIntA, typename UIntB>
+ int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const {
+ uint32_t hashCode = makeHashCode(blockData, blockStart);
+ int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode);
+ if (entryIndex >= 0) {
+ return (table[entryIndex] & mask) - 1;
+ } else {
+ return -1;
+ }
+ }
+
+ int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const {
+ uint32_t hashCode = makeHashCode(blockValue);
+ int32_t entryIndex = findEntry(data, blockValue, hashCode);
+ if (entryIndex >= 0) {
+ return (table[entryIndex] & mask) - 1;
+ } else {
+ return -1;
+ }
+ }
+
+private:
+ template<typename UInt>
+ uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const {
+ int32_t blockLimit = blockStart + blockLength;
+ uint32_t hashCode = blockData[blockStart++];
+ do {
+ hashCode = 37 * hashCode + blockData[blockStart++];
+ } while (blockStart < blockLimit);
+ return hashCode;
+ }
+
+ uint32_t makeHashCode(uint32_t blockValue) const {
+ uint32_t hashCode = blockValue;
+ for (int32_t i = 1; i < blockLength; ++i) {
+ hashCode = 37 * hashCode + blockValue;
+ }
+ return hashCode;
+ }
+
+ template<typename UInt>
+ void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) {
+ U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask);
+ int32_t entryIndex = findEntry(data, data, blockStart, hashCode);
+ if (entryIndex < 0) {
+ table[~entryIndex] = (hashCode << shift) | (dataIndex + 1);
+ }
+ }
+
+ template<typename UIntA, typename UIntB>
+ int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart,
+ uint32_t hashCode) const {
+ uint32_t shiftedHashCode = hashCode << shift;
+ int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1
+ for (int32_t entryIndex = initialEntryIndex;;) {
+ uint32_t entry = table[entryIndex];
+ if (entry == 0) {
+ return ~entryIndex;
+ }
+ if ((entry & ~mask) == shiftedHashCode) {
+ int32_t dataIndex = (entry & mask) - 1;
+ if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) {
+ return entryIndex;
+ }
+ }
+ entryIndex = nextIndex(initialEntryIndex, entryIndex);
+ }
+ }
+
+ int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const {
+ uint32_t shiftedHashCode = hashCode << shift;
+ int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1
+ for (int32_t entryIndex = initialEntryIndex;;) {
+ uint32_t entry = table[entryIndex];
+ if (entry == 0) {
+ return ~entryIndex;
+ }
+ if ((entry & ~mask) == shiftedHashCode) {
+ int32_t dataIndex = (entry & mask) - 1;
+ if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) {
+ return entryIndex;
+ }
+ }
+ entryIndex = nextIndex(initialEntryIndex, entryIndex);
+ }
+ }
+
+ inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const {
+ // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length);
+ return (entryIndex + initialEntryIndex) % length;
+ }
+
+ // Hash table.
+ // The length is a prime number, larger than the maximum data length.
+ // The "shift" lower bits store a data index + 1.
+ // The remaining upper bits store a partial hashCode of the block data values.
+ uint32_t *table = nullptr;
+ int32_t capacity = 0;
+ int32_t length = 0;
+ int32_t shift = 0;
+ uint32_t mask = 0;
+
+ int32_t blockLength = 0;
+};
+
+int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) {
+#ifdef UCPTRIE_DEBUG
+ bool overflow = false;
+#endif
+
+ // ASCII data will be stored as a linear table, even if the following code
+ // does not yet count it that way.
+ int32_t newDataCapacity = ASCII_LIMIT;
+ // Add room for a small data null block in case it would match the start of
+ // a fast data block where dataNullOffset must not be set in that case.
+ newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ // Add room for special values (errorValue, highValue) and padding.
+ newDataCapacity += 4;
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
+ int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ for (int32_t i = 0; i < iLimit; i += inc) {
+ if (i == fastILimit) {
+ blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ inc = 1;
+ }
+ uint32_t value = index[i];
+ if (flags[i] == MIXED) {
+ // Really mixed?
+ const uint32_t *p = data + value;
+ value = *p;
+ if (allValuesSameAs(p + 1, blockLength - 1, value)) {
+ flags[i] = ALL_SAME;
+ index[i] = value;
+ // Fall through to ALL_SAME handling.
+ } else {
+ newDataCapacity += blockLength;
+ continue;
+ }
+ } else {
+ U_ASSERT(flags[i] == ALL_SAME);
+ if (inc > 1) {
+ // Do all of the fast-range data block's ALL_SAME parts have the same value?
+ bool allSame = true;
+ int32_t next_i = i + inc;
+ for (int32_t j = i + 1; j < next_i; ++j) {
+ U_ASSERT(flags[j] == ALL_SAME);
+ if (index[j] != value) {
+ allSame = false;
+ break;
+ }
+ }
+ if (!allSame) {
+ // Turn it into a MIXED block.
+ if (getDataBlock(i) < 0) {
+ return -1;
+ }
+ newDataCapacity += blockLength;
+ continue;
+ }
+ }
+ }
+ // Is there another ALL_SAME block with the same value?
+ int32_t other = allSameBlocks.findOrAdd(i, inc, value);
+ if (other == AllSameBlocks::OVERFLOW) {
+ // The fixed-size array overflowed. Slow check for a duplicate block.
+#ifdef UCPTRIE_DEBUG
+ if (!overflow) {
+ puts("UCPTrie AllSameBlocks overflow");
+ overflow = true;
+ }
+#endif
+ int32_t jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ for (int32_t j = 0;; j += jInc) {
+ if (j == i) {
+ allSameBlocks.add(i, inc, value);
+ break;
+ }
+ if (j == fastILimit) {
+ jInc = 1;
+ }
+ if (flags[j] == ALL_SAME && index[j] == value) {
+ allSameBlocks.add(j, jInc + inc, value);
+ other = j;
+ break;
+ // We could keep counting blocks with the same value
+ // before we add the first one, which may improve compaction in rare cases,
+ // but it would make it slower.
+ }
+ }
+ }
+ if (other >= 0) {
+ flags[i] = SAME_AS;
+ index[i] = other;
+ } else {
+ // New unique same-value block.
+ newDataCapacity += blockLength;
+ }
+ }
+ return newDataCapacity;
+}
+
+#ifdef UCPTRIE_DEBUG
+# define DEBUG_DO(expr) expr
+#else
+# define DEBUG_DO(expr)
+#endif
+
+#ifdef UCPTRIE_DEBUG
+// Braille symbols: U+28xx = UTF-8 E2 A0 80..E2 A3 BF
+int32_t appendValue(char s[], int32_t length, uint32_t value) {
+ value ^= value >> 16;
+ value ^= value >> 8;
+ s[length] = 0xE2;
+ s[length + 1] = (char)(0xA0 + ((value >> 6) & 3));
+ s[length + 2] = (char)(0x80 + (value & 0x3F));
+ return length + 3;
+}
+
+void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value,
+ UChar32 start, int32_t overlap, uint32_t initialValue) {
+ char s[UCPTRIE_FAST_DATA_BLOCK_LENGTH * 3 + 3];
+ int32_t length = 0;
+ int32_t i;
+ for (i = 0; i < overlap; ++i) {
+ length = appendValue(s, length, 0); // Braille blank
+ }
+ s[length++] = '|';
+ for (; i < blockLength; ++i) {
+ if (block != nullptr) {
+ value = block[i];
+ }
+ if (value == initialValue) {
+ value = 0x40; // Braille lower left dot
+ }
+ length = appendValue(s, length, value);
+ }
+ s[length] = 0;
+ start += overlap;
+ if (start <= 0xffff) {
+ printf(" %04lX %s|\n", (long)start, s);
+ } else if (start <= 0xfffff) {
+ printf(" %5lX %s|\n", (long)start, s);
+ } else {
+ printf(" %6lX %s|\n", (long)start, s);
+ }
+}
+#endif
+
+/**
+ * Compacts a build-time trie.
+ *
+ * The compaction
+ * - removes blocks that are identical with earlier ones
+ * - overlaps each new non-duplicate block as much as possible with the previously-written one
+ * - works with fast-range data blocks whose length is a multiple of that of
+ * higher-code-point data blocks
+ *
+ * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
+ */
+int32_t MutableCodePointTrie::compactData(
+ int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
+ int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) {
+#ifdef UCPTRIE_DEBUG
+ int32_t countSame=0, sumOverlaps=0;
+ bool printData = dataLength == 29088 /* line.brk */ ||
+ // dataLength == 30048 /* CanonIterData */ ||
+ dataLength == 50400 /* zh.txt~stroke */;
+#endif
+
+ // The linear ASCII data has been copied into newData already.
+ int32_t newDataLength = 0;
+ for (int32_t i = 0; newDataLength < ASCII_LIMIT;
+ newDataLength += UCPTRIE_FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
+ index[i] = newDataLength;
+#ifdef UCPTRIE_DEBUG
+ if (printData) {
+ printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue);
+ }
+#endif
+ }
+
+ int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
+ if (!mixedBlocks.init(newDataCapacity, blockLength)) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ mixedBlocks.extend(newData, 0, 0, newDataLength);
+
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ int32_t fastLength = 0;
+ for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) {
+ if (i == fastILimit) {
+ blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ inc = 1;
+ fastLength = newDataLength;
+ if (!mixedBlocks.init(newDataCapacity, blockLength)) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ mixedBlocks.extend(newData, 0, 0, newDataLength);
+ }
+ if (flags[i] == ALL_SAME) {
+ uint32_t value = index[i];
+ // Find an earlier part of the data array of length blockLength
+ // that is filled with this value.
+ int32_t n = mixedBlocks.findAllSameBlock(newData, value);
+ // If we find a match, and the current block is the data null block,
+ // and it is not a fast block but matches the start of a fast block,
+ // then we need to continue looking.
+ // This is because this small block is shorter than the fast block,
+ // and not all of the rest of the fast block is filled with this value.
+ // Otherwise trie.getRange() would detect that the fast block starts at
+ // dataNullOffset and assume incorrectly that it is filled with the null value.
+ while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength &&
+ isStartOfSomeFastBlock(n, index, fastILimit)) {
+ n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength);
+ }
+ if (n >= 0) {
+ DEBUG_DO(++countSame);
+ index[i] = n;
+ } else {
+ n = getAllSameOverlap(newData, newDataLength, value, blockLength);
+ DEBUG_DO(sumOverlaps += n);
+#ifdef UCPTRIE_DEBUG
+ if (printData) {
+ printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue);
+ }
+#endif
+ index[i] = newDataLength - n;
+ int32_t prevDataLength = newDataLength;
+ while (n < blockLength) {
+ newData[newDataLength++] = value;
+ ++n;
+ }
+ mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
+ }
+ } else if (flags[i] == MIXED) {
+ const uint32_t *block = data + index[i];
+ int32_t n = mixedBlocks.findBlock(newData, block, 0);
+ if (n >= 0) {
+ DEBUG_DO(++countSame);
+ index[i] = n;
+ } else {
+ n = getOverlap(newData, newDataLength, block, 0, blockLength);
+ DEBUG_DO(sumOverlaps += n);
+#ifdef UCPTRIE_DEBUG
+ if (printData) {
+ printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue);
+ }
+#endif
+ index[i] = newDataLength - n;
+ int32_t prevDataLength = newDataLength;
+ while (n < blockLength) {
+ newData[newDataLength++] = block[n++];
+ }
+ mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
+ }
+ } else /* SAME_AS */ {
+ uint32_t j = index[i];
+ index[i] = index[j];
+ }
+ }
+
+#ifdef UCPTRIE_DEBUG
+ /* we saved some space */
+ printf("compacting UCPTrie: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n",
+ (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps);
+#endif
+ return newDataLength;
+}
+
+int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks,
+ UErrorCode &errorCode) {
+ int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3);
+ if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) {
+ // Only the linear fast index, no multi-stage index tables.
+ index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
+ return fastIndexLength;
+ }
+
+ // Condense the fast index table.
+ // Also, does it contain an index-3 block with all dataNullOffset?
+ uint16_t fastIndex[UCPTRIE_BMP_INDEX_LENGTH]; // fastIndexLength
+ int32_t i3FirstNull = -1;
+ for (int32_t i = 0, j = 0; i < fastILimit; ++j) {
+ uint32_t i3 = index[i];
+ fastIndex[j] = (uint16_t)i3;
+ if (i3 == (uint32_t)dataNullOffset) {
+ if (i3FirstNull < 0) {
+ i3FirstNull = j;
+ } else if (index3NullOffset < 0 &&
+ (j - i3FirstNull + 1) == UCPTRIE_INDEX_3_BLOCK_LENGTH) {
+ index3NullOffset = i3FirstNull;
+ }
+ } else {
+ i3FirstNull = -1;
+ }
+ // Set the index entries that compactData() skipped.
+ // Needed when the multi-stage index covers the fast index range as well.
+ int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ while (++i < iNext) {
+ i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ index[i] = i3;
+ }
+ }
+
+ if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength);
+
+ // Examine index-3 blocks. For each determine one of:
+ // - same as the index-3 null block
+ // - same as a fast-index block
+ // - 16-bit indexes
+ // - 18-bit indexes
+ // We store this in the first flags entry for the index-3 block.
+ //
+ // Also determine an upper limit for the index-3 table length.
+ int32_t index3Capacity = 0;
+ i3FirstNull = index3NullOffset;
+ bool hasLongI3Blocks = false;
+ // If the fast index covers the whole BMP, then
+ // the multi-stage index is only for supplementary code points.
+ // Otherwise, the multi-stage index covers all of Unicode.
+ int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT;
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ for (int32_t i = iStart; i < iLimit;) {
+ int32_t j = i;
+ int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
+ uint32_t oredI3 = 0;
+ bool isNull = true;
+ do {
+ uint32_t i3 = index[j];
+ oredI3 |= i3;
+ if (i3 != (uint32_t)dataNullOffset) {
+ isNull = false;
+ }
+ } while (++j < jLimit);
+ if (isNull) {
+ flags[i] = I3_NULL;
+ if (i3FirstNull < 0) {
+ if (oredI3 <= 0xffff) {
+ index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
+ } else {
+ index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
+ hasLongI3Blocks = true;
+ }
+ i3FirstNull = 0;
+ }
+ } else {
+ if (oredI3 <= 0xffff) {
+ int32_t n = mixedBlocks.findBlock(fastIndex, index, i);
+ if (n >= 0) {
+ flags[i] = I3_BMP;
+ index[i] = n;
+ } else {
+ flags[i] = I3_16;
+ index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
+ }
+ } else {
+ flags[i] = I3_18;
+ index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
+ hasLongI3Blocks = true;
+ }
+ }
+ i = j;
+ }
+
+ int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3;
+
+ // Length of the index-1 table, rounded up.
+ int32_t index1Length = (index2Capacity + UCPTRIE_INDEX_2_MASK) >> UCPTRIE_SHIFT_1_2;
+
+ // Index table: Fast index, index-1, index-3, index-2.
+ // +1 for possible index table padding.
+ int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1;
+ index16 = (uint16_t *)uprv_malloc(index16Capacity * 2);
+ if (index16 == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ uprv_memcpy(index16, fastIndex, fastIndexLength * 2);
+
+ if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ MixedBlocks longI3Blocks;
+ if (hasLongI3Blocks) {
+ if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ }
+
+ // Compact the index-3 table and write an uncompacted version of the index-2 table.
+ uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity
+ int32_t i2Length = 0;
+ i3FirstNull = index3NullOffset;
+ int32_t index3Start = fastIndexLength + index1Length;
+ int32_t indexLength = index3Start;
+ for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) {
+ int32_t i3;
+ uint8_t f = flags[i];
+ if (f == I3_NULL && i3FirstNull < 0) {
+ // First index-3 null block. Write & overlap it like a normal block, then remember it.
+ f = dataNullOffset <= 0xffff ? I3_16 : I3_18;
+ i3FirstNull = 0;
+ }
+ if (f == I3_NULL) {
+ i3 = index3NullOffset;
+ } else if (f == I3_BMP) {
+ i3 = index[i];
+ } else if (f == I3_16) {
+ int32_t n = mixedBlocks.findBlock(index16, index, i);
+ if (n >= 0) {
+ i3 = n;
+ } else {
+ if (indexLength == index3Start) {
+ // No overlap at the boundary between the index-1 and index-3 tables.
+ n = 0;
+ } else {
+ n = getOverlap(index16, indexLength,
+ index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
+ }
+ i3 = indexLength - n;
+ int32_t prevIndexLength = indexLength;
+ while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) {
+ index16[indexLength++] = index[i + n++];
+ }
+ mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
+ if (hasLongI3Blocks) {
+ longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
+ }
+ }
+ } else {
+ U_ASSERT(f == I3_18);
+ U_ASSERT(hasLongI3Blocks);
+ // Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
+ int32_t j = i;
+ int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
+ int32_t k = indexLength;
+ do {
+ ++k;
+ uint32_t v = index[j++];
+ uint32_t upperBits = (v & 0x30000) >> 2;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 4;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 6;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 8;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 10;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 12;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 14;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 16;
+ index16[k++] = v;
+ index16[k - 9] = upperBits;
+ } while (j < jLimit);
+ int32_t n = longI3Blocks.findBlock(index16, index16, indexLength);
+ if (n >= 0) {
+ i3 = n | 0x8000;
+ } else {
+ if (indexLength == index3Start) {
+ // No overlap at the boundary between the index-1 and index-3 tables.
+ n = 0;
+ } else {
+ n = getOverlap(index16, indexLength,
+ index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
+ }
+ i3 = (indexLength - n) | 0x8000;
+ int32_t prevIndexLength = indexLength;
+ if (n > 0) {
+ int32_t start = indexLength;
+ while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
+ index16[indexLength++] = index16[start + n++];
+ }
+ } else {
+ indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
+ }
+ mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
+ if (hasLongI3Blocks) {
+ longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
+ }
+ }
+ }
+ if (index3NullOffset < 0 && i3FirstNull >= 0) {
+ index3NullOffset = i3;
+ }
+ // Set the index-2 table entry.
+ index2[i2Length++] = i3;
+ }
+ U_ASSERT(i2Length == index2Capacity);
+ U_ASSERT(indexLength <= index3Start + index3Capacity);
+
+ if (index3NullOffset < 0) {
+ index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
+ }
+ if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
+ // The index-3 offsets exceed 15 bits, or
+ // the last one cannot be distinguished from the no-null-block value.
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ // Compact the index-2 table and write the index-1 table.
+ static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH,
+ "must re-init mixedBlocks");
+ int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH;
+ int32_t i1 = fastIndexLength;
+ for (int32_t i = 0; i < i2Length; i += blockLength) {
+ int32_t n;
+ if ((i2Length - i) >= blockLength) {
+ // normal block
+ U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH);
+ n = mixedBlocks.findBlock(index16, index2, i);
+ } else {
+ // highStart is inside the last index-2 block. Shorten it.
+ blockLength = i2Length - i;
+ n = findSameBlock(index16, index3Start, indexLength,
+ index2, i, blockLength);
+ }
+ int32_t i2;
+ if (n >= 0) {
+ i2 = n;
+ } else {
+ if (indexLength == index3Start) {
+ // No overlap at the boundary between the index-1 and index-3/2 tables.
+ n = 0;
+ } else {
+ n = getOverlap(index16, indexLength, index2, i, blockLength);
+ }
+ i2 = indexLength - n;
+ int32_t prevIndexLength = indexLength;
+ while (n < blockLength) {
+ index16[indexLength++] = index2[i + n++];
+ }
+ mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
+ }
+ // Set the index-1 table entry.
+ index16[i1++] = i2;
+ }
+ U_ASSERT(i1 == index3Start);
+ U_ASSERT(indexLength <= index16Capacity);
+
+#ifdef UCPTRIE_DEBUG
+ /* we saved some space */
+ printf("compacting UCPTrie: count of 16-bit index words %lu->%lu\n",
+ (long)iLimit, (long)indexLength);
+#endif
+
+ return indexLength;
+}
+
+int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorCode) {
+ // Find the real highStart and round it up.
+ U_ASSERT((highStart & (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) == 0);
+ highValue = get(MAX_UNICODE);
+ int32_t realHighStart = findHighStart();
+ realHighStart = (realHighStart + (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) &
+ ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
+ if (realHighStart == UNICODE_LIMIT) {
+ highValue = initialValue;
+ }
+
+#ifdef UCPTRIE_DEBUG
+ printf("UCPTrie: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n",
+ (long)realHighStart, (long)highValue, (long)initialValue);
+#endif
+
+ // We always store indexes and data values for the fast range.
+ // Pin highStart to the top of that range while building.
+ UChar32 fastLimit = fastILimit << UCPTRIE_SHIFT_3;
+ if (realHighStart < fastLimit) {
+ for (int32_t i = (realHighStart >> UCPTRIE_SHIFT_3); i < fastILimit; ++i) {
+ flags[i] = ALL_SAME;
+ index[i] = highValue;
+ }
+ highStart = fastLimit;
+ } else {
+ highStart = realHighStart;
+ }
+
+ uint32_t asciiData[ASCII_LIMIT];
+ for (int32_t i = 0; i < ASCII_LIMIT; ++i) {
+ asciiData[i] = get(i);
+ }
+
+ // First we look for which data blocks have the same value repeated over the whole block,
+ // deduplicate such blocks, find a good null data block (for faster enumeration),
+ // and get an upper bound for the necessary data array length.
+ AllSameBlocks allSameBlocks;
+ int32_t newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks);
+ if (newDataCapacity < 0) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ uint32_t *newData = (uint32_t *)uprv_malloc(newDataCapacity * 4);
+ if (newData == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ uprv_memcpy(newData, asciiData, sizeof(asciiData));
+
+ int32_t dataNullIndex = allSameBlocks.findMostUsed();
+
+ MixedBlocks mixedBlocks;
+ int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity,
+ dataNullIndex, mixedBlocks, errorCode);
+ if (U_FAILURE(errorCode)) { return 0; }
+ U_ASSERT(newDataLength <= newDataCapacity);
+ uprv_free(data);
+ data = newData;
+ dataCapacity = newDataCapacity;
+ dataLength = newDataLength;
+ if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) {
+ // The offset of the last data block is too high to be stored in the index table.
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ if (dataNullIndex >= 0) {
+ dataNullOffset = index[dataNullIndex];
+#ifdef UCPTRIE_DEBUG
+ if (data[dataNullOffset] != initialValue) {
+ printf("UCPTrie initialValue %lx -> more common nullValue %lx\n",
+ (long)initialValue, (long)data[dataNullOffset]);
+ }
+#endif
+ initialValue = data[dataNullOffset];
+ } else {
+ dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET;
+ }
+
+ int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode);
+ highStart = realHighStart;
+ return indexLength;
+}
+
+UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) {
+ return nullptr;
+ }
+ if (type < UCPTRIE_TYPE_FAST || UCPTRIE_TYPE_SMALL < type ||
+ valueWidth < UCPTRIE_VALUE_BITS_16 || UCPTRIE_VALUE_BITS_8 < valueWidth) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+
+ // The mutable trie always stores 32-bit values.
+ // When we build a UCPTrie for a smaller value width, we first mask off unused bits
+ // before compacting the data.
+ switch (valueWidth) {
+ case UCPTRIE_VALUE_BITS_32:
+ break;
+ case UCPTRIE_VALUE_BITS_16:
+ maskValues(0xffff);
+ break;
+ case UCPTRIE_VALUE_BITS_8:
+ maskValues(0xff);
+ break;
+ default:
+ break;
+ }
+
+ UChar32 fastLimit = type == UCPTRIE_TYPE_FAST ? BMP_LIMIT : UCPTRIE_SMALL_LIMIT;
+ int32_t indexLength = compactTrie(fastLimit >> UCPTRIE_SHIFT_3, errorCode);
+ if (U_FAILURE(errorCode)) {
+ clear();
+ return nullptr;
+ }
+
+ // Ensure data table alignment: The index length must be even for uint32_t data.
+ if (valueWidth == UCPTRIE_VALUE_BITS_32 && (indexLength & 1) != 0) {
+ index16[indexLength++] = 0xffee; // arbitrary value
+ }
+
+ // Make the total trie structure length a multiple of 4 bytes by padding the data table,
+ // and store special values as the last two data values.
+ int32_t length = indexLength * 2;
+ if (valueWidth == UCPTRIE_VALUE_BITS_16) {
+ if (((indexLength ^ dataLength) & 1) != 0) {
+ // padding
+ data[dataLength++] = errorValue;
+ }
+ if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
+ data[dataLength++] = highValue;
+ data[dataLength++] = errorValue;
+ }
+ length += dataLength * 2;
+ } else if (valueWidth == UCPTRIE_VALUE_BITS_32) {
+ // 32-bit data words never need padding to a multiple of 4 bytes.
+ if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
+ if (data[dataLength - 1] != highValue) {
+ data[dataLength++] = highValue;
+ }
+ data[dataLength++] = errorValue;
+ }
+ length += dataLength * 4;
+ } else {
+ int32_t and3 = (length + dataLength) & 3;
+ if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) {
+ // all set
+ } else if(and3 == 3 && data[dataLength - 1] == highValue) {
+ data[dataLength++] = errorValue;
+ } else {
+ while (and3 != 2) {
+ data[dataLength++] = highValue;
+ and3 = (and3 + 1) & 3;
+ }
+ data[dataLength++] = highValue;
+ data[dataLength++] = errorValue;
+ }
+ length += dataLength;
+ }
+
+ // Calculate the total length of the UCPTrie as a single memory block.
+ length += sizeof(UCPTrie);
+ U_ASSERT((length & 3) == 0);
+
+ uint8_t *bytes = (uint8_t *)uprv_malloc(length);
+ if (bytes == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ clear();
+ return nullptr;
+ }
+ UCPTrie *trie = reinterpret_cast<UCPTrie *>(bytes);
+ uprv_memset(trie, 0, sizeof(UCPTrie));
+ trie->indexLength = indexLength;
+ trie->dataLength = dataLength;
+
+ trie->highStart = highStart;
+ // Round up shifted12HighStart to a multiple of 0x1000 for easy testing from UTF-8 lead bytes.
+ // Runtime code needs to then test for the real highStart as well.
+ trie->shifted12HighStart = (highStart + 0xfff) >> 12;
+ trie->type = type;
+ trie->valueWidth = valueWidth;
+
+ trie->index3NullOffset = index3NullOffset;
+ trie->dataNullOffset = dataNullOffset;
+ trie->nullValue = initialValue;
+
+ bytes += sizeof(UCPTrie);
+
+ // Fill the index and data arrays.
+ uint16_t *dest16 = (uint16_t *)bytes;
+ trie->index = dest16;
+
+ if (highStart <= fastLimit) {
+ // Condense only the fast index from the mutable-trie index.
+ for (int32_t i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) {
+ *dest16++ = (uint16_t)index[i]; // dest16[j]
+ }
+ } else {
+ uprv_memcpy(dest16, index16, indexLength * 2);
+ dest16 += indexLength;
+ }
+ bytes += indexLength * 2;
+
+ // Write the data array.
+ const uint32_t *p = data;
+ switch (valueWidth) {
+ case UCPTRIE_VALUE_BITS_16:
+ // Write 16-bit data values.
+ trie->data.ptr16 = dest16;
+ for (int32_t i = dataLength; i > 0; --i) {
+ *dest16++ = (uint16_t)*p++;
+ }
+ break;
+ case UCPTRIE_VALUE_BITS_32:
+ // Write 32-bit data values.
+ trie->data.ptr32 = (uint32_t *)bytes;
+ uprv_memcpy(bytes, p, (size_t)dataLength * 4);
+ break;
+ case UCPTRIE_VALUE_BITS_8:
+ // Write 8-bit data values.
+ trie->data.ptr8 = bytes;
+ for (int32_t i = dataLength; i > 0; --i) {
+ *bytes++ = (uint8_t)*p++;
+ }
+ break;
+ default:
+ // Will not occur, valueWidth checked at the beginning.
+ break;
+ }
+
+#ifdef UCPTRIE_DEBUG
+ trie->name = name;
+
+ ucptrie_printLengths(trie, "");
+#endif
+
+ clear();
+ return trie;
+}
+
+} // namespace
+
+U_NAMESPACE_END
+
+U_NAMESPACE_USE
+
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ LocalPointer<MutableCodePointTrie> trie(
+ new MutableCodePointTrie(initialValue, errorValue, *pErrorCode), *pErrorCode);
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ return reinterpret_cast<UMutableCPTrie *>(trie.orphan());
+}
+
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ if (other == nullptr) {
+ return nullptr;
+ }
+ LocalPointer<MutableCodePointTrie> clone(
+ new MutableCodePointTrie(*reinterpret_cast<const MutableCodePointTrie *>(other), *pErrorCode), *pErrorCode);
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ return reinterpret_cast<UMutableCPTrie *>(clone.orphan());
+}
+
+U_CAPI void U_EXPORT2
+umutablecptrie_close(UMutableCPTrie *trie) {
+ delete reinterpret_cast<MutableCodePointTrie *>(trie);
+}
+
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ if (map == nullptr) {
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPMap(map, *pErrorCode));
+}
+
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ if (trie == nullptr) {
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPTrie(trie, *pErrorCode));
+}
+
+U_CAPI uint32_t U_EXPORT2
+umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) {
+ return reinterpret_cast<const MutableCodePointTrie *>(trie)->get(c);
+}
+
+namespace {
+
+UChar32 getRange(const void *trie, UChar32 start,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
+ return reinterpret_cast<const MutableCodePointTrie *>(trie)->
+ getRange(start, filter, context, pValue);
+}
+
+} // namespace
+
+U_CAPI UChar32 U_EXPORT2
+umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
+ UCPMapRangeOption option, uint32_t surrogateValue,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
+ return ucptrie_internalGetRange(getRange, trie, start,
+ option, surrogateValue,
+ filter, context, pValue);
+}
+
+U_CAPI void U_EXPORT2
+umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return;
+ }
+ reinterpret_cast<MutableCodePointTrie *>(trie)->set(c, value, *pErrorCode);
+}
+
+U_CAPI void U_EXPORT2
+umutablecptrie_setRange(UMutableCPTrie *trie, UChar32 start, UChar32 end,
+ uint32_t value, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return;
+ }
+ reinterpret_cast<MutableCodePointTrie *>(trie)->setRange(start, end, value, *pErrorCode);
+}
+
+/* Compact and internally serialize the trie. */
+U_CAPI UCPTrie * U_EXPORT2
+umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth,
+ UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ return reinterpret_cast<MutableCodePointTrie *>(trie)->build(type, valueWidth, *pErrorCode);
+}
+
+#ifdef UCPTRIE_DEBUG
+U_CFUNC void umutablecptrie_setName(UMutableCPTrie *trie, const char *name) {
+ reinterpret_cast<MutableCodePointTrie *>(trie)->name = name;
+}
+#endif