diff options
author | neksard <neksard@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
commit | 1d9c550e7c38e051d7961f576013a482003a70d9 (patch) | |
tree | b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/common/stringtriebuilder.cpp | |
parent | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff) | |
download | ydb-1d9c550e7c38e051d7961f576013a482003a70d9.tar.gz |
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/common/stringtriebuilder.cpp')
-rw-r--r-- | contrib/libs/icu/common/stringtriebuilder.cpp | 1230 |
1 files changed, 615 insertions, 615 deletions
diff --git a/contrib/libs/icu/common/stringtriebuilder.cpp b/contrib/libs/icu/common/stringtriebuilder.cpp index 632bf528bf..6f9cc2e5c2 100644 --- a/contrib/libs/icu/common/stringtriebuilder.cpp +++ b/contrib/libs/icu/common/stringtriebuilder.cpp @@ -1,618 +1,618 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* Copyright (C) 2010-2012, International Business Machines -* Corporation and others. All Rights Reserved. -******************************************************************************* -* file name: stringtriebuilder.cpp +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2010-2012, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: stringtriebuilder.cpp * encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2010dec24 -* created by: Markus W. Scherer -*/ - -#include "utypeinfo.h" // for 'typeid' to work -#include "unicode/utypes.h" -#include "unicode/stringtriebuilder.h" -#include "uassert.h" -#include "uhash.h" - -U_CDECL_BEGIN - -static int32_t U_CALLCONV -hashStringTrieNode(const UHashTok key) { - return icu::StringTrieBuilder::hashNode(key.pointer); -} - -static UBool U_CALLCONV -equalStringTrieNodes(const UHashTok key1, const UHashTok key2) { - return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer); -} - -U_CDECL_END - -U_NAMESPACE_BEGIN - -StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {} - -StringTrieBuilder::~StringTrieBuilder() { - deleteCompactBuilder(); -} - -void -StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) { - if(U_FAILURE(errorCode)) { - return; - } - nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL, - sizeGuess, &errorCode); - if(U_SUCCESS(errorCode)) { - if(nodes==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - } else { - uhash_setKeyDeleter(nodes, uprv_deleteUObject); - } - } -} - -void -StringTrieBuilder::deleteCompactBuilder() { - uhash_close(nodes); - nodes=NULL; -} - -void -StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength, - UErrorCode &errorCode) { - if(buildOption==USTRINGTRIE_BUILD_FAST) { - writeNode(0, elementsLength, 0); - } else /* USTRINGTRIE_BUILD_SMALL */ { - createCompactBuilder(2*elementsLength, errorCode); - Node *root=makeNode(0, elementsLength, 0, errorCode); - if(U_SUCCESS(errorCode)) { - root->markRightEdgesFirst(-1); - root->write(*this); - } - deleteCompactBuilder(); - } -} - -// Requires start<limit, -// and all strings of the [start..limit[ elements must be sorted and -// have a common prefix of length unitIndex. -int32_t -StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) { - UBool hasValue=FALSE; - int32_t value=0; - int32_t type; - if(unitIndex==getElementStringLength(start)) { - // An intermediate or final value. - value=getElementValue(start++); - if(start==limit) { - return writeValueAndFinal(value, TRUE); // final-value node - } - hasValue=TRUE; - } - // Now all [start..limit[ strings are longer than unitIndex. - int32_t minUnit=getElementUnit(start, unitIndex); - int32_t maxUnit=getElementUnit(limit-1, unitIndex); - if(minUnit==maxUnit) { - // Linear-match node: All strings have the same character at unitIndex. - int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); - writeNode(start, limit, lastUnitIndex); - // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. - int32_t length=lastUnitIndex-unitIndex; - int32_t maxLinearMatchLength=getMaxLinearMatchLength(); - while(length>maxLinearMatchLength) { - lastUnitIndex-=maxLinearMatchLength; - length-=maxLinearMatchLength; - writeElementUnits(start, lastUnitIndex, maxLinearMatchLength); - write(getMinLinearMatch()+maxLinearMatchLength-1); - } - writeElementUnits(start, unitIndex, length); - type=getMinLinearMatch()+length-1; - } else { - // Branch node. - int32_t length=countElementUnits(start, limit, unitIndex); - // length>=2 because minUnit!=maxUnit. - writeBranchSubNode(start, limit, unitIndex, length); - if(--length<getMinLinearMatch()) { - type=length; - } else { - write(length); - type=0; - } - } - return writeValueAndType(hasValue, value, type); -} - -// start<limit && all strings longer than unitIndex && -// length different units at unitIndex -int32_t -StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) { - UChar middleUnits[kMaxSplitBranchLevels]; - int32_t lessThan[kMaxSplitBranchLevels]; - int32_t ltLength=0; - while(length>getMaxBranchLinearSubNodeLength()) { - // Branch on the middle unit. - // First, find the middle unit. - int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); - // Encode the less-than branch first. - middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit - lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2); - ++ltLength; - // Continue for the greater-or-equal branch. - start=i; - length=length-length/2; - } - // For each unit, find its elements array start and whether it has a final value. - int32_t starts[kMaxBranchLinearSubNodeLength]; - UBool isFinal[kMaxBranchLinearSubNodeLength-1]; - int32_t unitNumber=0; - do { - int32_t i=starts[unitNumber]=start; - UChar unit=getElementUnit(i++, unitIndex); - i=indexOfElementWithNextUnit(i, unitIndex, unit); - isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start); - start=i; - } while(++unitNumber<length-1); - // unitNumber==length-1, and the maxUnit elements range is [start..limit[ - starts[unitNumber]=start; - - // Write the sub-nodes in reverse order: The jump lengths are deltas from - // after their own positions, so if we wrote the minUnit sub-node first, - // then its jump delta would be larger. - // Instead we write the minUnit sub-node last, for a shorter delta. - int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1]; - do { - --unitNumber; - if(!isFinal[unitNumber]) { - jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1); - } - } while(unitNumber>0); - // The maxUnit sub-node is written as the very last one because we do - // not jump for it at all. - unitNumber=length-1; - writeNode(start, limit, unitIndex+1); - int32_t offset=write(getElementUnit(start, unitIndex)); - // Write the rest of this node's unit-value pairs. - while(--unitNumber>=0) { - start=starts[unitNumber]; - int32_t value; - if(isFinal[unitNumber]) { - // Write the final value for the one string ending with this unit. - value=getElementValue(start); - } else { - // Write the delta to the start position of the sub-node. - value=offset-jumpTargets[unitNumber]; - } - writeValueAndFinal(value, isFinal[unitNumber]); - offset=write(getElementUnit(start, unitIndex)); - } - // Write the split-branch nodes. - while(ltLength>0) { - --ltLength; - writeDeltaTo(lessThan[ltLength]); - offset=write(middleUnits[ltLength]); - } - return offset; -} - -// Requires start<limit, -// and all strings of the [start..limit[ elements must be sorted and -// have a common prefix of length unitIndex. -StringTrieBuilder::Node * -StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) { - if(U_FAILURE(errorCode)) { - return NULL; - } - UBool hasValue=FALSE; - int32_t value=0; - if(unitIndex==getElementStringLength(start)) { - // An intermediate or final value. - value=getElementValue(start++); - if(start==limit) { - return registerFinalValue(value, errorCode); - } - hasValue=TRUE; - } - Node *node; - // Now all [start..limit[ strings are longer than unitIndex. - int32_t minUnit=getElementUnit(start, unitIndex); - int32_t maxUnit=getElementUnit(limit-1, unitIndex); - if(minUnit==maxUnit) { - // Linear-match node: All strings have the same character at unitIndex. - int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); - Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode); - // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. - int32_t length=lastUnitIndex-unitIndex; - int32_t maxLinearMatchLength=getMaxLinearMatchLength(); - while(length>maxLinearMatchLength) { - lastUnitIndex-=maxLinearMatchLength; - length-=maxLinearMatchLength; - node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode); - nextNode=registerNode(node, errorCode); - } - node=createLinearMatchNode(start, unitIndex, length, nextNode); - } else { - // Branch node. - int32_t length=countElementUnits(start, limit, unitIndex); - // length>=2 because minUnit!=maxUnit. - Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode); - node=new BranchHeadNode(length, subNode); - } - if(hasValue && node!=NULL) { - if(matchNodesCanHaveValues()) { - ((ValueNode *)node)->setValue(value); - } else { - node=new IntermediateValueNode(value, registerNode(node, errorCode)); - } - } - return registerNode(node, errorCode); -} - -// start<limit && all strings longer than unitIndex && -// length different units at unitIndex -StringTrieBuilder::Node * -StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, - int32_t length, UErrorCode &errorCode) { - if(U_FAILURE(errorCode)) { - return NULL; - } - UChar middleUnits[kMaxSplitBranchLevels]; - Node *lessThan[kMaxSplitBranchLevels]; - int32_t ltLength=0; - while(length>getMaxBranchLinearSubNodeLength()) { - // Branch on the middle unit. - // First, find the middle unit. - int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); - // Create the less-than branch. - middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit - lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode); - ++ltLength; - // Continue for the greater-or-equal branch. - start=i; - length=length-length/2; - } - if(U_FAILURE(errorCode)) { - return NULL; - } - ListBranchNode *listNode=new ListBranchNode(); - if(listNode==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - // For each unit, find its elements array start and whether it has a final value. - int32_t unitNumber=0; - do { - int32_t i=start; - UChar unit=getElementUnit(i++, unitIndex); - i=indexOfElementWithNextUnit(i, unitIndex, unit); - if(start==i-1 && unitIndex+1==getElementStringLength(start)) { - listNode->add(unit, getElementValue(start)); - } else { - listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode)); - } - start=i; - } while(++unitNumber<length-1); - // unitNumber==length-1, and the maxUnit elements range is [start..limit[ - UChar unit=getElementUnit(start, unitIndex); - if(start==limit-1 && unitIndex+1==getElementStringLength(start)) { - listNode->add(unit, getElementValue(start)); - } else { - listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode)); - } - Node *node=registerNode(listNode, errorCode); - // Create the split-branch nodes. - while(ltLength>0) { - --ltLength; - node=registerNode( - new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode); - } - return node; -} - -StringTrieBuilder::Node * -StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) { - if(U_FAILURE(errorCode)) { - delete newNode; - return NULL; - } - if(newNode==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - const UHashElement *old=uhash_find(nodes, newNode); - if(old!=NULL) { - delete newNode; - return (Node *)old->key.pointer; - } - // If uhash_puti() returns a non-zero value from an equivalent, previously - // registered node, then uhash_find() failed to find that and we will leak newNode. -#if U_DEBUG - int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. -#endif - uhash_puti(nodes, newNode, 1, &errorCode); - U_ASSERT(oldValue==0); - if(U_FAILURE(errorCode)) { - delete newNode; - return NULL; - } - return newNode; -} - -StringTrieBuilder::Node * -StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) { - if(U_FAILURE(errorCode)) { - return NULL; - } - FinalValueNode key(value); - const UHashElement *old=uhash_find(nodes, &key); - if(old!=NULL) { - return (Node *)old->key.pointer; - } - Node *newNode=new FinalValueNode(value); - if(newNode==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - // If uhash_puti() returns a non-zero value from an equivalent, previously - // registered node, then uhash_find() failed to find that and we will leak newNode. -#if U_DEBUG - int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. -#endif - uhash_puti(nodes, newNode, 1, &errorCode); - U_ASSERT(oldValue==0); - if(U_FAILURE(errorCode)) { - delete newNode; - return NULL; - } - return newNode; -} - +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010dec24 +* created by: Markus W. Scherer +*/ + +#include "utypeinfo.h" // for 'typeid' to work +#include "unicode/utypes.h" +#include "unicode/stringtriebuilder.h" +#include "uassert.h" +#include "uhash.h" + +U_CDECL_BEGIN + +static int32_t U_CALLCONV +hashStringTrieNode(const UHashTok key) { + return icu::StringTrieBuilder::hashNode(key.pointer); +} + +static UBool U_CALLCONV +equalStringTrieNodes(const UHashTok key1, const UHashTok key2) { + return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer); +} + +U_CDECL_END + +U_NAMESPACE_BEGIN + +StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {} + +StringTrieBuilder::~StringTrieBuilder() { + deleteCompactBuilder(); +} + +void +StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return; + } + nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL, + sizeGuess, &errorCode); + if(U_SUCCESS(errorCode)) { + if(nodes==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + } else { + uhash_setKeyDeleter(nodes, uprv_deleteUObject); + } + } +} + +void +StringTrieBuilder::deleteCompactBuilder() { + uhash_close(nodes); + nodes=NULL; +} + +void +StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength, + UErrorCode &errorCode) { + if(buildOption==USTRINGTRIE_BUILD_FAST) { + writeNode(0, elementsLength, 0); + } else /* USTRINGTRIE_BUILD_SMALL */ { + createCompactBuilder(2*elementsLength, errorCode); + Node *root=makeNode(0, elementsLength, 0, errorCode); + if(U_SUCCESS(errorCode)) { + root->markRightEdgesFirst(-1); + root->write(*this); + } + deleteCompactBuilder(); + } +} + +// Requires start<limit, +// and all strings of the [start..limit[ elements must be sorted and +// have a common prefix of length unitIndex. int32_t -StringTrieBuilder::hashNode(const void *node) { - return ((const Node *)node)->hashCode(); -} - -UBool -StringTrieBuilder::equalNodes(const void *left, const void *right) { - return *(const Node *)left==*(const Node *)right; -} - -UBool -StringTrieBuilder::Node::operator==(const Node &other) const { - return this==&other || (typeid(*this)==typeid(other) && hash==other.hash); -} - -int32_t -StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) { - if(offset==0) { - offset=edgeNumber; - } - return edgeNumber; -} - -UBool -StringTrieBuilder::FinalValueNode::operator==(const Node &other) const { - if(this==&other) { - return TRUE; - } - if(!Node::operator==(other)) { - return FALSE; - } - const FinalValueNode &o=(const FinalValueNode &)other; - return value==o.value; -} - -void -StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) { - offset=builder.writeValueAndFinal(value, TRUE); -} - -UBool -StringTrieBuilder::ValueNode::operator==(const Node &other) const { - if(this==&other) { - return TRUE; - } - if(!Node::operator==(other)) { - return FALSE; - } - const ValueNode &o=(const ValueNode &)other; - return hasValue==o.hasValue && (!hasValue || value==o.value); -} - -UBool -StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const { - if(this==&other) { - return TRUE; - } - if(!ValueNode::operator==(other)) { - return FALSE; - } - const IntermediateValueNode &o=(const IntermediateValueNode &)other; - return next==o.next; -} - -int32_t -StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) { - if(offset==0) { - offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); - } - return edgeNumber; -} - -void -StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) { - next->write(builder); - offset=builder.writeValueAndFinal(value, FALSE); -} - -UBool -StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const { - if(this==&other) { - return TRUE; - } - if(!ValueNode::operator==(other)) { - return FALSE; - } - const LinearMatchNode &o=(const LinearMatchNode &)other; - return length==o.length && next==o.next; -} - -int32_t -StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) { - if(offset==0) { - offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); - } - return edgeNumber; -} - -UBool -StringTrieBuilder::ListBranchNode::operator==(const Node &other) const { - if(this==&other) { - return TRUE; - } - if(!Node::operator==(other)) { - return FALSE; - } - const ListBranchNode &o=(const ListBranchNode &)other; - for(int32_t i=0; i<length; ++i) { - if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) { - return FALSE; - } - } - return TRUE; -} - -int32_t -StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) { - if(offset==0) { - firstEdgeNumber=edgeNumber; - int32_t step=0; - int32_t i=length; - do { - Node *edge=equal[--i]; - if(edge!=NULL) { - edgeNumber=edge->markRightEdgesFirst(edgeNumber-step); - } - // For all but the rightmost edge, decrement the edge number. - step=1; - } while(i>0); - offset=edgeNumber; - } - return edgeNumber; -} - -void -StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) { - // Write the sub-nodes in reverse order: The jump lengths are deltas from - // after their own positions, so if we wrote the minUnit sub-node first, - // then its jump delta would be larger. - // Instead we write the minUnit sub-node last, for a shorter delta. - int32_t unitNumber=length-1; - Node *rightEdge=equal[unitNumber]; - int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset(); - do { - --unitNumber; - if(equal[unitNumber]!=NULL) { - equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder); - } - } while(unitNumber>0); - // The maxUnit sub-node is written as the very last one because we do - // not jump for it at all. - unitNumber=length-1; - if(rightEdge==NULL) { - builder.writeValueAndFinal(values[unitNumber], TRUE); - } else { - rightEdge->write(builder); - } - offset=builder.write(units[unitNumber]); - // Write the rest of this node's unit-value pairs. - while(--unitNumber>=0) { - int32_t value; - UBool isFinal; - if(equal[unitNumber]==NULL) { - // Write the final value for the one string ending with this unit. - value=values[unitNumber]; - isFinal=TRUE; - } else { - // Write the delta to the start position of the sub-node. - U_ASSERT(equal[unitNumber]->getOffset()>0); - value=offset-equal[unitNumber]->getOffset(); - isFinal=FALSE; - } - builder.writeValueAndFinal(value, isFinal); - offset=builder.write(units[unitNumber]); - } -} - -UBool -StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const { - if(this==&other) { - return TRUE; - } - if(!Node::operator==(other)) { - return FALSE; - } - const SplitBranchNode &o=(const SplitBranchNode &)other; - return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual; -} - -int32_t -StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) { - if(offset==0) { - firstEdgeNumber=edgeNumber; - edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber); - offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1); - } - return edgeNumber; -} - -void -StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) { - // Encode the less-than branch first. - lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder); - // Encode the greater-or-equal branch last because we do not jump for it at all. - greaterOrEqual->write(builder); - // Write this node. - U_ASSERT(lessThan->getOffset()>0); - builder.writeDeltaTo(lessThan->getOffset()); // less-than - offset=builder.write(unit); -} - -UBool -StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const { - if(this==&other) { - return TRUE; - } - if(!ValueNode::operator==(other)) { - return FALSE; - } - const BranchHeadNode &o=(const BranchHeadNode &)other; - return length==o.length && next==o.next; -} - -int32_t -StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) { - if(offset==0) { - offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); - } - return edgeNumber; -} - -void -StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) { - next->write(builder); - if(length<=builder.getMinLinearMatch()) { - offset=builder.writeValueAndType(hasValue, value, length-1); - } else { - builder.write(length-1); - offset=builder.writeValueAndType(hasValue, value, 0); - } -} - -U_NAMESPACE_END +StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) { + UBool hasValue=FALSE; + int32_t value=0; + int32_t type; + if(unitIndex==getElementStringLength(start)) { + // An intermediate or final value. + value=getElementValue(start++); + if(start==limit) { + return writeValueAndFinal(value, TRUE); // final-value node + } + hasValue=TRUE; + } + // Now all [start..limit[ strings are longer than unitIndex. + int32_t minUnit=getElementUnit(start, unitIndex); + int32_t maxUnit=getElementUnit(limit-1, unitIndex); + if(minUnit==maxUnit) { + // Linear-match node: All strings have the same character at unitIndex. + int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); + writeNode(start, limit, lastUnitIndex); + // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. + int32_t length=lastUnitIndex-unitIndex; + int32_t maxLinearMatchLength=getMaxLinearMatchLength(); + while(length>maxLinearMatchLength) { + lastUnitIndex-=maxLinearMatchLength; + length-=maxLinearMatchLength; + writeElementUnits(start, lastUnitIndex, maxLinearMatchLength); + write(getMinLinearMatch()+maxLinearMatchLength-1); + } + writeElementUnits(start, unitIndex, length); + type=getMinLinearMatch()+length-1; + } else { + // Branch node. + int32_t length=countElementUnits(start, limit, unitIndex); + // length>=2 because minUnit!=maxUnit. + writeBranchSubNode(start, limit, unitIndex, length); + if(--length<getMinLinearMatch()) { + type=length; + } else { + write(length); + type=0; + } + } + return writeValueAndType(hasValue, value, type); +} + +// start<limit && all strings longer than unitIndex && +// length different units at unitIndex +int32_t +StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) { + UChar middleUnits[kMaxSplitBranchLevels]; + int32_t lessThan[kMaxSplitBranchLevels]; + int32_t ltLength=0; + while(length>getMaxBranchLinearSubNodeLength()) { + // Branch on the middle unit. + // First, find the middle unit. + int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); + // Encode the less-than branch first. + middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit + lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2); + ++ltLength; + // Continue for the greater-or-equal branch. + start=i; + length=length-length/2; + } + // For each unit, find its elements array start and whether it has a final value. + int32_t starts[kMaxBranchLinearSubNodeLength]; + UBool isFinal[kMaxBranchLinearSubNodeLength-1]; + int32_t unitNumber=0; + do { + int32_t i=starts[unitNumber]=start; + UChar unit=getElementUnit(i++, unitIndex); + i=indexOfElementWithNextUnit(i, unitIndex, unit); + isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start); + start=i; + } while(++unitNumber<length-1); + // unitNumber==length-1, and the maxUnit elements range is [start..limit[ + starts[unitNumber]=start; + + // Write the sub-nodes in reverse order: The jump lengths are deltas from + // after their own positions, so if we wrote the minUnit sub-node first, + // then its jump delta would be larger. + // Instead we write the minUnit sub-node last, for a shorter delta. + int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1]; + do { + --unitNumber; + if(!isFinal[unitNumber]) { + jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1); + } + } while(unitNumber>0); + // The maxUnit sub-node is written as the very last one because we do + // not jump for it at all. + unitNumber=length-1; + writeNode(start, limit, unitIndex+1); + int32_t offset=write(getElementUnit(start, unitIndex)); + // Write the rest of this node's unit-value pairs. + while(--unitNumber>=0) { + start=starts[unitNumber]; + int32_t value; + if(isFinal[unitNumber]) { + // Write the final value for the one string ending with this unit. + value=getElementValue(start); + } else { + // Write the delta to the start position of the sub-node. + value=offset-jumpTargets[unitNumber]; + } + writeValueAndFinal(value, isFinal[unitNumber]); + offset=write(getElementUnit(start, unitIndex)); + } + // Write the split-branch nodes. + while(ltLength>0) { + --ltLength; + writeDeltaTo(lessThan[ltLength]); + offset=write(middleUnits[ltLength]); + } + return offset; +} + +// Requires start<limit, +// and all strings of the [start..limit[ elements must be sorted and +// have a common prefix of length unitIndex. +StringTrieBuilder::Node * +StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return NULL; + } + UBool hasValue=FALSE; + int32_t value=0; + if(unitIndex==getElementStringLength(start)) { + // An intermediate or final value. + value=getElementValue(start++); + if(start==limit) { + return registerFinalValue(value, errorCode); + } + hasValue=TRUE; + } + Node *node; + // Now all [start..limit[ strings are longer than unitIndex. + int32_t minUnit=getElementUnit(start, unitIndex); + int32_t maxUnit=getElementUnit(limit-1, unitIndex); + if(minUnit==maxUnit) { + // Linear-match node: All strings have the same character at unitIndex. + int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); + Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode); + // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. + int32_t length=lastUnitIndex-unitIndex; + int32_t maxLinearMatchLength=getMaxLinearMatchLength(); + while(length>maxLinearMatchLength) { + lastUnitIndex-=maxLinearMatchLength; + length-=maxLinearMatchLength; + node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode); + nextNode=registerNode(node, errorCode); + } + node=createLinearMatchNode(start, unitIndex, length, nextNode); + } else { + // Branch node. + int32_t length=countElementUnits(start, limit, unitIndex); + // length>=2 because minUnit!=maxUnit. + Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode); + node=new BranchHeadNode(length, subNode); + } + if(hasValue && node!=NULL) { + if(matchNodesCanHaveValues()) { + ((ValueNode *)node)->setValue(value); + } else { + node=new IntermediateValueNode(value, registerNode(node, errorCode)); + } + } + return registerNode(node, errorCode); +} + +// start<limit && all strings longer than unitIndex && +// length different units at unitIndex +StringTrieBuilder::Node * +StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, + int32_t length, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return NULL; + } + UChar middleUnits[kMaxSplitBranchLevels]; + Node *lessThan[kMaxSplitBranchLevels]; + int32_t ltLength=0; + while(length>getMaxBranchLinearSubNodeLength()) { + // Branch on the middle unit. + // First, find the middle unit. + int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); + // Create the less-than branch. + middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit + lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode); + ++ltLength; + // Continue for the greater-or-equal branch. + start=i; + length=length-length/2; + } + if(U_FAILURE(errorCode)) { + return NULL; + } + ListBranchNode *listNode=new ListBranchNode(); + if(listNode==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + // For each unit, find its elements array start and whether it has a final value. + int32_t unitNumber=0; + do { + int32_t i=start; + UChar unit=getElementUnit(i++, unitIndex); + i=indexOfElementWithNextUnit(i, unitIndex, unit); + if(start==i-1 && unitIndex+1==getElementStringLength(start)) { + listNode->add(unit, getElementValue(start)); + } else { + listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode)); + } + start=i; + } while(++unitNumber<length-1); + // unitNumber==length-1, and the maxUnit elements range is [start..limit[ + UChar unit=getElementUnit(start, unitIndex); + if(start==limit-1 && unitIndex+1==getElementStringLength(start)) { + listNode->add(unit, getElementValue(start)); + } else { + listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode)); + } + Node *node=registerNode(listNode, errorCode); + // Create the split-branch nodes. + while(ltLength>0) { + --ltLength; + node=registerNode( + new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode); + } + return node; +} + +StringTrieBuilder::Node * +StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + delete newNode; + return NULL; + } + if(newNode==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + const UHashElement *old=uhash_find(nodes, newNode); + if(old!=NULL) { + delete newNode; + return (Node *)old->key.pointer; + } + // If uhash_puti() returns a non-zero value from an equivalent, previously + // registered node, then uhash_find() failed to find that and we will leak newNode. +#if U_DEBUG + int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. +#endif + uhash_puti(nodes, newNode, 1, &errorCode); + U_ASSERT(oldValue==0); + if(U_FAILURE(errorCode)) { + delete newNode; + return NULL; + } + return newNode; +} + +StringTrieBuilder::Node * +StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return NULL; + } + FinalValueNode key(value); + const UHashElement *old=uhash_find(nodes, &key); + if(old!=NULL) { + return (Node *)old->key.pointer; + } + Node *newNode=new FinalValueNode(value); + if(newNode==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + // If uhash_puti() returns a non-zero value from an equivalent, previously + // registered node, then uhash_find() failed to find that and we will leak newNode. +#if U_DEBUG + int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. +#endif + uhash_puti(nodes, newNode, 1, &errorCode); + U_ASSERT(oldValue==0); + if(U_FAILURE(errorCode)) { + delete newNode; + return NULL; + } + return newNode; +} + +int32_t +StringTrieBuilder::hashNode(const void *node) { + return ((const Node *)node)->hashCode(); +} + +UBool +StringTrieBuilder::equalNodes(const void *left, const void *right) { + return *(const Node *)left==*(const Node *)right; +} + +UBool +StringTrieBuilder::Node::operator==(const Node &other) const { + return this==&other || (typeid(*this)==typeid(other) && hash==other.hash); +} + +int32_t +StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + offset=edgeNumber; + } + return edgeNumber; +} + +UBool +StringTrieBuilder::FinalValueNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!Node::operator==(other)) { + return FALSE; + } + const FinalValueNode &o=(const FinalValueNode &)other; + return value==o.value; +} + +void +StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) { + offset=builder.writeValueAndFinal(value, TRUE); +} + +UBool +StringTrieBuilder::ValueNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!Node::operator==(other)) { + return FALSE; + } + const ValueNode &o=(const ValueNode &)other; + return hasValue==o.hasValue && (!hasValue || value==o.value); +} + +UBool +StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!ValueNode::operator==(other)) { + return FALSE; + } + const IntermediateValueNode &o=(const IntermediateValueNode &)other; + return next==o.next; +} + +int32_t +StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); + } + return edgeNumber; +} + +void +StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) { + next->write(builder); + offset=builder.writeValueAndFinal(value, FALSE); +} + +UBool +StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!ValueNode::operator==(other)) { + return FALSE; + } + const LinearMatchNode &o=(const LinearMatchNode &)other; + return length==o.length && next==o.next; +} + +int32_t +StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); + } + return edgeNumber; +} + +UBool +StringTrieBuilder::ListBranchNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!Node::operator==(other)) { + return FALSE; + } + const ListBranchNode &o=(const ListBranchNode &)other; + for(int32_t i=0; i<length; ++i) { + if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) { + return FALSE; + } + } + return TRUE; +} + +int32_t +StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + firstEdgeNumber=edgeNumber; + int32_t step=0; + int32_t i=length; + do { + Node *edge=equal[--i]; + if(edge!=NULL) { + edgeNumber=edge->markRightEdgesFirst(edgeNumber-step); + } + // For all but the rightmost edge, decrement the edge number. + step=1; + } while(i>0); + offset=edgeNumber; + } + return edgeNumber; +} + +void +StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) { + // Write the sub-nodes in reverse order: The jump lengths are deltas from + // after their own positions, so if we wrote the minUnit sub-node first, + // then its jump delta would be larger. + // Instead we write the minUnit sub-node last, for a shorter delta. + int32_t unitNumber=length-1; + Node *rightEdge=equal[unitNumber]; + int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset(); + do { + --unitNumber; + if(equal[unitNumber]!=NULL) { + equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder); + } + } while(unitNumber>0); + // The maxUnit sub-node is written as the very last one because we do + // not jump for it at all. + unitNumber=length-1; + if(rightEdge==NULL) { + builder.writeValueAndFinal(values[unitNumber], TRUE); + } else { + rightEdge->write(builder); + } + offset=builder.write(units[unitNumber]); + // Write the rest of this node's unit-value pairs. + while(--unitNumber>=0) { + int32_t value; + UBool isFinal; + if(equal[unitNumber]==NULL) { + // Write the final value for the one string ending with this unit. + value=values[unitNumber]; + isFinal=TRUE; + } else { + // Write the delta to the start position of the sub-node. + U_ASSERT(equal[unitNumber]->getOffset()>0); + value=offset-equal[unitNumber]->getOffset(); + isFinal=FALSE; + } + builder.writeValueAndFinal(value, isFinal); + offset=builder.write(units[unitNumber]); + } +} + +UBool +StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!Node::operator==(other)) { + return FALSE; + } + const SplitBranchNode &o=(const SplitBranchNode &)other; + return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual; +} + +int32_t +StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + firstEdgeNumber=edgeNumber; + edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber); + offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1); + } + return edgeNumber; +} + +void +StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) { + // Encode the less-than branch first. + lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder); + // Encode the greater-or-equal branch last because we do not jump for it at all. + greaterOrEqual->write(builder); + // Write this node. + U_ASSERT(lessThan->getOffset()>0); + builder.writeDeltaTo(lessThan->getOffset()); // less-than + offset=builder.write(unit); +} + +UBool +StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!ValueNode::operator==(other)) { + return FALSE; + } + const BranchHeadNode &o=(const BranchHeadNode &)other; + return length==o.length && next==o.next; +} + +int32_t +StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); + } + return edgeNumber; +} + +void +StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) { + next->write(builder); + if(length<=builder.getMinLinearMatch()) { + offset=builder.writeValueAndType(hasValue, value, length-1); + } else { + builder.write(length-1); + offset=builder.writeValueAndType(hasValue, value, 0); + } +} + +U_NAMESPACE_END |