diff options
author | mcheshkov <mcheshkov@yandex-team.ru> | 2022-02-10 16:46:16 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:16 +0300 |
commit | 1312621288956f199a5bd5342b0133d4395fa725 (patch) | |
tree | 1a2c5ffcf89eb53ecd79dbc9bc0a195c27404d0c /contrib/libs/icu/common/rbbi_cache.cpp | |
parent | e9d19cec64684c9c1e6b0c98297e5b895cf904fe (diff) | |
download | ydb-1312621288956f199a5bd5342b0133d4395fa725.tar.gz |
Restoring authorship annotation for <mcheshkov@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/common/rbbi_cache.cpp')
-rw-r--r-- | contrib/libs/icu/common/rbbi_cache.cpp | 1306 |
1 files changed, 653 insertions, 653 deletions
diff --git a/contrib/libs/icu/common/rbbi_cache.cpp b/contrib/libs/icu/common/rbbi_cache.cpp index 5814d5e03b..4f9e83360a 100644 --- a/contrib/libs/icu/common/rbbi_cache.cpp +++ b/contrib/libs/icu/common/rbbi_cache.cpp @@ -1,653 +1,653 @@ -// Copyright (C) 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html - -// file: rbbi_cache.cpp - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_BREAK_ITERATION - -#include "unicode/ubrk.h" -#include "unicode/rbbi.h" - -#include "rbbi_cache.h" - -#include "brkeng.h" -#include "cmemory.h" -#include "rbbidata.h" -#include "rbbirb.h" -#include "uassert.h" -#include "uvectr32.h" - -U_NAMESPACE_BEGIN - -/* - * DictionaryCache implementation - */ - -RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) : - fBI(bi), fBreaks(status), fPositionInCache(-1), - fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) { -} - -RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() { -} - -void RuleBasedBreakIterator::DictionaryCache::reset() { - fPositionInCache = -1; - fStart = 0; - fLimit = 0; - fFirstRuleStatusIndex = 0; - fOtherRuleStatusIndex = 0; - fBreaks.removeAllElements(); -} - -UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) { - if (fromPos >= fLimit || fromPos < fStart) { - fPositionInCache = -1; - return FALSE; - } - - // Sequential iteration, move from previous boundary to the following - - int32_t r = 0; - if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { - ++fPositionInCache; - if (fPositionInCache >= fBreaks.size()) { - fPositionInCache = -1; - return FALSE; - } - r = fBreaks.elementAti(fPositionInCache); - U_ASSERT(r > fromPos); - *result = r; - *statusIndex = fOtherRuleStatusIndex; - return TRUE; - } - - // Random indexing. Linear search for the boundary following the given position. - - for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { - r= fBreaks.elementAti(fPositionInCache); - if (r > fromPos) { - *result = r; - *statusIndex = fOtherRuleStatusIndex; - return TRUE; - } - } - UPRV_UNREACHABLE; -} - - -UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) { - if (fromPos <= fStart || fromPos > fLimit) { - fPositionInCache = -1; - return FALSE; - } - - if (fromPos == fLimit) { - fPositionInCache = fBreaks.size() - 1; - if (fPositionInCache >= 0) { - U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos); - } - } - - int32_t r; - if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { - --fPositionInCache; - r = fBreaks.elementAti(fPositionInCache); - U_ASSERT(r < fromPos); - *result = r; - *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; - return TRUE; - } - - if (fPositionInCache == 0) { - fPositionInCache = -1; - return FALSE; - } - - for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { - r = fBreaks.elementAti(fPositionInCache); - if (r < fromPos) { - *result = r; - *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; - return TRUE; - } - } - UPRV_UNREACHABLE; -} - -void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, - int32_t firstRuleStatus, int32_t otherRuleStatus) { - if ((endPos - startPos) <= 1) { - return; - } - - reset(); - fFirstRuleStatusIndex = firstRuleStatus; - fOtherRuleStatusIndex = otherRuleStatus; - - int32_t rangeStart = startPos; - int32_t rangeEnd = endPos; - - uint16_t category; - int32_t current; - UErrorCode status = U_ZERO_ERROR; - int32_t foundBreakCount = 0; - UText *text = &fBI->fText; - - // Loop through the text, looking for ranges of dictionary characters. - // For each span, find the appropriate break engine, and ask it to find - // any breaks within the span. - - utext_setNativeIndex(text, rangeStart); - UChar32 c = utext_current32(text); - category = UTRIE2_GET16(fBI->fData->fTrie, c); - - while(U_SUCCESS(status)) { - while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) { - utext_next32(text); // TODO: cleaner loop structure. - c = utext_current32(text); - category = UTRIE2_GET16(fBI->fData->fTrie, c); - } - if (current >= rangeEnd) { - break; - } - - // We now have a dictionary character. Get the appropriate language object - // to deal with it. - const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c); - - // Ask the language object if there are any breaks. It will add them to the cache and - // leave the text pointer on the other side of its range, ready to search for the next one. - if (lbe != NULL) { - foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks); - } - - // Reload the loop variables for the next go-round - c = utext_current32(text); - category = UTRIE2_GET16(fBI->fData->fTrie, c); - } - - // If we found breaks, ensure that the first and last entries are - // the original starting and ending position. And initialize the - // cache iteration position to the first entry. - - // printf("foundBreakCount = %d\n", foundBreakCount); - if (foundBreakCount > 0) { - U_ASSERT(foundBreakCount == fBreaks.size()); - if (startPos < fBreaks.elementAti(0)) { - // The dictionary did not place a boundary at the start of the segment of text. - // Add one now. This should not commonly happen, but it would be easy for interactions - // of the rules for dictionary segments and the break engine implementations to - // inadvertently cause it. Cover it here, just in case. - fBreaks.insertElementAt(startPos, 0, status); - } - if (endPos > fBreaks.peeki()) { - fBreaks.push(endPos, status); - } - fPositionInCache = 0; - // Note: Dictionary matching may extend beyond the original limit. - fStart = fBreaks.elementAti(0); - fLimit = fBreaks.peeki(); - } else { - // there were no language-based breaks, even though the segment contained - // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache - // for this range will fail, and the calling code will fall back to the rule based boundaries. - } -} - - -/* - * BreakCache implemetation - */ - -RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : - fBI(bi), fSideBuffer(status) { - reset(); -} - - -RuleBasedBreakIterator::BreakCache::~BreakCache() { -} - - -void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) { - fStartBufIdx = 0; - fEndBufIdx = 0; - fTextIdx = pos; - fBufIdx = 0; - fBoundaries[0] = pos; - fStatuses[0] = (uint16_t)ruleStatus; -} - - -int32_t RuleBasedBreakIterator::BreakCache::current() { - fBI->fPosition = fTextIdx; - fBI->fRuleStatusIndex = fStatuses[fBufIdx]; - fBI->fDone = FALSE; - return fTextIdx; -} - - -void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) { - if (U_FAILURE(status)) { - return; - } - if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { - // startPos is in the cache. Do a next() from that position. - // TODO: an awkward set of interactions with bi->fDone - // seek() does not clear it; it can't because of interactions with populateNear(). - // next() does not clear it in the fast-path case, where everything matters. Maybe it should. - // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. - fBI->fDone = false; - next(); - } - return; -} - - -void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) { - if (U_FAILURE(status)) { - return; - } - if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { - if (startPos == fTextIdx) { - previous(status); - } else { - // seek() leaves the BreakCache positioned at the preceding boundary - // if the requested position is between two bounaries. - // current() pushes the BreakCache position out to the BreakIterator itself. - U_ASSERT(startPos > fTextIdx); - current(); - } - } - return; -} - - -/* - * Out-of-line code for BreakCache::next(). - * Cache does not already contain the boundary - */ -void RuleBasedBreakIterator::BreakCache::nextOL() { - fBI->fDone = !populateFollowing(); - fBI->fPosition = fTextIdx; - fBI->fRuleStatusIndex = fStatuses[fBufIdx]; - return; -} - - -void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) { - if (U_FAILURE(status)) { - return; - } - int32_t initialBufIdx = fBufIdx; - if (fBufIdx == fStartBufIdx) { - // At start of cache. Prepend to it. - populatePreceding(status); - } else { - // Cache already holds the next boundary - fBufIdx = modChunkSize(fBufIdx - 1); - fTextIdx = fBoundaries[fBufIdx]; - } - fBI->fDone = (fBufIdx == initialBufIdx); - fBI->fPosition = fTextIdx; - fBI->fRuleStatusIndex = fStatuses[fBufIdx]; - return; -} - - -UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { - if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { - return FALSE; - } - if (pos == fBoundaries[fStartBufIdx]) { - // Common case: seek(0), from BreakIterator::first() - fBufIdx = fStartBufIdx; - fTextIdx = fBoundaries[fBufIdx]; - return TRUE; - } - if (pos == fBoundaries[fEndBufIdx]) { - fBufIdx = fEndBufIdx; - fTextIdx = fBoundaries[fBufIdx]; - return TRUE; - } - - int32_t min = fStartBufIdx; - int32_t max = fEndBufIdx; - while (min != max) { - int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; - probe = modChunkSize(probe); - if (fBoundaries[probe] > pos) { - max = probe; - } else { - min = modChunkSize(probe + 1); - } - } - U_ASSERT(fBoundaries[max] > pos); - fBufIdx = modChunkSize(max - 1); - fTextIdx = fBoundaries[fBufIdx]; - U_ASSERT(fTextIdx <= pos); - return TRUE; -} - - -UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) { - if (U_FAILURE(status)) { - return FALSE; - } - U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); - - // Find a boundary somewhere in the vicinity of the requested position. - // Depending on the safe rules and the text data, it could be either before, at, or after - // the requested position. - - - // If the requested position is not near already cached positions, clear the existing cache, - // find a near-by boundary and begin new cache contents there. - - if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { - int32_t aBoundary = 0; - int32_t ruleStatusIndex = 0; - if (position > 20) { - int32_t backupPos = fBI->handleSafePrevious(position); - - if (backupPos > 0) { - // Advance to the boundary following the backup position. - // There is a complication: the safe reverse rules identify pairs of code points - // that are safe. If advancing from the safe point moves forwards by less than - // two code points, we need to advance one more time to ensure that the boundary - // is good, including a correct rules status value. - // - fBI->fPosition = backupPos; - aBoundary = fBI->handleNext(); - if (aBoundary <= backupPos + 4) { - // +4 is a quick test for possibly having advanced only one codepoint. - // Four being the length of the longest potential code point, a supplementary in UTF-8 - utext_setNativeIndex(&fBI->fText, aBoundary); - if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) { - // The initial handleNext() only advanced by a single code point. Go again. - aBoundary = fBI->handleNext(); // Safe rules identify safe pairs. - } - } - ruleStatusIndex = fBI->fRuleStatusIndex; - } - } - reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. - } - - // Fill in boundaries between existing cache content and the new requested position. - - if (fBoundaries[fEndBufIdx] < position) { - // The last position in the cache precedes the requested position. - // Add following position(s) to the cache. - while (fBoundaries[fEndBufIdx] < position) { - if (!populateFollowing()) { - UPRV_UNREACHABLE; - } - } - fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. - fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. - while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. - previous(status); - } - return true; - } - - if (fBoundaries[fStartBufIdx] > position) { - // The first position in the cache is beyond the requested position. - // back up more until we get a boundary <= the requested position. - while (fBoundaries[fStartBufIdx] > position) { - populatePreceding(status); - } - fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. - fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. - while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. - next(); - } - if (fTextIdx > position) { - // If position is not itself a boundary, the next() loop above will overshoot. - // Back up one, leaving cache position at the boundary preceding the requested position. - previous(status); - } - return true; - } - - U_ASSERT(fTextIdx == position); - return true; -} - - - -UBool RuleBasedBreakIterator::BreakCache::populateFollowing() { - int32_t fromPosition = fBoundaries[fEndBufIdx]; - int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx]; - int32_t pos = 0; - int32_t ruleStatusIdx = 0; - - if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { - addFollowing(pos, ruleStatusIdx, UpdateCachePosition); - return TRUE; - } - - fBI->fPosition = fromPosition; - pos = fBI->handleNext(); - if (pos == UBRK_DONE) { - return FALSE; - } - - ruleStatusIdx = fBI->fRuleStatusIndex; - if (fBI->fDictionaryCharCount > 0) { - // The text segment obtained from the rules includes dictionary characters. - // Subdivide it, with subdivided results going into the dictionary cache. - fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); - if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { - addFollowing(pos, ruleStatusIdx, UpdateCachePosition); - return TRUE; - // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point. - // But be careful with interactions with populateNear(). - } - } - - // Rule based segment did not include dictionary characters. - // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, - // meaning that we didn't take the return, above. - // Add its end point to the cache. - addFollowing(pos, ruleStatusIdx, UpdateCachePosition); - - // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. - // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. - // - for (int count=0; count<6; ++count) { - pos = fBI->handleNext(); - if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) { - break; - } - addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition); - } - - return TRUE; -} - - -UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) { - if (U_FAILURE(status)) { - return FALSE; - } - - int32_t fromPosition = fBoundaries[fStartBufIdx]; - if (fromPosition == 0) { - return FALSE; - } - - int32_t position = 0; - int32_t positionStatusIdx = 0; - - if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) { - addPreceding(position, positionStatusIdx, UpdateCachePosition); - return TRUE; - } - - int32_t backupPosition = fromPosition; - - // Find a boundary somewhere preceding the first already-cached boundary - do { - backupPosition = backupPosition - 30; - if (backupPosition <= 0) { - backupPosition = 0; - } else { - backupPosition = fBI->handleSafePrevious(backupPosition); - } - if (backupPosition == UBRK_DONE || backupPosition == 0) { - position = 0; - positionStatusIdx = 0; - } else { - // Advance to the boundary following the backup position. - // There is a complication: the safe reverse rules identify pairs of code points - // that are safe. If advancing from the safe point moves forwards by less than - // two code points, we need to advance one more time to ensure that the boundary - // is good, including a correct rules status value. - // - fBI->fPosition = backupPosition; - position = fBI->handleNext(); - if (position <= backupPosition + 4) { - // +4 is a quick test for possibly having advanced only one codepoint. - // Four being the length of the longest potential code point, a supplementary in UTF-8 - utext_setNativeIndex(&fBI->fText, position); - if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) { - // The initial handleNext() only advanced by a single code point. Go again. - position = fBI->handleNext(); // Safe rules identify safe pairs. - } - } - positionStatusIdx = fBI->fRuleStatusIndex; - } - } while (position >= fromPosition); - - // Find boundaries between the one we just located and the first already-cached boundary - // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. - - fSideBuffer.removeAllElements(); - fSideBuffer.addElement(position, status); - fSideBuffer.addElement(positionStatusIdx, status); - - do { - int32_t prevPosition = fBI->fPosition = position; - int32_t prevStatusIdx = positionStatusIdx; - position = fBI->handleNext(); - positionStatusIdx = fBI->fRuleStatusIndex; - if (position == UBRK_DONE) { - break; - } - - UBool segmentHandledByDictionary = FALSE; - if (fBI->fDictionaryCharCount != 0) { - // Segment from the rules includes dictionary characters. - // Subdivide it, with subdivided results going into the dictionary cache. - int32_t dictSegEndPosition = position; - fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); - while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) { - segmentHandledByDictionary = true; - U_ASSERT(position > prevPosition); - if (position >= fromPosition) { - break; - } - U_ASSERT(position <= dictSegEndPosition); - fSideBuffer.addElement(position, status); - fSideBuffer.addElement(positionStatusIdx, status); - prevPosition = position; - } - U_ASSERT(position==dictSegEndPosition || position>=fromPosition); - } - - if (!segmentHandledByDictionary && position < fromPosition) { - fSideBuffer.addElement(position, status); - fSideBuffer.addElement(positionStatusIdx, status); - } - } while (position < fromPosition); - - // Move boundaries from the side buffer to the main circular buffer. - UBool success = FALSE; - if (!fSideBuffer.isEmpty()) { - positionStatusIdx = fSideBuffer.popi(); - position = fSideBuffer.popi(); - addPreceding(position, positionStatusIdx, UpdateCachePosition); - success = TRUE; - } - - while (!fSideBuffer.isEmpty()) { - positionStatusIdx = fSideBuffer.popi(); - position = fSideBuffer.popi(); - if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { - // No space in circular buffer to hold a new preceding result while - // also retaining the current cache (iteration) position. - // Bailing out is safe; the cache will refill again if needed. - break; - } - } - - return success; -} - - -void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { - U_ASSERT(position > fBoundaries[fEndBufIdx]); - U_ASSERT(ruleStatusIdx <= UINT16_MAX); - int32_t nextIdx = modChunkSize(fEndBufIdx + 1); - if (nextIdx == fStartBufIdx) { - fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. - } - fBoundaries[nextIdx] = position; - fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); - fEndBufIdx = nextIdx; - if (update == UpdateCachePosition) { - // Set current position to the newly added boundary. - fBufIdx = nextIdx; - fTextIdx = position; - } else { - // Retaining the original cache position. - // Check if the added boundary wraps around the buffer, and would over-write the original position. - // It's the responsibility of callers of this function to not add too many. - U_ASSERT(nextIdx != fBufIdx); - } -} - -bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { - U_ASSERT(position < fBoundaries[fStartBufIdx]); - U_ASSERT(ruleStatusIdx <= UINT16_MAX); - int32_t nextIdx = modChunkSize(fStartBufIdx - 1); - if (nextIdx == fEndBufIdx) { - if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { - // Failure. The insertion of the new boundary would claim the buffer position that is the - // current iteration position. And we also want to retain the current iteration position. - // (The buffer is already completely full of entries that precede the iteration position.) - return false; - } - fEndBufIdx = modChunkSize(fEndBufIdx - 1); - } - fBoundaries[nextIdx] = position; - fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); - fStartBufIdx = nextIdx; - if (update == UpdateCachePosition) { - fBufIdx = nextIdx; - fTextIdx = position; - } - return true; -} - - -void RuleBasedBreakIterator::BreakCache::dumpCache() { -#ifdef RBBI_DEBUG - RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx); - for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) { - RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]); - if (i == fEndBufIdx) { - break; - } - } -#endif -} - -U_NAMESPACE_END - -#endif // #if !UCONFIG_NO_BREAK_ITERATION +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// file: rbbi_cache.cpp + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_BREAK_ITERATION + +#include "unicode/ubrk.h" +#include "unicode/rbbi.h" + +#include "rbbi_cache.h" + +#include "brkeng.h" +#include "cmemory.h" +#include "rbbidata.h" +#include "rbbirb.h" +#include "uassert.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +/* + * DictionaryCache implementation + */ + +RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) : + fBI(bi), fBreaks(status), fPositionInCache(-1), + fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) { +} + +RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() { +} + +void RuleBasedBreakIterator::DictionaryCache::reset() { + fPositionInCache = -1; + fStart = 0; + fLimit = 0; + fFirstRuleStatusIndex = 0; + fOtherRuleStatusIndex = 0; + fBreaks.removeAllElements(); +} + +UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) { + if (fromPos >= fLimit || fromPos < fStart) { + fPositionInCache = -1; + return FALSE; + } + + // Sequential iteration, move from previous boundary to the following + + int32_t r = 0; + if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { + ++fPositionInCache; + if (fPositionInCache >= fBreaks.size()) { + fPositionInCache = -1; + return FALSE; + } + r = fBreaks.elementAti(fPositionInCache); + U_ASSERT(r > fromPos); + *result = r; + *statusIndex = fOtherRuleStatusIndex; + return TRUE; + } + + // Random indexing. Linear search for the boundary following the given position. + + for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { + r= fBreaks.elementAti(fPositionInCache); + if (r > fromPos) { + *result = r; + *statusIndex = fOtherRuleStatusIndex; + return TRUE; + } + } + UPRV_UNREACHABLE; +} + + +UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) { + if (fromPos <= fStart || fromPos > fLimit) { + fPositionInCache = -1; + return FALSE; + } + + if (fromPos == fLimit) { + fPositionInCache = fBreaks.size() - 1; + if (fPositionInCache >= 0) { + U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos); + } + } + + int32_t r; + if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { + --fPositionInCache; + r = fBreaks.elementAti(fPositionInCache); + U_ASSERT(r < fromPos); + *result = r; + *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + return TRUE; + } + + if (fPositionInCache == 0) { + fPositionInCache = -1; + return FALSE; + } + + for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { + r = fBreaks.elementAti(fPositionInCache); + if (r < fromPos) { + *result = r; + *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + return TRUE; + } + } + UPRV_UNREACHABLE; +} + +void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, + int32_t firstRuleStatus, int32_t otherRuleStatus) { + if ((endPos - startPos) <= 1) { + return; + } + + reset(); + fFirstRuleStatusIndex = firstRuleStatus; + fOtherRuleStatusIndex = otherRuleStatus; + + int32_t rangeStart = startPos; + int32_t rangeEnd = endPos; + + uint16_t category; + int32_t current; + UErrorCode status = U_ZERO_ERROR; + int32_t foundBreakCount = 0; + UText *text = &fBI->fText; + + // Loop through the text, looking for ranges of dictionary characters. + // For each span, find the appropriate break engine, and ask it to find + // any breaks within the span. + + utext_setNativeIndex(text, rangeStart); + UChar32 c = utext_current32(text); + category = UTRIE2_GET16(fBI->fData->fTrie, c); + + while(U_SUCCESS(status)) { + while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) { + utext_next32(text); // TODO: cleaner loop structure. + c = utext_current32(text); + category = UTRIE2_GET16(fBI->fData->fTrie, c); + } + if (current >= rangeEnd) { + break; + } + + // We now have a dictionary character. Get the appropriate language object + // to deal with it. + const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c); + + // Ask the language object if there are any breaks. It will add them to the cache and + // leave the text pointer on the other side of its range, ready to search for the next one. + if (lbe != NULL) { + foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks); + } + + // Reload the loop variables for the next go-round + c = utext_current32(text); + category = UTRIE2_GET16(fBI->fData->fTrie, c); + } + + // If we found breaks, ensure that the first and last entries are + // the original starting and ending position. And initialize the + // cache iteration position to the first entry. + + // printf("foundBreakCount = %d\n", foundBreakCount); + if (foundBreakCount > 0) { + U_ASSERT(foundBreakCount == fBreaks.size()); + if (startPos < fBreaks.elementAti(0)) { + // The dictionary did not place a boundary at the start of the segment of text. + // Add one now. This should not commonly happen, but it would be easy for interactions + // of the rules for dictionary segments and the break engine implementations to + // inadvertently cause it. Cover it here, just in case. + fBreaks.insertElementAt(startPos, 0, status); + } + if (endPos > fBreaks.peeki()) { + fBreaks.push(endPos, status); + } + fPositionInCache = 0; + // Note: Dictionary matching may extend beyond the original limit. + fStart = fBreaks.elementAti(0); + fLimit = fBreaks.peeki(); + } else { + // there were no language-based breaks, even though the segment contained + // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache + // for this range will fail, and the calling code will fall back to the rule based boundaries. + } +} + + +/* + * BreakCache implemetation + */ + +RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : + fBI(bi), fSideBuffer(status) { + reset(); +} + + +RuleBasedBreakIterator::BreakCache::~BreakCache() { +} + + +void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) { + fStartBufIdx = 0; + fEndBufIdx = 0; + fTextIdx = pos; + fBufIdx = 0; + fBoundaries[0] = pos; + fStatuses[0] = (uint16_t)ruleStatus; +} + + +int32_t RuleBasedBreakIterator::BreakCache::current() { + fBI->fPosition = fTextIdx; + fBI->fRuleStatusIndex = fStatuses[fBufIdx]; + fBI->fDone = FALSE; + return fTextIdx; +} + + +void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { + // startPos is in the cache. Do a next() from that position. + // TODO: an awkward set of interactions with bi->fDone + // seek() does not clear it; it can't because of interactions with populateNear(). + // next() does not clear it in the fast-path case, where everything matters. Maybe it should. + // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. + fBI->fDone = false; + next(); + } + return; +} + + +void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { + if (startPos == fTextIdx) { + previous(status); + } else { + // seek() leaves the BreakCache positioned at the preceding boundary + // if the requested position is between two bounaries. + // current() pushes the BreakCache position out to the BreakIterator itself. + U_ASSERT(startPos > fTextIdx); + current(); + } + } + return; +} + + +/* + * Out-of-line code for BreakCache::next(). + * Cache does not already contain the boundary + */ +void RuleBasedBreakIterator::BreakCache::nextOL() { + fBI->fDone = !populateFollowing(); + fBI->fPosition = fTextIdx; + fBI->fRuleStatusIndex = fStatuses[fBufIdx]; + return; +} + + +void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + int32_t initialBufIdx = fBufIdx; + if (fBufIdx == fStartBufIdx) { + // At start of cache. Prepend to it. + populatePreceding(status); + } else { + // Cache already holds the next boundary + fBufIdx = modChunkSize(fBufIdx - 1); + fTextIdx = fBoundaries[fBufIdx]; + } + fBI->fDone = (fBufIdx == initialBufIdx); + fBI->fPosition = fTextIdx; + fBI->fRuleStatusIndex = fStatuses[fBufIdx]; + return; +} + + +UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { + if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { + return FALSE; + } + if (pos == fBoundaries[fStartBufIdx]) { + // Common case: seek(0), from BreakIterator::first() + fBufIdx = fStartBufIdx; + fTextIdx = fBoundaries[fBufIdx]; + return TRUE; + } + if (pos == fBoundaries[fEndBufIdx]) { + fBufIdx = fEndBufIdx; + fTextIdx = fBoundaries[fBufIdx]; + return TRUE; + } + + int32_t min = fStartBufIdx; + int32_t max = fEndBufIdx; + while (min != max) { + int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; + probe = modChunkSize(probe); + if (fBoundaries[probe] > pos) { + max = probe; + } else { + min = modChunkSize(probe + 1); + } + } + U_ASSERT(fBoundaries[max] > pos); + fBufIdx = modChunkSize(max - 1); + fTextIdx = fBoundaries[fBufIdx]; + U_ASSERT(fTextIdx <= pos); + return TRUE; +} + + +UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); + + // Find a boundary somewhere in the vicinity of the requested position. + // Depending on the safe rules and the text data, it could be either before, at, or after + // the requested position. + + + // If the requested position is not near already cached positions, clear the existing cache, + // find a near-by boundary and begin new cache contents there. + + if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { + int32_t aBoundary = 0; + int32_t ruleStatusIndex = 0; + if (position > 20) { + int32_t backupPos = fBI->handleSafePrevious(position); + + if (backupPos > 0) { + // Advance to the boundary following the backup position. + // There is a complication: the safe reverse rules identify pairs of code points + // that are safe. If advancing from the safe point moves forwards by less than + // two code points, we need to advance one more time to ensure that the boundary + // is good, including a correct rules status value. + // + fBI->fPosition = backupPos; + aBoundary = fBI->handleNext(); + if (aBoundary <= backupPos + 4) { + // +4 is a quick test for possibly having advanced only one codepoint. + // Four being the length of the longest potential code point, a supplementary in UTF-8 + utext_setNativeIndex(&fBI->fText, aBoundary); + if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) { + // The initial handleNext() only advanced by a single code point. Go again. + aBoundary = fBI->handleNext(); // Safe rules identify safe pairs. + } + } + ruleStatusIndex = fBI->fRuleStatusIndex; + } + } + reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. + } + + // Fill in boundaries between existing cache content and the new requested position. + + if (fBoundaries[fEndBufIdx] < position) { + // The last position in the cache precedes the requested position. + // Add following position(s) to the cache. + while (fBoundaries[fEndBufIdx] < position) { + if (!populateFollowing()) { + UPRV_UNREACHABLE; + } + } + fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. + fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. + while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. + previous(status); + } + return true; + } + + if (fBoundaries[fStartBufIdx] > position) { + // The first position in the cache is beyond the requested position. + // back up more until we get a boundary <= the requested position. + while (fBoundaries[fStartBufIdx] > position) { + populatePreceding(status); + } + fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. + fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. + while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. + next(); + } + if (fTextIdx > position) { + // If position is not itself a boundary, the next() loop above will overshoot. + // Back up one, leaving cache position at the boundary preceding the requested position. + previous(status); + } + return true; + } + + U_ASSERT(fTextIdx == position); + return true; +} + + + +UBool RuleBasedBreakIterator::BreakCache::populateFollowing() { + int32_t fromPosition = fBoundaries[fEndBufIdx]; + int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx]; + int32_t pos = 0; + int32_t ruleStatusIdx = 0; + + if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { + addFollowing(pos, ruleStatusIdx, UpdateCachePosition); + return TRUE; + } + + fBI->fPosition = fromPosition; + pos = fBI->handleNext(); + if (pos == UBRK_DONE) { + return FALSE; + } + + ruleStatusIdx = fBI->fRuleStatusIndex; + if (fBI->fDictionaryCharCount > 0) { + // The text segment obtained from the rules includes dictionary characters. + // Subdivide it, with subdivided results going into the dictionary cache. + fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); + if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { + addFollowing(pos, ruleStatusIdx, UpdateCachePosition); + return TRUE; + // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point. + // But be careful with interactions with populateNear(). + } + } + + // Rule based segment did not include dictionary characters. + // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, + // meaning that we didn't take the return, above. + // Add its end point to the cache. + addFollowing(pos, ruleStatusIdx, UpdateCachePosition); + + // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. + // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. + // + for (int count=0; count<6; ++count) { + pos = fBI->handleNext(); + if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) { + break; + } + addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition); + } + + return TRUE; +} + + +UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + + int32_t fromPosition = fBoundaries[fStartBufIdx]; + if (fromPosition == 0) { + return FALSE; + } + + int32_t position = 0; + int32_t positionStatusIdx = 0; + + if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) { + addPreceding(position, positionStatusIdx, UpdateCachePosition); + return TRUE; + } + + int32_t backupPosition = fromPosition; + + // Find a boundary somewhere preceding the first already-cached boundary + do { + backupPosition = backupPosition - 30; + if (backupPosition <= 0) { + backupPosition = 0; + } else { + backupPosition = fBI->handleSafePrevious(backupPosition); + } + if (backupPosition == UBRK_DONE || backupPosition == 0) { + position = 0; + positionStatusIdx = 0; + } else { + // Advance to the boundary following the backup position. + // There is a complication: the safe reverse rules identify pairs of code points + // that are safe. If advancing from the safe point moves forwards by less than + // two code points, we need to advance one more time to ensure that the boundary + // is good, including a correct rules status value. + // + fBI->fPosition = backupPosition; + position = fBI->handleNext(); + if (position <= backupPosition + 4) { + // +4 is a quick test for possibly having advanced only one codepoint. + // Four being the length of the longest potential code point, a supplementary in UTF-8 + utext_setNativeIndex(&fBI->fText, position); + if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) { + // The initial handleNext() only advanced by a single code point. Go again. + position = fBI->handleNext(); // Safe rules identify safe pairs. + } + } + positionStatusIdx = fBI->fRuleStatusIndex; + } + } while (position >= fromPosition); + + // Find boundaries between the one we just located and the first already-cached boundary + // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. + + fSideBuffer.removeAllElements(); + fSideBuffer.addElement(position, status); + fSideBuffer.addElement(positionStatusIdx, status); + + do { + int32_t prevPosition = fBI->fPosition = position; + int32_t prevStatusIdx = positionStatusIdx; + position = fBI->handleNext(); + positionStatusIdx = fBI->fRuleStatusIndex; + if (position == UBRK_DONE) { + break; + } + + UBool segmentHandledByDictionary = FALSE; + if (fBI->fDictionaryCharCount != 0) { + // Segment from the rules includes dictionary characters. + // Subdivide it, with subdivided results going into the dictionary cache. + int32_t dictSegEndPosition = position; + fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); + while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) { + segmentHandledByDictionary = true; + U_ASSERT(position > prevPosition); + if (position >= fromPosition) { + break; + } + U_ASSERT(position <= dictSegEndPosition); + fSideBuffer.addElement(position, status); + fSideBuffer.addElement(positionStatusIdx, status); + prevPosition = position; + } + U_ASSERT(position==dictSegEndPosition || position>=fromPosition); + } + + if (!segmentHandledByDictionary && position < fromPosition) { + fSideBuffer.addElement(position, status); + fSideBuffer.addElement(positionStatusIdx, status); + } + } while (position < fromPosition); + + // Move boundaries from the side buffer to the main circular buffer. + UBool success = FALSE; + if (!fSideBuffer.isEmpty()) { + positionStatusIdx = fSideBuffer.popi(); + position = fSideBuffer.popi(); + addPreceding(position, positionStatusIdx, UpdateCachePosition); + success = TRUE; + } + + while (!fSideBuffer.isEmpty()) { + positionStatusIdx = fSideBuffer.popi(); + position = fSideBuffer.popi(); + if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { + // No space in circular buffer to hold a new preceding result while + // also retaining the current cache (iteration) position. + // Bailing out is safe; the cache will refill again if needed. + break; + } + } + + return success; +} + + +void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { + U_ASSERT(position > fBoundaries[fEndBufIdx]); + U_ASSERT(ruleStatusIdx <= UINT16_MAX); + int32_t nextIdx = modChunkSize(fEndBufIdx + 1); + if (nextIdx == fStartBufIdx) { + fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. + } + fBoundaries[nextIdx] = position; + fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); + fEndBufIdx = nextIdx; + if (update == UpdateCachePosition) { + // Set current position to the newly added boundary. + fBufIdx = nextIdx; + fTextIdx = position; + } else { + // Retaining the original cache position. + // Check if the added boundary wraps around the buffer, and would over-write the original position. + // It's the responsibility of callers of this function to not add too many. + U_ASSERT(nextIdx != fBufIdx); + } +} + +bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { + U_ASSERT(position < fBoundaries[fStartBufIdx]); + U_ASSERT(ruleStatusIdx <= UINT16_MAX); + int32_t nextIdx = modChunkSize(fStartBufIdx - 1); + if (nextIdx == fEndBufIdx) { + if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { + // Failure. The insertion of the new boundary would claim the buffer position that is the + // current iteration position. And we also want to retain the current iteration position. + // (The buffer is already completely full of entries that precede the iteration position.) + return false; + } + fEndBufIdx = modChunkSize(fEndBufIdx - 1); + } + fBoundaries[nextIdx] = position; + fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); + fStartBufIdx = nextIdx; + if (update == UpdateCachePosition) { + fBufIdx = nextIdx; + fTextIdx = position; + } + return true; +} + + +void RuleBasedBreakIterator::BreakCache::dumpCache() { +#ifdef RBBI_DEBUG + RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx); + for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) { + RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]); + if (i == fEndBufIdx) { + break; + } + } +#endif +} + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_BREAK_ITERATION |