// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// file: rbbi_cache.cpp

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/ubrk.h"
#include "unicode/rbbi.h"

#include "rbbi_cache.h"

#include "brkeng.h"
#include "cmemory.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "uassert.h"
#include "uvectr32.h"

U_NAMESPACE_BEGIN
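
// Usage sketch (illustrative only, with hypothetical sample text): the DictionaryCache and
// BreakCache below are internal to RuleBasedBreakIterator; applications reach them only
// through the public BreakIterator API, roughly like this:
//
//     UErrorCode status = U_ZERO_ERROR;
//     icu::LocalPointer<icu::BreakIterator> bi(
//             icu::BreakIterator::createWordInstance(icu::Locale("th"), status));
//     if (U_SUCCESS(status)) {
//         icu::UnicodeString text(u"\u0E2A\u0E27\u0E31\u0E2A\u0E14\u0E35");  // Thai text engages the dictionary engines
//         bi->setText(text);
//         for (int32_t b = bi->first(); b != icu::BreakIterator::DONE; b = bi->next()) {
//             // Each next() is answered from BreakCache when the boundary is already cached.
//         }
//     }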

/*
 * DictionaryCache implementation
 */

RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
        fBI(bi), fBreaks(status), fPositionInCache(-1),
        fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
}

RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
}

void RuleBasedBreakIterator::DictionaryCache::reset() {
    fPositionInCache = -1;
    fStart = 0;
    fLimit = 0;
    fFirstRuleStatusIndex = 0;
    fOtherRuleStatusIndex = 0;
    fBreaks.removeAllElements();
}
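
// Return the cached boundary following fromPos, if the dictionary cache covers that position.
// On success, *result receives the boundary and *statusIndex its rule status index; the
// internal iteration position (fPositionInCache) is left on the returned boundary.
// Returns false when fromPos is outside the cached region [fStart, fLimit).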
UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    if (fromPos >= fLimit || fromPos < fStart) {
        fPositionInCache = -1;
        return false;
    }

    // Sequential iteration, move from previous boundary to the following

    int32_t r = 0;
    if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
        ++fPositionInCache;
        if (fPositionInCache >= fBreaks.size()) {
            fPositionInCache = -1;
            return false;
        }
        r = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(r > fromPos);
        *result = r;
        *statusIndex = fOtherRuleStatusIndex;
        return true;
    }

    // Random indexing. Linear search for the boundary following the given position.

    for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
        r = fBreaks.elementAti(fPositionInCache);
        if (r > fromPos) {
            *result = r;
            *statusIndex = fOtherRuleStatusIndex;
            return true;
        }
    }
    UPRV_UNREACHABLE_EXIT;
}
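
// Return the cached boundary preceding fromPos, if the dictionary cache covers that position.
// On success, *result receives the boundary and *statusIndex its rule status index.
// Returns false when fromPos is outside the cached region (fStart, fLimit].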
UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    if (fromPos <= fStart || fromPos > fLimit) {
        fPositionInCache = -1;
        return false;
    }

    if (fromPos == fLimit) {
        fPositionInCache = fBreaks.size() - 1;
        if (fPositionInCache >= 0) {
            U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
        }
    }

    int32_t r;
    if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
        --fPositionInCache;
        r = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(r < fromPos);
        *result = r;
        *statusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
        return true;
    }

    if (fPositionInCache == 0) {
        fPositionInCache = -1;
        return false;
    }

    for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
        r = fBreaks.elementAti(fPositionInCache);
        if (r < fromPos) {
            *result = r;
            *statusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
            return true;
        }
    }
    UPRV_UNREACHABLE_EXIT;
}
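
// Run the dictionary-based break engines over the text between startPos and endPos, a range
// that the rule-based engine has identified as containing dictionary characters, and store
// any boundaries found in this cache. firstRuleStatus and otherRuleStatus are the rule status
// index values to report for a boundary at the start of the range and for all later ones.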
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
                                                                 int32_t firstRuleStatus, int32_t otherRuleStatus) {
    if ((endPos - startPos) <= 1) {
        return;
    }

    reset();
    fFirstRuleStatusIndex = firstRuleStatus;
    fOtherRuleStatusIndex = otherRuleStatus;

    int32_t rangeStart = startPos;
    int32_t rangeEnd = endPos;

    uint16_t category;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    int32_t foundBreakCount = 0;
    UText *text = &fBI->fText;

    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.

    utext_setNativeIndex(text, rangeStart);
    UChar32 c = utext_current32(text);
    category = ucptrie_get(fBI->fData->fTrie, c);
    uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;

    while (U_SUCCESS(status)) {
        while ((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
                && (category < dictStart)) {
            utext_next32(text);           // TODO: cleaner loop structure.
            c = utext_current32(text);
            category = ucptrie_get(fBI->fData->fTrie, c);
        }
        if (current >= rangeEnd) {
            break;
        }

        // We now have a dictionary character. Get the appropriate language object
        // to deal with it.
        const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);

        // Ask the language object if there are any breaks. It will add them to the cache and
        // leave the text pointer on the other side of its range, ready to search for the next one.
        if (lbe != NULL) {
            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
        }

        // Reload the loop variables for the next go-round
        c = utext_current32(text);
        category = ucptrie_get(fBI->fData->fTrie, c);
    }

    // If we found breaks, ensure that the first and last entries are
    // the original starting and ending position. And initialize the
    // cache iteration position to the first entry.

    // printf("foundBreakCount = %d\n", foundBreakCount);
    if (foundBreakCount > 0) {
        U_ASSERT(foundBreakCount == fBreaks.size());
        if (startPos < fBreaks.elementAti(0)) {
            // The dictionary did not place a boundary at the start of the segment of text.
            // Add one now. This should not commonly happen, but it would be easy for interactions
            // of the rules for dictionary segments and the break engine implementations to
            // inadvertently cause it. Cover it here, just in case.
            fBreaks.insertElementAt(startPos, 0, status);
        }
        if (endPos > fBreaks.peeki()) {
            fBreaks.push(endPos, status);
        }
        fPositionInCache = 0;
        // Note: Dictionary matching may extend beyond the original limit.
        fStart = fBreaks.elementAti(0);
        fLimit = fBreaks.peeki();
    } else {
        // There were no language-based breaks, even though the segment contained
        // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
        // for this range will fail, and the calling code will fall back to the rule based boundaries.
    }
}

/*
 * BreakCache implementation
 */

RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
        fBI(bi), fSideBuffer(status) {
    reset();
}

RuleBasedBreakIterator::BreakCache::~BreakCache() {
}

void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
    fStartBufIdx = 0;
    fEndBufIdx = 0;
    fTextIdx = pos;
    fBufIdx = 0;
    fBoundaries[0] = pos;
    fStatuses[0] = (uint16_t)ruleStatus;
}

int32_t RuleBasedBreakIterator::BreakCache::current() {
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    fBI->fDone = false;
    return fTextIdx;
}
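
// Move the cache and the break iterator to the first boundary following startPos.
// startPos itself need not be a boundary. If startPos is not already covered by the
// cache, populateNear() fills the cache around it first; next() then advances to the
// boundary that follows it.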
void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
        // startPos is in the cache. Do a next() from that position.
        // TODO: an awkward set of interactions with bi->fDone
        //       seek() does not clear it; it can't because of interactions with populateNear().
        //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
        //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
        fBI->fDone = false;
        next();
    }
    return;
}
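
// Move the cache and the break iterator to the boundary preceding startPos.
// If startPos is itself the current cached boundary, step back one boundary with previous();
// otherwise seek()/populateNear() already leave the cache on the preceding boundary, and
// current() just publishes that position to the break iterator.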
void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
        if (startPos == fTextIdx) {
            previous(status);
        } else {
            // seek() leaves the BreakCache positioned at the preceding boundary
            // if the requested position is between two boundaries.
            // current() pushes the BreakCache position out to the BreakIterator itself.
            U_ASSERT(startPos > fTextIdx);
            current();
        }
    }
    return;
}

/*
 * Out-of-line code for BreakCache::next().
 * Cache does not already contain the boundary
 */
void RuleBasedBreakIterator::BreakCache::nextOL() {
    fBI->fDone = !populateFollowing();
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    return;
}
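
// Step the cache position back to the preceding boundary. If the iteration position is
// already at the oldest cached boundary, populatePreceding() is called first to prepend
// earlier boundaries. Sets fBI->fDone when no earlier boundary could be produced.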
void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    int32_t initialBufIdx = fBufIdx;
    if (fBufIdx == fStartBufIdx) {
        // At start of cache. Prepend to it.
        populatePreceding(status);
    } else {
        // Cache already holds the next boundary
        fBufIdx = modChunkSize(fBufIdx - 1);
        fTextIdx = fBoundaries[fBufIdx];
    }
    fBI->fDone = (fBufIdx == initialBufIdx);
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    return;
}
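
// Reposition the cache to the boundary at or preceding pos, without adding anything new.
// Returns false if pos lies outside the range currently covered by the cache.
// Uses a binary search over the circular buffer of cached boundaries.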
UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
    if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
        return false;
    }
    if (pos == fBoundaries[fStartBufIdx]) {
        // Common case: seek(0), from BreakIterator::first()
        fBufIdx = fStartBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }
    if (pos == fBoundaries[fEndBufIdx]) {
        fBufIdx = fEndBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }

    int32_t min = fStartBufIdx;
    int32_t max = fEndBufIdx;
    while (min != max) {
        int32_t probe = (min + max + (min > max ? CACHE_SIZE : 0)) / 2;
        probe = modChunkSize(probe);
        if (fBoundaries[probe] > pos) {
            max = probe;
        } else {
            min = modChunkSize(probe + 1);
        }
    }
    U_ASSERT(fBoundaries[max] > pos);
    fBufIdx = modChunkSize(max - 1);
    fTextIdx = fBoundaries[fBufIdx];
    U_ASSERT(fTextIdx <= pos);
    return true;
}
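
// Rebuild or extend the cache so that it covers the requested position, which is known to
// lie outside the currently cached range (see the assertion below). Returns false only if
// a prior failure is reported in status; the detailed post-conditions are described in the
// comments at the top of the function body.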
UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

    // Add boundaries to the cache near the specified position.
    // The given position need not be a boundary itself.
    // The input position must be within the range of the text, and
    // on a code point boundary.
    // If the requested position is a break boundary, leave the iteration
    // position on it.
    // If the requested position is not a boundary, leave the iteration
    // position on the preceding boundary and include both the
    // preceding and following boundaries in the cache.
    // Additional boundaries, either preceding or following, may be added
    // to the cache as a side effect.

    // If the requested position is not near already cached positions, clear the existing cache,
    // find a near-by boundary and begin new cache contents there.

    // Threshold for a text position to be considered near to existing cache contents.
    // TODO: See issue ICU-22024 "perf tuning of Cache needed."
    //       This value is subject to change. See the ticket for more details.
    static constexpr int32_t CACHE_NEAR = 15;

    int32_t aBoundary = -1;
    int32_t ruleStatusIndex = 0;
    bool retainCache = false;
    if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
        // Requested position is near the existing cache. Retain it.
        retainCache = true;
    } else if (position <= CACHE_NEAR) {
        // Requested position is near the start of the text. Fill cache from start, skipping
        // the need to find a safe point.
        retainCache = false;
        aBoundary = 0;
    } else {
        // Requested position is not near the existing cache.
        // Find a safe point to refill the cache from.
        int32_t backupPos = fBI->handleSafePrevious(position);

        if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
            // The requested position is beyond the end of the existing cache, but the
            // reverse rules produced a position near or before the cached region.
            // Retain the existing cache, and fill from the end of it.
            retainCache = true;
        } else if (backupPos < CACHE_NEAR) {
            // The safe reverse rules moved us to near the start of text.
            // Take that (index 0) as the backup boundary, avoiding the complication
            // (in the following block) of moving forward from the safe point to a known boundary.
            //
            // Retain the cache if it begins not too far from the requested position.
            aBoundary = 0;
            retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
        } else {
            // The safe reverse rules produced a position that is neither near the existing
            // cache, nor near the start of text.
            // Advance to the boundary following.
            // There is a complication: the safe reverse rules identify pairs of code points
            // that are safe. If advancing from the safe point moves forwards by less than
            // two code points, we need to advance one more time to ensure that the boundary
            // is good, including a correct rules status value.
            retainCache = false;
            fBI->fPosition = backupPos;
            aBoundary = fBI->handleNext();
            if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
                // +4 is a quick test for possibly having advanced only one codepoint.
                // Four being the length of the longest potential code point, a supplementary in UTF-8
                utext_setNativeIndex(&fBI->fText, aBoundary);
                if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
                    // The initial handleNext() only advanced by a single code point. Go again.
                    aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
                }
            }
            if (aBoundary == UBRK_DONE) {
                // Note (Andy Heninger): I don't think this condition can occur, but it's hard
                // to prove that it can't. We ran off the end of the string looking for a boundary
                // following a safe point; choose the end of the string as that boundary.
                aBoundary = utext_nativeLength(&fBI->fText);
            }
            ruleStatusIndex = fBI->fRuleStatusIndex;
        }
    }

    if (!retainCache) {
        U_ASSERT(aBoundary != -1);
        reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
    }

    // Fill in boundaries between existing cache content and the new requested position.

    if (fBoundaries[fEndBufIdx] < position) {
        // The last position in the cache precedes the requested position.
        // Add following position(s) to the cache.
        while (fBoundaries[fEndBufIdx] < position) {
            if (!populateFollowing()) {
                UPRV_UNREACHABLE_EXIT;
            }
        }
        fBufIdx = fEndBufIdx;                 // Set iterator position to the end of the buffer.
        fTextIdx = fBoundaries[fBufIdx];      // Required because populateFollowing may add extra boundaries.
        while (fTextIdx > position) {         // Move backwards to a position at or preceding the requested pos.
            previous(status);
        }
        return true;
    }

    if (fBoundaries[fStartBufIdx] > position) {
        // The first position in the cache is beyond the requested position.
        // back up more until we get a boundary <= the requested position.
        while (fBoundaries[fStartBufIdx] > position) {
            populatePreceding(status);
        }
        fBufIdx = fStartBufIdx;               // Set iterator position to the start of the buffer.
        fTextIdx = fBoundaries[fBufIdx];      // Required because populatePreceding may add extra boundaries.
        while (fTextIdx < position) {         // Move forwards to a position at or following the requested pos.
            next();
        }
        if (fTextIdx > position) {
            // If position is not itself a boundary, the next() loop above will overshoot.
            // Back up one, leaving cache position at the boundary preceding the requested position.
            previous(status);
        }
        return true;
    }

    U_ASSERT(fTextIdx == position);
    return true;
}
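
// Add one or more boundaries to the end of the cache, starting from the last cached
// boundary. Dictionary-based boundaries are taken from the DictionaryCache when available;
// otherwise the rule-based engine is run, and a handful of extra boundaries are appended
// to speed up plain forward iteration. Returns false at the end of the text.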
UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
    int32_t fromPosition = fBoundaries[fEndBufIdx];
    int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
    int32_t pos = 0;
    int32_t ruleStatusIdx = 0;

    if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
        addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
        return true;
    }

    fBI->fPosition = fromPosition;
    pos = fBI->handleNext();
    if (pos == UBRK_DONE) {
        return false;
    }

    ruleStatusIdx = fBI->fRuleStatusIndex;
    if (fBI->fDictionaryCharCount > 0) {
        // The text segment obtained from the rules includes dictionary characters.
        // Subdivide it, with subdivided results going into the dictionary cache.
        fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
        if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
            addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
            return true;
            // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
            //       But be careful with interactions with populateNear().
        }
    }

    // Rule based segment did not include dictionary characters.
    // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
    // meaning that we didn't take the return, above.
    // Add its end point to the cache.
    addFollowing(pos, ruleStatusIdx, UpdateCachePosition);

    // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
    //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.)
    //
    for (int count = 0; count < 6; ++count) {
        pos = fBI->handleNext();
        if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
            break;
        }
        addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
    }

    return true;
}
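
// Add boundaries to the start of the cache, working backwards from the first cached
// boundary. Because the rules only run forwards, this backs up with the safe-reverse
// rules, runs forwards again, and collects the intermediate boundaries in fSideBuffer
// before prepending them to the circular buffer. Returns false if already at position 0.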
UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }

    int32_t fromPosition = fBoundaries[fStartBufIdx];
    if (fromPosition == 0) {
        return false;
    }

    int32_t position = 0;
    int32_t positionStatusIdx = 0;

    if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
        addPreceding(position, positionStatusIdx, UpdateCachePosition);
        return true;
    }

    int32_t backupPosition = fromPosition;

    // Find a boundary somewhere preceding the first already-cached boundary
    do {
        backupPosition = backupPosition - 30;
        if (backupPosition <= 0) {
            backupPosition = 0;
        } else {
            backupPosition = fBI->handleSafePrevious(backupPosition);
        }
        if (backupPosition == UBRK_DONE || backupPosition == 0) {
            position = 0;
            positionStatusIdx = 0;
        } else {
            // Advance to the boundary following the backup position.
            // There is a complication: the safe reverse rules identify pairs of code points
            // that are safe. If advancing from the safe point moves forwards by less than
            // two code points, we need to advance one more time to ensure that the boundary
            // is good, including a correct rules status value.
            //
            fBI->fPosition = backupPosition;
            position = fBI->handleNext();
            if (position <= backupPosition + 4) {
                // +4 is a quick test for possibly having advanced only one codepoint.
                // Four being the length of the longest potential code point, a supplementary in UTF-8
                utext_setNativeIndex(&fBI->fText, position);
                if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
                    // The initial handleNext() only advanced by a single code point. Go again.
                    position = fBI->handleNext();   // Safe rules identify safe pairs.
                }
            }
            positionStatusIdx = fBI->fRuleStatusIndex;
        }
    } while (position >= fromPosition);

    // Find boundaries between the one we just located and the first already-cached boundary
    // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.

    fSideBuffer.removeAllElements();
    fSideBuffer.addElement(position, status);
    fSideBuffer.addElement(positionStatusIdx, status);

    do {
        int32_t prevPosition = fBI->fPosition = position;
        int32_t prevStatusIdx = positionStatusIdx;
        position = fBI->handleNext();
        positionStatusIdx = fBI->fRuleStatusIndex;
        if (position == UBRK_DONE) {
            break;
        }

        UBool segmentHandledByDictionary = false;
        if (fBI->fDictionaryCharCount != 0) {
            // Segment from the rules includes dictionary characters.
            // Subdivide it, with subdivided results going into the dictionary cache.
            int32_t dictSegEndPosition = position;
            fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
            while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
                segmentHandledByDictionary = true;
                U_ASSERT(position > prevPosition);
                if (position >= fromPosition) {
                    break;
                }
                U_ASSERT(position <= dictSegEndPosition);
                fSideBuffer.addElement(position, status);
                fSideBuffer.addElement(positionStatusIdx, status);
                prevPosition = position;
            }
            U_ASSERT(position == dictSegEndPosition || position >= fromPosition);
        }

        if (!segmentHandledByDictionary && position < fromPosition) {
            fSideBuffer.addElement(position, status);
            fSideBuffer.addElement(positionStatusIdx, status);
        }
    } while (position < fromPosition);

    // Move boundaries from the side buffer to the main circular buffer.
    UBool success = false;
    if (!fSideBuffer.isEmpty()) {
        positionStatusIdx = fSideBuffer.popi();
        position = fSideBuffer.popi();
        addPreceding(position, positionStatusIdx, UpdateCachePosition);
        success = true;
    }

    while (!fSideBuffer.isEmpty()) {
        positionStatusIdx = fSideBuffer.popi();
        position = fSideBuffer.popi();
        if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
            // No space in circular buffer to hold a new preceding result while
            // also retaining the current cache (iteration) position.
            // Bailing out is safe; the cache will refill again if needed.
            break;
        }
    }

    return success;
}
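
// Append a boundary at the end of the circular buffer, evicting entries from the start
// if the buffer is full. With UpdateCachePosition the current iteration position moves to
// the new boundary; with RetainCachePosition it stays where it was.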
void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
    U_ASSERT(position > fBoundaries[fEndBufIdx]);
    U_ASSERT(ruleStatusIdx <= UINT16_MAX);
    int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
    if (nextIdx == fStartBufIdx) {
        fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
    }
    fBoundaries[nextIdx] = position;
    fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
    fEndBufIdx = nextIdx;
    if (update == UpdateCachePosition) {
        // Set current position to the newly added boundary.
        fBufIdx = nextIdx;
        fTextIdx = position;
    } else {
        // Retaining the original cache position.
        // Check if the added boundary wraps around the buffer, and would over-write the original position.
        // It's the responsibility of callers of this function to not add too many.
        U_ASSERT(nextIdx != fBufIdx);
    }
}
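
// Prepend a boundary at the start of the circular buffer, evicting the newest entry if the
// buffer is full. Returns false, without adding, when that eviction would overwrite the
// current iteration position and the caller asked to retain that position.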
bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
    U_ASSERT(position < fBoundaries[fStartBufIdx]);
    U_ASSERT(ruleStatusIdx <= UINT16_MAX);
    int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
    if (nextIdx == fEndBufIdx) {
        if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
            // Failure. The insertion of the new boundary would claim the buffer position that is the
            // current iteration position. And we also want to retain the current iteration position.
            // (The buffer is already completely full of entries that precede the iteration position.)
            return false;
        }
        fEndBufIdx = modChunkSize(fEndBufIdx - 1);
    }
    fBoundaries[nextIdx] = position;
    fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
    fStartBufIdx = nextIdx;
    if (update == UpdateCachePosition) {
        fBufIdx = nextIdx;
        fTextIdx = position;
    }
    return true;
}
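
// Debug-only helper: print the cached boundaries. Compiles to an empty function unless
// RBBI_DEBUG is defined.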
void RuleBasedBreakIterator::BreakCache::dumpCache() {
#ifdef RBBI_DEBUG
    RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx);
    for (int32_t i = fStartBufIdx; ; i = modChunkSize(i+1)) {
        RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]);
        if (i == fEndBufIdx) {
            break;
        }
    }
#endif
}

U_NAMESPACE_END

#endif // #if !UCONFIG_NO_BREAK_ITERATION