2020-08-11 11:10:23 +02:00
|
|
|
// © 2016 and later: Unicode, Inc. and others.
|
|
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
|
|
/*
|
|
|
|
**********************************************************************
|
|
|
|
* Copyright (c) 2003-2011, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
**********************************************************************
|
|
|
|
* Author: Alan Liu
|
|
|
|
* Created: September 24 2003
|
|
|
|
* Since: ICU 2.8
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
#include "ruleiter.h"
|
|
|
|
#include "unicode/parsepos.h"
|
|
|
|
#include "unicode/symtable.h"
|
|
|
|
#include "unicode/unistr.h"
|
|
|
|
#include "unicode/utf16.h"
|
|
|
|
#include "patternprops.h"
|
|
|
|
|
|
|
|
/* Longest escape notation we look ahead for, e.g. \U87654321 or \ud800\udc00 */
|
|
|
|
#define MAX_U_NOTATION_LEN 12
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
// Sets up an iterator that reads characters from theText, beginning at the
// index currently held by thePos.
//
// theText  the rule text to iterate over
// theSym   symbol table consulted for variable references; may be nullptr,
//          in which case next() never expands variables
// thePos   parse position that is read and updated as iteration proceeds
//
// The iterator starts outside any variable expansion: buf is null and
// bufPos is zero.
RuleCharacterIterator::RuleCharacterIterator(
        const UnicodeString& theText,
        const SymbolTable* theSym,
        ParsePosition& thePos)
    : text(theText), pos(thePos), sym(theSym), buf(nullptr), bufPos(0)
{
}
|
|
|
|
|
|
|
|
// Returns true only when no variable buffer is being read and the parse
// position has reached the end of the text.
UBool RuleCharacterIterator::atEnd() const {
    if (buf != nullptr) {
        // Still inside a variable expansion, so there is more to read.
        return false;
    }
    return pos.getIndex() == text.length();
}
|
|
|
|
|
|
|
|
// Returns the next code point and advances past it.
//
// options    bitmask of PARSE_VARIABLES, PARSE_ESCAPES and SKIP_WHITESPACE
// isEscaped  set to false on entry (when ec is not already a failure) and to
//            true once a backslash escape has been processed, including a
//            malformed one
// ec         if already a failure, DONE is returned immediately and nothing
//            else is touched; set to U_UNDEFINED_VARIABLE when a variable
//            name is not found in the symbol table, and to
//            U_MALFORMED_UNICODE_ESCAPE when an escape cannot be decoded
//
// Returns the code point read, or DONE on error. An isolated
// SymbolTable::SYMBOL_REF (one with no name after it) is returned as-is, so
// the caller must be prepared to see it.
UChar32 RuleCharacterIterator::next(int32_t options, UBool& isEscaped, UErrorCode& ec) {
    if (U_FAILURE(ec)) {
        return DONE;
    }
    isEscaped = false;

    // Decode the option bits once; none of them change during the call.
    const bool expandVariables = (options & PARSE_VARIABLES) != 0 && sym != nullptr;
    const bool skipWhitespace = (options & SKIP_WHITESPACE) != 0;
    const bool parseEscapes = (options & PARSE_ESCAPES) != 0;

    for (;;) {
        UChar32 ch = _current();
        _advance(U16_LENGTH(ch));

        // Variable references are only recognized in the main text
        // (buf == nullptr), never inside an already-expanded variable value.
        if (expandVariables && buf == nullptr && ch == SymbolTable::SYMBOL_REF) {
            const UnicodeString varName = sym->parseReference(text, pos, text.length());
            if (varName.length() == 0) {
                // Isolated SYMBOL_REF: hand it back to the caller unchanged.
                return ch;
            }
            buf = sym->lookup(varName);
            bufPos = 0;
            if (buf == nullptr) {
                ec = U_UNDEFINED_VARIABLE;
                return DONE;
            }
            if (buf->length() == 0) {
                // An empty variable value contributes nothing; drop it and
                // keep reading from the main text.
                buf = nullptr;
            }
            continue;
        }

        if (skipWhitespace && PatternProps::isWhiteSpace(ch)) {
            continue;
        }

        if (parseEscapes && ch == 0x5C /*'\\'*/) {
            UnicodeString escapeText;
            int32_t consumed = 0;
            ch = lookahead(escapeText, MAX_U_NOTATION_LEN).unescapeAt(consumed);
            jumpahead(consumed);
            isEscaped = true;
            if (ch < 0) {
                ec = U_MALFORMED_UNICODE_ESCAPE;
                return DONE;
            }
        }

        return ch;
    }
}
|
|
|
|
|
|
|
|
// Captures the current iteration state into p: the index in the main text,
// the active variable buffer (if any), and the offset within that buffer.
// The saved state can later be restored with setPos().
void RuleCharacterIterator::getPos(RuleCharacterIterator::Pos& p) const {
    p.pos    = pos.getIndex();
    p.buf    = buf;
    p.bufPos = bufPos;
}
|
|
|
|
|
|
|
|
// Restores the iteration state from p: the index in the main text, the
// active variable buffer pointer, and the offset within that buffer.
void RuleCharacterIterator::setPos(const RuleCharacterIterator::Pos& p) {
    pos.setIndex(p.pos);
    buf    = p.buf;
    bufPos = p.bufPos;
}
|
|
|
|
|
|
|
|
// Advances past any run of pattern white space at the current position.
// Does nothing unless SKIP_WHITESPACE is set in options.
void RuleCharacterIterator::skipIgnored(int32_t options) {
    if ((options & SKIP_WHITESPACE) == 0) {
        return;
    }
    // Stop at the first code point that is not white space; DONE (end of
    // text) also ends the loop because it is not white space.
    for (UChar32 ch = _current(); PatternProps::isWhiteSpace(ch); ch = _current()) {
        _advance(U16_LENGTH(ch));
    }
}
|
|
|
|
|
|
|
|
// Copies upcoming text into result without moving the iterator.
//
// result        receives the extracted text
// maxLookAhead  maximum number of code units to extract; a negative value
//               means "as much as is available"
//
// The text comes from the active variable buffer when there is one, and
// otherwise from the main text at the current parse position.
// Returns result.
UnicodeString& RuleCharacterIterator::lookahead(UnicodeString& result, int32_t maxLookAhead) const {
    const int32_t limit = (maxLookAhead < 0) ? 0x7FFFFFFF : maxLookAhead;
    if (buf == nullptr) {
        text.extract(pos.getIndex(), limit, result);
    } else {
        buf->extract(bufPos, limit, result);
    }
    return result;
}
|
|
|
|
|
|
|
|
// Moves the iterator forward by count code units. This is the public
// counterpart of _advance() and is used together with lookahead(): look at
// the upcoming text, decide how much of it was consumed, then jump past it.
void RuleCharacterIterator::jumpahead(int32_t count)
{
    _advance(count);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
UnicodeString& RuleCharacterIterator::toString(UnicodeString& result) const {
|
|
|
|
int32_t b = pos.getIndex();
|
|
|
|
text.extract(0, b, result);
|
2023-05-23 02:05:01 +02:00
|
|
|
return result.append((char16_t) 0x7C).append(text, b, 0x7FFFFFFF); // Insert '|' at index
|
2020-08-11 11:10:23 +02:00
|
|
|
}
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Returns the code point at the current position without advancing.
// Reads from the active variable buffer when there is one; otherwise reads
// from the main text, yielding DONE once the parse position is at or past
// the end of the text.
UChar32 RuleCharacterIterator::_current() const {
    if (buf != nullptr) {
        return buf->char32At(bufPos);
    }
    const int32_t index = pos.getIndex();
    if (index >= text.length()) {
        return static_cast<UChar32>(DONE);
    }
    return text.char32At(index);
}
|
|
|
|
|
|
|
|
// Moves the current position forward by count code units.
//
// While a variable buffer is active the offset inside that buffer is
// advanced, and the buffer is released as soon as the offset reaches or
// passes its end, so that reading resumes from the main text. Otherwise the
// parse position in the main text is advanced and clamped to the text length.
//
// Both branches tolerate an overshoot. The buffer test uses ">=" rather than
// "==" so that a count that steps past the end of the buffer cannot leave a
// stale buffer pointer behind, which would keep atEnd() false and make
// _current() read beyond the buffer's contents.
void RuleCharacterIterator::_advance(int32_t count) {
    if (buf != nullptr) {
        bufPos += count;
        if (bufPos >= buf->length()) {
            buf = nullptr;
        }
    } else {
        pos.setIndex(pos.getIndex() + count);
        if (pos.getIndex() > text.length()) {
            pos.setIndex(text.length());
        }
    }
}
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
|
|
|
//eof
|