aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/icu/i18n/collationruleparser.cpp
diff options
context:
space:
mode:
authorneksard <neksard@yandex-team.ru>2022-02-10 16:45:23 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:45:23 +0300
commit8f7cf138264e0caa318144bf8a2c950e0b0a8593 (patch)
tree83bf5c8c8047c42d8475e6095df90ccdc3d1b57f /contrib/libs/icu/i18n/collationruleparser.cpp
parentd3a398281c6fd1d3672036cb2d63f842d2cb28c5 (diff)
downloadydb-8f7cf138264e0caa318144bf8a2c950e0b0a8593.tar.gz
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/i18n/collationruleparser.cpp')
-rw-r--r--contrib/libs/icu/i18n/collationruleparser.cpp1752
1 files changed, 876 insertions, 876 deletions
diff --git a/contrib/libs/icu/i18n/collationruleparser.cpp b/contrib/libs/icu/i18n/collationruleparser.cpp
index ade6ecb552..a19b058a9d 100644
--- a/contrib/libs/icu/i18n/collationruleparser.cpp
+++ b/contrib/libs/icu/i18n/collationruleparser.cpp
@@ -1,881 +1,881 @@
// © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-*******************************************************************************
-* Copyright (C) 2013-2015, International Business Machines
-* Corporation and others. All Rights Reserved.
-*******************************************************************************
-* collationruleparser.cpp
-*
-* (replaced the former ucol_tok.cpp)
-*
-* created on: 2013apr10
-* created by: Markus W. Scherer
-*/
-
-#include "unicode/utypes.h"
-
-#if !UCONFIG_NO_COLLATION
-
-#include "unicode/normalizer2.h"
-#include "unicode/parseerr.h"
-#include "unicode/uchar.h"
-#include "unicode/ucol.h"
-#include "unicode/uloc.h"
-#include "unicode/unistr.h"
-#include "unicode/utf16.h"
-#include "charstr.h"
-#include "cmemory.h"
-#include "collation.h"
-#include "collationdata.h"
-#include "collationruleparser.h"
-#include "collationsettings.h"
-#include "collationtailoring.h"
-#include "cstring.h"
-#include "patternprops.h"
-#include "uassert.h"
-#include "uvectr32.h"
-
-U_NAMESPACE_BEGIN
-
-namespace {
-
-static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
-const int32_t BEFORE_LENGTH = 7;
-
-} // namespace
-
-CollationRuleParser::Sink::~Sink() {}
-
-void
-CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
-
-void
-CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
-
-CollationRuleParser::Importer::~Importer() {}
-
-CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
- : nfd(*Normalizer2::getNFDInstance(errorCode)),
- nfc(*Normalizer2::getNFCInstance(errorCode)),
- rules(NULL), baseData(base), settings(NULL),
- parseError(NULL), errorReason(NULL),
- sink(NULL), importer(NULL),
- ruleIndex(0) {
-}
-
-CollationRuleParser::~CollationRuleParser() {
-}
-
-void
-CollationRuleParser::parse(const UnicodeString &ruleString,
- CollationSettings &outSettings,
- UParseError *outParseError,
- UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return; }
- settings = &outSettings;
- parseError = outParseError;
- if(parseError != NULL) {
- parseError->line = 0;
- parseError->offset = -1;
- parseError->preContext[0] = 0;
- parseError->postContext[0] = 0;
- }
- errorReason = NULL;
- parse(ruleString, errorCode);
-}
-
-void
-CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return; }
- rules = &ruleString;
- ruleIndex = 0;
-
- while(ruleIndex < rules->length()) {
- UChar c = rules->charAt(ruleIndex);
- if(PatternProps::isWhiteSpace(c)) {
- ++ruleIndex;
- continue;
- }
- switch(c) {
- case 0x26: // '&'
- parseRuleChain(errorCode);
- break;
- case 0x5b: // '['
- parseSetting(errorCode);
- break;
- case 0x23: // '#' starts a comment, until the end of the line
- ruleIndex = skipComment(ruleIndex + 1);
- break;
- case 0x40: // '@' is equivalent to [backwards 2]
- settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
- UCOL_ON, 0, errorCode);
- ++ruleIndex;
- break;
- case 0x21: // '!' used to turn on Thai/Lao character reversal
- // Accept but ignore. The root collator has contractions
- // that are equivalent to the character reversal, where appropriate.
- ++ruleIndex;
- break;
- default:
- setParseError("expected a reset or setting or comment", errorCode);
- break;
- }
- if(U_FAILURE(errorCode)) { return; }
- }
-}
-
-void
-CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
- int32_t resetStrength = parseResetAndPosition(errorCode);
- UBool isFirstRelation = TRUE;
- for(;;) {
- int32_t result = parseRelationOperator(errorCode);
- if(U_FAILURE(errorCode)) { return; }
- if(result < 0) {
- if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
- // '#' starts a comment, until the end of the line
- ruleIndex = skipComment(ruleIndex + 1);
- continue;
- }
- if(isFirstRelation) {
- setParseError("reset not followed by a relation", errorCode);
- }
- return;
- }
- int32_t strength = result & STRENGTH_MASK;
- if(resetStrength < UCOL_IDENTICAL) {
- // reset-before rule chain
- if(isFirstRelation) {
- if(strength != resetStrength) {
- setParseError("reset-before strength differs from its first relation", errorCode);
- return;
- }
- } else {
- if(strength < resetStrength) {
- setParseError("reset-before strength followed by a stronger relation", errorCode);
- return;
- }
- }
- }
- int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
- if((result & STARRED_FLAG) == 0) {
- parseRelationStrings(strength, i, errorCode);
- } else {
- parseStarredCharacters(strength, i, errorCode);
- }
- if(U_FAILURE(errorCode)) { return; }
- isFirstRelation = FALSE;
- }
-}
-
-int32_t
-CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
- int32_t i = skipWhiteSpace(ruleIndex + 1);
- int32_t j;
- UChar c;
- int32_t resetStrength;
- if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
- (j = i + BEFORE_LENGTH) < rules->length() &&
- PatternProps::isWhiteSpace(rules->charAt(j)) &&
- ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
- 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
- rules->charAt(j + 1) == 0x5d) {
- // &[before n] with n=1 or 2 or 3
- resetStrength = UCOL_PRIMARY + (c - 0x31);
- i = skipWhiteSpace(j + 2);
- } else {
- resetStrength = UCOL_IDENTICAL;
- }
- if(i >= rules->length()) {
- setParseError("reset without position", errorCode);
- return UCOL_DEFAULT;
- }
- UnicodeString str;
- if(rules->charAt(i) == 0x5b) { // '['
- i = parseSpecialPosition(i, str, errorCode);
- } else {
- i = parseTailoringString(i, str, errorCode);
- }
- sink->addReset(resetStrength, str, errorReason, errorCode);
- if(U_FAILURE(errorCode)) { setErrorContext(); }
- ruleIndex = i;
- return resetStrength;
-}
-
-int32_t
-CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
- ruleIndex = skipWhiteSpace(ruleIndex);
- if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
- int32_t strength;
- int32_t i = ruleIndex;
- UChar c = rules->charAt(i++);
- switch(c) {
- case 0x3c: // '<'
- if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
- ++i;
- if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
- ++i;
- if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
- ++i;
- strength = UCOL_QUATERNARY;
- } else {
- strength = UCOL_TERTIARY;
- }
- } else {
- strength = UCOL_SECONDARY;
- }
- } else {
- strength = UCOL_PRIMARY;
- }
- if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
- ++i;
- strength |= STARRED_FLAG;
- }
- break;
- case 0x3b: // ';' same as <<
- strength = UCOL_SECONDARY;
- break;
- case 0x2c: // ',' same as <<<
- strength = UCOL_TERTIARY;
- break;
- case 0x3d: // '='
- strength = UCOL_IDENTICAL;
- if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
- ++i;
- strength |= STARRED_FLAG;
- }
- break;
- default:
- return UCOL_DEFAULT;
- }
- return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
-}
-
-void
-CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
- // Parse
- // prefix | str / extension
- // where prefix and extension are optional.
- UnicodeString prefix, str, extension;
- i = parseTailoringString(i, str, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
- if(next == 0x7c) { // '|' separates the context prefix from the string.
- prefix = str;
- i = parseTailoringString(i + 1, str, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- next = (i < rules->length()) ? rules->charAt(i) : 0;
- }
- if(next == 0x2f) { // '/' separates the string from the extension.
- i = parseTailoringString(i + 1, extension, errorCode);
- }
- if(!prefix.isEmpty()) {
- UChar32 prefix0 = prefix.char32At(0);
- UChar32 c = str.char32At(0);
- if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
- setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
- errorCode);
- return;
- }
- }
- sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
- if(U_FAILURE(errorCode)) { setErrorContext(); }
- ruleIndex = i;
-}
-
-void
-CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
- UnicodeString empty, raw;
- i = parseString(skipWhiteSpace(i), raw, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- if(raw.isEmpty()) {
- setParseError("missing starred-relation string", errorCode);
- return;
- }
- UChar32 prev = -1;
- int32_t j = 0;
- for(;;) {
- while(j < raw.length()) {
- UChar32 c = raw.char32At(j);
- if(!nfd.isInert(c)) {
- setParseError("starred-relation string is not all NFD-inert", errorCode);
- return;
- }
- sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
- if(U_FAILURE(errorCode)) {
- setErrorContext();
- return;
- }
- j += U16_LENGTH(c);
- prev = c;
- }
- if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
- break;
- }
- if(prev < 0) {
- setParseError("range without start in starred-relation string", errorCode);
- return;
- }
- i = parseString(i + 1, raw, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- if(raw.isEmpty()) {
- setParseError("range without end in starred-relation string", errorCode);
- return;
- }
- UChar32 c = raw.char32At(0);
- if(c < prev) {
- setParseError("range start greater than end in starred-relation string", errorCode);
- return;
- }
- // range prev-c
- UnicodeString s;
- while(++prev <= c) {
- if(!nfd.isInert(prev)) {
- setParseError("starred-relation string range is not all NFD-inert", errorCode);
- return;
- }
- if(U_IS_SURROGATE(prev)) {
- setParseError("starred-relation string range contains a surrogate", errorCode);
- return;
- }
- if(0xfffd <= prev && prev <= 0xffff) {
- setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
- return;
- }
- s.setTo(prev);
- sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
- if(U_FAILURE(errorCode)) {
- setErrorContext();
- return;
- }
- }
- prev = -1;
- j = U16_LENGTH(c);
- }
- ruleIndex = skipWhiteSpace(i);
-}
-
-int32_t
-CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
- i = parseString(skipWhiteSpace(i), raw, errorCode);
- if(U_SUCCESS(errorCode) && raw.isEmpty()) {
- setParseError("missing relation string", errorCode);
- }
- return skipWhiteSpace(i);
-}
-
-int32_t
-CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return i; }
- raw.remove();
- while(i < rules->length()) {
- UChar32 c = rules->charAt(i++);
- if(isSyntaxChar(c)) {
- if(c == 0x27) { // apostrophe
- if(i < rules->length() && rules->charAt(i) == 0x27) {
- // Double apostrophe, encodes a single one.
- raw.append((UChar)0x27);
- ++i;
- continue;
- }
- // Quote literal text until the next single apostrophe.
- for(;;) {
- if(i == rules->length()) {
- setParseError("quoted literal text missing terminating apostrophe", errorCode);
- return i;
- }
- c = rules->charAt(i++);
- if(c == 0x27) {
- if(i < rules->length() && rules->charAt(i) == 0x27) {
- // Double apostrophe inside quoted literal text,
- // still encodes a single apostrophe.
- ++i;
- } else {
- break;
- }
- }
- raw.append((UChar)c);
- }
- } else if(c == 0x5c) { // backslash
- if(i == rules->length()) {
- setParseError("backslash escape at the end of the rule string", errorCode);
- return i;
- }
- c = rules->char32At(i);
- raw.append(c);
- i += U16_LENGTH(c);
- } else {
- // Any other syntax character terminates a string.
- --i;
- break;
- }
- } else if(PatternProps::isWhiteSpace(c)) {
- // Unquoted white space terminates a string.
- --i;
- break;
- } else {
- raw.append((UChar)c);
- }
- }
- for(int32_t j = 0; j < raw.length();) {
- UChar32 c = raw.char32At(j);
- if(U_IS_SURROGATE(c)) {
- setParseError("string contains an unpaired surrogate", errorCode);
- return i;
- }
- if(0xfffd <= c && c <= 0xffff) {
- setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
- return i;
- }
- j += U16_LENGTH(c);
- }
- return i;
-}
-
-namespace {
-
-static const char *const positions[] = {
- "first tertiary ignorable",
- "last tertiary ignorable",
- "first secondary ignorable",
- "last secondary ignorable",
- "first primary ignorable",
- "last primary ignorable",
- "first variable",
- "last variable",
- "first regular",
- "last regular",
- "first implicit",
- "last implicit",
- "first trailing",
- "last trailing"
-};
-
-} // namespace
-
-int32_t
-CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return 0; }
- UnicodeString raw;
- int32_t j = readWords(i + 1, raw);
- if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
- ++j;
- for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
- if(raw == UnicodeString(positions[pos], -1, US_INV)) {
- str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
- return j;
- }
- }
- if(raw == UNICODE_STRING_SIMPLE("top")) {
- str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
- return j;
- }
- if(raw == UNICODE_STRING_SIMPLE("variable top")) {
- str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
- return j;
- }
- }
- setParseError("not a valid special reset position", errorCode);
- return i;
-}
-
-void
-CollationRuleParser::parseSetting(UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return; }
- UnicodeString raw;
- int32_t i = ruleIndex + 1;
- int32_t j = readWords(i, raw);
- if(j <= i || raw.isEmpty()) {
- setParseError("expected a setting/option at '['", errorCode);
- }
- if(rules->charAt(j) == 0x5d) { // words end with ]
- ++j;
- if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
- (raw.length() == 7 || raw.charAt(7) == 0x20)) {
- parseReordering(raw, errorCode);
- ruleIndex = j;
- return;
- }
- if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
- settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
- UCOL_ON, 0, errorCode);
- ruleIndex = j;
- return;
- }
- UnicodeString v;
- int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
- if(valueIndex >= 0) {
- v.setTo(raw, valueIndex + 1);
- raw.truncate(valueIndex);
- }
- if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
- int32_t value = UCOL_DEFAULT;
- UChar c = v.charAt(0);
- if(0x31 <= c && c <= 0x34) { // 1..4
- value = UCOL_PRIMARY + (c - 0x31);
- } else if(c == 0x49) { // 'I'
- value = UCOL_IDENTICAL;
- }
- if(value != UCOL_DEFAULT) {
- settings->setStrength(value, 0, errorCode);
- ruleIndex = j;
- return;
- }
- } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
- UColAttributeValue value = UCOL_DEFAULT;
- if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
- value = UCOL_NON_IGNORABLE;
- } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
- value = UCOL_SHIFTED;
- }
- if(value != UCOL_DEFAULT) {
- settings->setAlternateHandling(value, 0, errorCode);
- ruleIndex = j;
- return;
- }
- } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
- int32_t value = UCOL_DEFAULT;
- if(v == UNICODE_STRING_SIMPLE("space")) {
- value = CollationSettings::MAX_VAR_SPACE;
- } else if(v == UNICODE_STRING_SIMPLE("punct")) {
- value = CollationSettings::MAX_VAR_PUNCT;
- } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
- value = CollationSettings::MAX_VAR_SYMBOL;
- } else if(v == UNICODE_STRING_SIMPLE("currency")) {
- value = CollationSettings::MAX_VAR_CURRENCY;
- }
- if(value != UCOL_DEFAULT) {
- settings->setMaxVariable(value, 0, errorCode);
- settings->variableTop = baseData->getLastPrimaryForGroup(
- UCOL_REORDER_CODE_FIRST + value);
- U_ASSERT(settings->variableTop != 0);
- ruleIndex = j;
- return;
- }
- } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
- UColAttributeValue value = UCOL_DEFAULT;
- if(v == UNICODE_STRING_SIMPLE("off")) {
- value = UCOL_OFF;
- } else if(v == UNICODE_STRING_SIMPLE("lower")) {
- value = UCOL_LOWER_FIRST;
- } else if(v == UNICODE_STRING_SIMPLE("upper")) {
- value = UCOL_UPPER_FIRST;
- }
- if(value != UCOL_DEFAULT) {
- settings->setCaseFirst(value, 0, errorCode);
- ruleIndex = j;
- return;
- }
- } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
- UColAttributeValue value = getOnOffValue(v);
- if(value != UCOL_DEFAULT) {
- settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
- ruleIndex = j;
- return;
- }
- } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
- UColAttributeValue value = getOnOffValue(v);
- if(value != UCOL_DEFAULT) {
- settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
- ruleIndex = j;
- return;
- }
- } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
- UColAttributeValue value = getOnOffValue(v);
- if(value != UCOL_DEFAULT) {
- settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
- ruleIndex = j;
- return;
- }
- } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
- UColAttributeValue value = getOnOffValue(v);
- if(value != UCOL_DEFAULT) {
- if(value == UCOL_ON) {
- setParseError("[hiraganaQ on] is not supported", errorCode);
- }
- ruleIndex = j;
- return;
- }
- } else if(raw == UNICODE_STRING_SIMPLE("import")) {
- CharString lang;
- lang.appendInvariantChars(v, errorCode);
- if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
- // BCP 47 language tag -> ICU locale ID
- char localeID[ULOC_FULLNAME_CAPACITY];
- int32_t parsedLength;
- int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
- &parsedLength, &errorCode);
- if(U_FAILURE(errorCode) ||
- parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
- errorCode = U_ZERO_ERROR;
- setParseError("expected language tag in [import langTag]", errorCode);
- return;
- }
- // localeID minus all keywords
- char baseID[ULOC_FULLNAME_CAPACITY];
- length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
- if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
- errorCode = U_ZERO_ERROR;
- setParseError("expected language tag in [import langTag]", errorCode);
- return;
- }
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2013-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* collationruleparser.cpp
+*
+* (replaced the former ucol_tok.cpp)
+*
+* created on: 2013apr10
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "unicode/normalizer2.h"
+#include "unicode/parseerr.h"
+#include "unicode/uchar.h"
+#include "unicode/ucol.h"
+#include "unicode/uloc.h"
+#include "unicode/unistr.h"
+#include "unicode/utf16.h"
+#include "charstr.h"
+#include "cmemory.h"
+#include "collation.h"
+#include "collationdata.h"
+#include "collationruleparser.h"
+#include "collationsettings.h"
+#include "collationtailoring.h"
+#include "cstring.h"
+#include "patternprops.h"
+#include "uassert.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+namespace {
+
+static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
+const int32_t BEFORE_LENGTH = 7;
+
+} // namespace
+
+CollationRuleParser::Sink::~Sink() {}
+
+void
+CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
+
+void
+CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
+
+CollationRuleParser::Importer::~Importer() {}
+
+CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
+ : nfd(*Normalizer2::getNFDInstance(errorCode)),
+ nfc(*Normalizer2::getNFCInstance(errorCode)),
+ rules(NULL), baseData(base), settings(NULL),
+ parseError(NULL), errorReason(NULL),
+ sink(NULL), importer(NULL),
+ ruleIndex(0) {
+}
+
+CollationRuleParser::~CollationRuleParser() {
+}
+
+void
+CollationRuleParser::parse(const UnicodeString &ruleString,
+ CollationSettings &outSettings,
+ UParseError *outParseError,
+ UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return; }
+ settings = &outSettings;
+ parseError = outParseError;
+ if(parseError != NULL) {
+ parseError->line = 0;
+ parseError->offset = -1;
+ parseError->preContext[0] = 0;
+ parseError->postContext[0] = 0;
+ }
+ errorReason = NULL;
+ parse(ruleString, errorCode);
+}
+
+void
+CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return; }
+ rules = &ruleString;
+ ruleIndex = 0;
+
+ while(ruleIndex < rules->length()) {
+ UChar c = rules->charAt(ruleIndex);
+ if(PatternProps::isWhiteSpace(c)) {
+ ++ruleIndex;
+ continue;
+ }
+ switch(c) {
+ case 0x26: // '&'
+ parseRuleChain(errorCode);
+ break;
+ case 0x5b: // '['
+ parseSetting(errorCode);
+ break;
+ case 0x23: // '#' starts a comment, until the end of the line
+ ruleIndex = skipComment(ruleIndex + 1);
+ break;
+ case 0x40: // '@' is equivalent to [backwards 2]
+ settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
+ UCOL_ON, 0, errorCode);
+ ++ruleIndex;
+ break;
+ case 0x21: // '!' used to turn on Thai/Lao character reversal
+ // Accept but ignore. The root collator has contractions
+ // that are equivalent to the character reversal, where appropriate.
+ ++ruleIndex;
+ break;
+ default:
+ setParseError("expected a reset or setting or comment", errorCode);
+ break;
+ }
+ if(U_FAILURE(errorCode)) { return; }
+ }
+}
+
+void
+CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
+ int32_t resetStrength = parseResetAndPosition(errorCode);
+ UBool isFirstRelation = TRUE;
+ for(;;) {
+ int32_t result = parseRelationOperator(errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ if(result < 0) {
+ if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
+ // '#' starts a comment, until the end of the line
+ ruleIndex = skipComment(ruleIndex + 1);
+ continue;
+ }
+ if(isFirstRelation) {
+ setParseError("reset not followed by a relation", errorCode);
+ }
+ return;
+ }
+ int32_t strength = result & STRENGTH_MASK;
+ if(resetStrength < UCOL_IDENTICAL) {
+ // reset-before rule chain
+ if(isFirstRelation) {
+ if(strength != resetStrength) {
+ setParseError("reset-before strength differs from its first relation", errorCode);
+ return;
+ }
+ } else {
+ if(strength < resetStrength) {
+ setParseError("reset-before strength followed by a stronger relation", errorCode);
+ return;
+ }
+ }
+ }
+ int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
+ if((result & STARRED_FLAG) == 0) {
+ parseRelationStrings(strength, i, errorCode);
+ } else {
+ parseStarredCharacters(strength, i, errorCode);
+ }
+ if(U_FAILURE(errorCode)) { return; }
+ isFirstRelation = FALSE;
+ }
+}
+
+int32_t
+CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
+ int32_t i = skipWhiteSpace(ruleIndex + 1);
+ int32_t j;
+ UChar c;
+ int32_t resetStrength;
+ if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
+ (j = i + BEFORE_LENGTH) < rules->length() &&
+ PatternProps::isWhiteSpace(rules->charAt(j)) &&
+ ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
+ 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
+ rules->charAt(j + 1) == 0x5d) {
+ // &[before n] with n=1 or 2 or 3
+ resetStrength = UCOL_PRIMARY + (c - 0x31);
+ i = skipWhiteSpace(j + 2);
+ } else {
+ resetStrength = UCOL_IDENTICAL;
+ }
+ if(i >= rules->length()) {
+ setParseError("reset without position", errorCode);
+ return UCOL_DEFAULT;
+ }
+ UnicodeString str;
+ if(rules->charAt(i) == 0x5b) { // '['
+ i = parseSpecialPosition(i, str, errorCode);
+ } else {
+ i = parseTailoringString(i, str, errorCode);
+ }
+ sink->addReset(resetStrength, str, errorReason, errorCode);
+ if(U_FAILURE(errorCode)) { setErrorContext(); }
+ ruleIndex = i;
+ return resetStrength;
+}
+
+int32_t
+CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
+ ruleIndex = skipWhiteSpace(ruleIndex);
+ if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
+ int32_t strength;
+ int32_t i = ruleIndex;
+ UChar c = rules->charAt(i++);
+ switch(c) {
+ case 0x3c: // '<'
+ if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
+ ++i;
+ if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
+ ++i;
+ if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
+ ++i;
+ strength = UCOL_QUATERNARY;
+ } else {
+ strength = UCOL_TERTIARY;
+ }
+ } else {
+ strength = UCOL_SECONDARY;
+ }
+ } else {
+ strength = UCOL_PRIMARY;
+ }
+ if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
+ ++i;
+ strength |= STARRED_FLAG;
+ }
+ break;
+ case 0x3b: // ';' same as <<
+ strength = UCOL_SECONDARY;
+ break;
+ case 0x2c: // ',' same as <<<
+ strength = UCOL_TERTIARY;
+ break;
+ case 0x3d: // '='
+ strength = UCOL_IDENTICAL;
+ if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
+ ++i;
+ strength |= STARRED_FLAG;
+ }
+ break;
+ default:
+ return UCOL_DEFAULT;
+ }
+ return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
+}
+
+void
+CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
+ // Parse
+ // prefix | str / extension
+ // where prefix and extension are optional.
+ UnicodeString prefix, str, extension;
+ i = parseTailoringString(i, str, errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
+ if(next == 0x7c) { // '|' separates the context prefix from the string.
+ prefix = str;
+ i = parseTailoringString(i + 1, str, errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ next = (i < rules->length()) ? rules->charAt(i) : 0;
+ }
+ if(next == 0x2f) { // '/' separates the string from the extension.
+ i = parseTailoringString(i + 1, extension, errorCode);
+ }
+ if(!prefix.isEmpty()) {
+ UChar32 prefix0 = prefix.char32At(0);
+ UChar32 c = str.char32At(0);
+ if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
+ setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
+ errorCode);
+ return;
+ }
+ }
+ sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
+ if(U_FAILURE(errorCode)) { setErrorContext(); }
+ ruleIndex = i;
+}
+
+void
+CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
+ UnicodeString empty, raw;
+ i = parseString(skipWhiteSpace(i), raw, errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ if(raw.isEmpty()) {
+ setParseError("missing starred-relation string", errorCode);
+ return;
+ }
+ UChar32 prev = -1;
+ int32_t j = 0;
+ for(;;) {
+ while(j < raw.length()) {
+ UChar32 c = raw.char32At(j);
+ if(!nfd.isInert(c)) {
+ setParseError("starred-relation string is not all NFD-inert", errorCode);
+ return;
+ }
+ sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
+ if(U_FAILURE(errorCode)) {
+ setErrorContext();
+ return;
+ }
+ j += U16_LENGTH(c);
+ prev = c;
+ }
+ if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
+ break;
+ }
+ if(prev < 0) {
+ setParseError("range without start in starred-relation string", errorCode);
+ return;
+ }
+ i = parseString(i + 1, raw, errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ if(raw.isEmpty()) {
+ setParseError("range without end in starred-relation string", errorCode);
+ return;
+ }
+ UChar32 c = raw.char32At(0);
+ if(c < prev) {
+ setParseError("range start greater than end in starred-relation string", errorCode);
+ return;
+ }
+ // range prev-c
+ UnicodeString s;
+ while(++prev <= c) {
+ if(!nfd.isInert(prev)) {
+ setParseError("starred-relation string range is not all NFD-inert", errorCode);
+ return;
+ }
+ if(U_IS_SURROGATE(prev)) {
+ setParseError("starred-relation string range contains a surrogate", errorCode);
+ return;
+ }
+ if(0xfffd <= prev && prev <= 0xffff) {
+ setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
+ return;
+ }
+ s.setTo(prev);
+ sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
+ if(U_FAILURE(errorCode)) {
+ setErrorContext();
+ return;
+ }
+ }
+ prev = -1;
+ j = U16_LENGTH(c);
+ }
+ ruleIndex = skipWhiteSpace(i);
+}
+
+int32_t
+CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
+ i = parseString(skipWhiteSpace(i), raw, errorCode);
+ if(U_SUCCESS(errorCode) && raw.isEmpty()) {
+ setParseError("missing relation string", errorCode);
+ }
+ return skipWhiteSpace(i);
+}
+
+int32_t
+CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return i; }
+ raw.remove();
+ while(i < rules->length()) {
+ UChar32 c = rules->charAt(i++);
+ if(isSyntaxChar(c)) {
+ if(c == 0x27) { // apostrophe
+ if(i < rules->length() && rules->charAt(i) == 0x27) {
+ // Double apostrophe, encodes a single one.
+ raw.append((UChar)0x27);
+ ++i;
+ continue;
+ }
+ // Quote literal text until the next single apostrophe.
+ for(;;) {
+ if(i == rules->length()) {
+ setParseError("quoted literal text missing terminating apostrophe", errorCode);
+ return i;
+ }
+ c = rules->charAt(i++);
+ if(c == 0x27) {
+ if(i < rules->length() && rules->charAt(i) == 0x27) {
+ // Double apostrophe inside quoted literal text,
+ // still encodes a single apostrophe.
+ ++i;
+ } else {
+ break;
+ }
+ }
+ raw.append((UChar)c);
+ }
+ } else if(c == 0x5c) { // backslash
+ if(i == rules->length()) {
+ setParseError("backslash escape at the end of the rule string", errorCode);
+ return i;
+ }
+ c = rules->char32At(i);
+ raw.append(c);
+ i += U16_LENGTH(c);
+ } else {
+ // Any other syntax character terminates a string.
+ --i;
+ break;
+ }
+ } else if(PatternProps::isWhiteSpace(c)) {
+ // Unquoted white space terminates a string.
+ --i;
+ break;
+ } else {
+ raw.append((UChar)c);
+ }
+ }
+ for(int32_t j = 0; j < raw.length();) {
+ UChar32 c = raw.char32At(j);
+ if(U_IS_SURROGATE(c)) {
+ setParseError("string contains an unpaired surrogate", errorCode);
+ return i;
+ }
+ if(0xfffd <= c && c <= 0xffff) {
+ setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
+ return i;
+ }
+ j += U16_LENGTH(c);
+ }
+ return i;
+}
+
+namespace {
+
+static const char *const positions[] = {
+ "first tertiary ignorable",
+ "last tertiary ignorable",
+ "first secondary ignorable",
+ "last secondary ignorable",
+ "first primary ignorable",
+ "last primary ignorable",
+ "first variable",
+ "last variable",
+ "first regular",
+ "last regular",
+ "first implicit",
+ "last implicit",
+ "first trailing",
+ "last trailing"
+};
+
+} // namespace
+
+// Parses the words of a bracketed special reset position; the words are read
+// starting at index i + 1 and must be followed by ']'.
+// On success, str is set to POS_LEAD followed by POS_BASE + the position
+// offset, and the index after the ']' is returned.
+// Otherwise a parse error is set and i is returned.
+// Returns 0 without doing anything if errorCode already indicates a failure.
+int32_t
+CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return 0; }
+    UnicodeString words;
+    const int32_t wordsLimit = readWords(i + 1, words);
+    // The words must be non-empty and must end with ']'.
+    if(wordsLimit <= i || rules->charAt(wordsLimit) != 0x5d || words.isEmpty()) {
+        setParseError("not a valid special reset position", errorCode);
+        return i;
+    }
+    const int32_t afterBracket = wordsLimit + 1;
+    const int32_t numPositions = UPRV_LENGTHOF(positions);
+    for(int32_t k = 0; k < numPositions; ++k) {
+        if(words == UnicodeString(positions[k], -1, US_INV)) {
+            str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + k));
+            return afterBracket;
+        }
+    }
+    // "top" is handled like "last regular".
+    if(words == UNICODE_STRING_SIMPLE("top")) {
+        str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
+        return afterBracket;
+    }
+    // "variable top" is handled like "last variable".
+    if(words == UNICODE_STRING_SIMPLE("variable top")) {
+        str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
+        return afterBracket;
+    }
+    setParseError("not a valid special reset position", errorCode);
+    return i;
+}
+
+// Parses one bracketed setting/option whose '[' is at ruleIndex, for example
+// "[strength 2]", "[reorder Grek]" or "[optimize [a-z]]".
+// The words after '[' are read with readWords(); they must be followed either
+// by ']' (keyword plus optional value) or by '[' (keyword plus UnicodeSet).
+// When a setting is recognized it is applied and ruleIndex is moved past
+// the closing ']'. Otherwise setParseError() records the reason.
+void
+CollationRuleParser::parseSetting(UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return; }
+    UnicodeString raw;
+    int32_t i = ruleIndex + 1;
+    int32_t j = readWords(i, raw);
+    if(j <= i || raw.isEmpty()) {
+        setParseError("expected a setting/option at '['", errorCode);
+        // Stop here: j is not a usable index (readWords() returns 0 at the end
+        // of the rules), and continuing could replace this error reason with
+        // a misleading one from the UnicodeSet branch below.
+        return;
+    }
+    if(rules->charAt(j) == 0x5d) {  // words end with ]
+        ++j;
+        // "reorder" alone or "reorder" followed by space-separated codes.
+        if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
+                (raw.length() == 7 || raw.charAt(7) == 0x20)) {
+            parseReordering(raw, errorCode);
+            ruleIndex = j;
+            return;
+        }
+        if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
+            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
+                              UCOL_ON, 0, errorCode);
+            ruleIndex = j;
+            return;
+        }
+        // Split "keyword value" at the last space: raw keeps the keyword,
+        // v receives the value (v stays empty if there is no space).
+        UnicodeString v;
+        int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
+        if(valueIndex >= 0) {
+            v.setTo(raw, valueIndex + 1);
+            raw.truncate(valueIndex);
+        }
+        if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
+            int32_t value = UCOL_DEFAULT;
+            UChar c = v.charAt(0);
+            if(0x31 <= c && c <= 0x34) {  // 1..4
+                value = UCOL_PRIMARY + (c - 0x31);
+            } else if(c == 0x49) {  // 'I'
+                value = UCOL_IDENTICAL;
+            }
+            if(value != UCOL_DEFAULT) {
+                settings->setStrength(value, 0, errorCode);
+                ruleIndex = j;
+                return;
+            }
+        } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
+            UColAttributeValue value = UCOL_DEFAULT;
+            if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
+                value = UCOL_NON_IGNORABLE;
+            } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
+                value = UCOL_SHIFTED;
+            }
+            if(value != UCOL_DEFAULT) {
+                settings->setAlternateHandling(value, 0, errorCode);
+                ruleIndex = j;
+                return;
+            }
+        } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
+            int32_t value = UCOL_DEFAULT;
+            if(v == UNICODE_STRING_SIMPLE("space")) {
+                value = CollationSettings::MAX_VAR_SPACE;
+            } else if(v == UNICODE_STRING_SIMPLE("punct")) {
+                value = CollationSettings::MAX_VAR_PUNCT;
+            } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
+                value = CollationSettings::MAX_VAR_SYMBOL;
+            } else if(v == UNICODE_STRING_SIMPLE("currency")) {
+                value = CollationSettings::MAX_VAR_CURRENCY;
+            }
+            if(value != UCOL_DEFAULT) {
+                settings->setMaxVariable(value, 0, errorCode);
+                settings->variableTop = baseData->getLastPrimaryForGroup(
+                    UCOL_REORDER_CODE_FIRST + value);
+                U_ASSERT(settings->variableTop != 0);
+                ruleIndex = j;
+                return;
+            }
+        } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
+            UColAttributeValue value = UCOL_DEFAULT;
+            if(v == UNICODE_STRING_SIMPLE("off")) {
+                value = UCOL_OFF;
+            } else if(v == UNICODE_STRING_SIMPLE("lower")) {
+                value = UCOL_LOWER_FIRST;
+            } else if(v == UNICODE_STRING_SIMPLE("upper")) {
+                value = UCOL_UPPER_FIRST;
+            }
+            if(value != UCOL_DEFAULT) {
+                settings->setCaseFirst(value, 0, errorCode);
+                ruleIndex = j;
+                return;
+            }
+        } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
+            UColAttributeValue value = getOnOffValue(v);
+            if(value != UCOL_DEFAULT) {
+                settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
+                ruleIndex = j;
+                return;
+            }
+        } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
+            UColAttributeValue value = getOnOffValue(v);
+            if(value != UCOL_DEFAULT) {
+                settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
+                ruleIndex = j;
+                return;
+            }
+        } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
+            UColAttributeValue value = getOnOffValue(v);
+            if(value != UCOL_DEFAULT) {
+                settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
+                ruleIndex = j;
+                return;
+            }
+        } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
+            // "off" is accepted without changing any setting; "on" is rejected.
+            UColAttributeValue value = getOnOffValue(v);
+            if(value != UCOL_DEFAULT) {
+                if(value == UCOL_ON) {
+                    setParseError("[hiraganaQ on] is not supported", errorCode);
+                }
+                ruleIndex = j;
+                return;
+            }
+        } else if(raw == UNICODE_STRING_SIMPLE("import")) {
+            CharString lang;
+            lang.appendInvariantChars(v, errorCode);
+            if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
+            // BCP 47 language tag -> ICU locale ID
+            char localeID[ULOC_FULLNAME_CAPACITY];
+            int32_t parsedLength;
+            int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
+                                                 &parsedLength, &errorCode);
+            if(U_FAILURE(errorCode) ||
+                    parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
+                errorCode = U_ZERO_ERROR;
+                setParseError("expected language tag in [import langTag]", errorCode);
+                return;
+            }
+            // localeID minus all keywords
+            char baseID[ULOC_FULLNAME_CAPACITY];
+            length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
+            if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
+                errorCode = U_ZERO_ERROR;
+                setParseError("expected language tag in [import langTag]", errorCode);
+                return;
+            }
             if(length == 0) {
- uprv_strcpy(baseID, "root");
+                uprv_strcpy(baseID, "root");
             } else if(*baseID == '_') {
                 uprv_memmove(baseID + 3, baseID, length + 1);
                 uprv_memcpy(baseID, "und", 3);
- }
- // @collation=type, or length=0 if not specified
- char collationType[ULOC_KEYWORDS_CAPACITY];
- length = uloc_getKeywordValue(localeID, "collation",
- collationType, ULOC_KEYWORDS_CAPACITY,
- &errorCode);
- if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
- errorCode = U_ZERO_ERROR;
- setParseError("expected language tag in [import langTag]", errorCode);
- return;
- }
- if(importer == NULL) {
- setParseError("[import langTag] is not supported", errorCode);
- } else {
- UnicodeString importedRules;
- importer->getRules(baseID, length > 0 ? collationType : "standard",
- importedRules, errorReason, errorCode);
- if(U_FAILURE(errorCode)) {
- if(errorReason == NULL) {
- errorReason = "[import langTag] failed";
- }
- setErrorContext();
- return;
- }
- const UnicodeString *outerRules = rules;
- int32_t outerRuleIndex = ruleIndex;
- parse(importedRules, errorCode);
- if(U_FAILURE(errorCode)) {
- if(parseError != NULL) {
- parseError->offset = outerRuleIndex;
- }
- }
- rules = outerRules;
- ruleIndex = j;
- }
- return;
- }
- } else if(rules->charAt(j) == 0x5b) { // words end with [
- UnicodeSet set;
- j = parseUnicodeSet(j, set, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- if(raw == UNICODE_STRING_SIMPLE("optimize")) {
- sink->optimize(set, errorReason, errorCode);
- if(U_FAILURE(errorCode)) { setErrorContext(); }
- ruleIndex = j;
- return;
- } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
- sink->suppressContractions(set, errorReason, errorCode);
- if(U_FAILURE(errorCode)) { setErrorContext(); }
- ruleIndex = j;
- return;
- }
- }
- setParseError("not a valid setting/option", errorCode);
-}
-
-void
-CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return; }
- int32_t i = 7; // after "reorder"
- if(i == raw.length()) {
- // empty [reorder] with no codes
- settings->resetReordering();
- return;
- }
- // Parse the codes in [reorder aa bb cc].
- UVector32 reorderCodes(errorCode);
- if(U_FAILURE(errorCode)) { return; }
- CharString word;
- while(i < raw.length()) {
- ++i; // skip the word-separating space
- int32_t limit = raw.indexOf((UChar)0x20, i);
- if(limit < 0) { limit = raw.length(); }
- word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
- if(U_FAILURE(errorCode)) { return; }
- int32_t code = getReorderCode(word.data());
- if(code < 0) {
- setParseError("unknown script or reorder code", errorCode);
- return;
- }
- reorderCodes.addElement(code, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- i = limit;
- }
- settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
-}
-
-static const char *const gSpecialReorderCodes[] = {
- "space", "punct", "symbol", "currency", "digit"
-};
-
-int32_t
-CollationRuleParser::getReorderCode(const char *word) {
- for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
- if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
- return UCOL_REORDER_CODE_FIRST + i;
- }
- }
- int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
- if(script >= 0) {
- return script;
- }
- if(uprv_stricmp(word, "others") == 0) {
- return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
- }
- return -1;
-}
-
-UColAttributeValue
-CollationRuleParser::getOnOffValue(const UnicodeString &s) {
- if(s == UNICODE_STRING_SIMPLE("on")) {
- return UCOL_ON;
- } else if(s == UNICODE_STRING_SIMPLE("off")) {
- return UCOL_OFF;
- } else {
- return UCOL_DEFAULT;
- }
-}
-
-int32_t
-CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
- // Collect a UnicodeSet pattern between a balanced pair of [brackets].
- int32_t level = 0;
- int32_t j = i;
- for(;;) {
- if(j == rules->length()) {
- setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
- return j;
- }
- UChar c = rules->charAt(j++);
- if(c == 0x5b) { // '['
- ++level;
- } else if(c == 0x5d) { // ']'
- if(--level == 0) { break; }
- }
- }
- set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
- if(U_FAILURE(errorCode)) {
- errorCode = U_ZERO_ERROR;
- setParseError("not a valid UnicodeSet pattern", errorCode);
- return j;
- }
- j = skipWhiteSpace(j);
- if(j == rules->length() || rules->charAt(j) != 0x5d) {
- setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
- return j;
- }
- return ++j;
-}
-
-int32_t
-CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
- static const UChar sp = 0x20;
- raw.remove();
- i = skipWhiteSpace(i);
- for(;;) {
- if(i >= rules->length()) { return 0; }
- UChar c = rules->charAt(i);
- if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
- if(raw.isEmpty()) { return i; }
- if(raw.endsWith(&sp, 1)) { // remove trailing space
- raw.truncate(raw.length() - 1);
- }
- return i;
- }
- if(PatternProps::isWhiteSpace(c)) {
- raw.append(sp);
- i = skipWhiteSpace(i + 1);
- } else {
- raw.append(c);
- ++i;
- }
- }
-}
-
-int32_t
-CollationRuleParser::skipComment(int32_t i) const {
- // skip to past the newline
- while(i < rules->length()) {
- UChar c = rules->charAt(i++);
- // LF or FF or CR or NEL or LS or PS
- if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
- // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
- // NLF (new line function) = CR or LF or CR+LF or NEL.
- // No need to collect all of CR+LF because a following LF will be ignored anyway.
- break;
- }
- }
- return i;
-}
-
-void
-CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return; }
- // Error code consistent with the old parser (from ca. 2001),
- // rather than U_PARSE_ERROR;
- errorCode = U_INVALID_FORMAT_ERROR;
- errorReason = reason;
- if(parseError != NULL) { setErrorContext(); }
-}
-
-void
-CollationRuleParser::setErrorContext() {
- if(parseError == NULL) { return; }
-
- // Note: This relies on the calling code maintaining the ruleIndex
- // at a position that is useful for debugging.
- // For example, at the beginning of a reset or relation etc.
- parseError->offset = ruleIndex;
- parseError->line = 0; // We are not counting line numbers.
-
- // before ruleIndex
- int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
- if(start < 0) {
- start = 0;
- } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
- ++start;
- }
- int32_t length = ruleIndex - start;
- rules->extract(start, length, parseError->preContext);
- parseError->preContext[length] = 0;
-
- // starting from ruleIndex
- length = rules->length() - ruleIndex;
- if(length >= U_PARSE_CONTEXT_LEN) {
- length = U_PARSE_CONTEXT_LEN - 1;
- if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
- --length;
- }
- }
- rules->extract(ruleIndex, length, parseError->postContext);
- parseError->postContext[length] = 0;
-}
-
-UBool
-CollationRuleParser::isSyntaxChar(UChar32 c) {
- return 0x21 <= c && c <= 0x7e &&
- (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
- (0x5b <= c && c <= 0x60) || (0x7b <= c));
-}
-
-int32_t
-CollationRuleParser::skipWhiteSpace(int32_t i) const {
- while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
- ++i;
- }
- return i;
-}
-
-U_NAMESPACE_END
-
-#endif // !UCONFIG_NO_COLLATION
+            }
+            // @collation=type, or length=0 if not specified
+            char collationType[ULOC_KEYWORDS_CAPACITY];
+            length = uloc_getKeywordValue(localeID, "collation",
+                                          collationType, ULOC_KEYWORDS_CAPACITY,
+                                          &errorCode);
+            if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
+                errorCode = U_ZERO_ERROR;
+                setParseError("expected language tag in [import langTag]", errorCode);
+                return;
+            }
+            if(importer == NULL) {
+                setParseError("[import langTag] is not supported", errorCode);
+            } else {
+                UnicodeString importedRules;
+                importer->getRules(baseID, length > 0 ? collationType : "standard",
+                                   importedRules, errorReason, errorCode);
+                if(U_FAILURE(errorCode)) {
+                    if(errorReason == NULL) {
+                        errorReason = "[import langTag] failed";
+                    }
+                    setErrorContext();
+                    return;
+                }
+                // Parse the imported rules in place of the import statement,
+                // then switch back to the outer rule string.
+                const UnicodeString *outerRules = rules;
+                int32_t outerRuleIndex = ruleIndex;
+                parse(importedRules, errorCode);
+                if(U_FAILURE(errorCode)) {
+                    if(parseError != NULL) {
+                        // Report the position of the [import] in the outer rules.
+                        parseError->offset = outerRuleIndex;
+                    }
+                }
+                rules = outerRules;
+                ruleIndex = j;
+            }
+            return;
+        }
+    } else if(rules->charAt(j) == 0x5b) {  // words end with [
+        UnicodeSet set;
+        j = parseUnicodeSet(j, set, errorCode);
+        if(U_FAILURE(errorCode)) { return; }
+        if(raw == UNICODE_STRING_SIMPLE("optimize")) {
+            sink->optimize(set, errorReason, errorCode);
+            if(U_FAILURE(errorCode)) { setErrorContext(); }
+            ruleIndex = j;
+            return;
+        } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
+            sink->suppressContractions(set, errorReason, errorCode);
+            if(U_FAILURE(errorCode)) { setErrorContext(); }
+            ruleIndex = j;
+            return;
+        }
+    }
+    // Unknown keyword, unsupported value, or missing ']' / '['.
+    setParseError("not a valid setting/option", errorCode);
+}
+
+// Applies a "reorder" setting. raw holds the keyword "reorder", optionally
+// followed by space-separated script or reorder-code names.
+// With no names the reordering is reset; otherwise every name is translated
+// with getReorderCode() and the resulting list is passed to the settings.
+void
+CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return; }
+    const int32_t keywordLength = 7;  // length of "reorder"
+    const int32_t rawLength = raw.length();
+    if(rawLength == keywordLength) {
+        // empty [reorder] with no codes
+        settings->resetReordering();
+        return;
+    }
+    // Collect the codes in [reorder aa bb cc].
+    UVector32 codes(errorCode);
+    if(U_FAILURE(errorCode)) { return; }
+    CharString name;
+    int32_t wordLimit = keywordLength;
+    // Each iteration starts with pos at the space that precedes the next word.
+    for(int32_t pos = keywordLength; pos < rawLength; pos = wordLimit) {
+        const int32_t wordStart = pos + 1;
+        wordLimit = raw.indexOf((UChar)0x20, wordStart);
+        if(wordLimit < 0) { wordLimit = rawLength; }
+        name.clear();
+        name.appendInvariantChars(raw.tempSubStringBetween(wordStart, wordLimit), errorCode);
+        if(U_FAILURE(errorCode)) { return; }
+        const int32_t reorderCode = getReorderCode(name.data());
+        if(reorderCode < 0) {
+            setParseError("unknown script or reorder code", errorCode);
+            return;
+        }
+        codes.addElement(reorderCode, errorCode);
+        if(U_FAILURE(errorCode)) { return; }
+    }
+    settings->setReordering(*baseData, codes.getBuffer(), codes.size(), errorCode);
+}
+
+// Names of the special reorder groups.
+// getReorderCode() maps the entry at index k to UCOL_REORDER_CODE_FIRST + k,
+// so the order of the entries is significant.
+static const char *const gSpecialReorderCodes[] = {
+    "space",
+    "punct",
+    "symbol",
+    "currency",
+    "digit"
+};
+
+// Translates a reorder word into its numeric code, or -1 if it is unknown.
+// The special group names are tried first (compared with uprv_stricmp()),
+// then script property value names, and finally the word "others".
+int32_t
+CollationRuleParser::getReorderCode(const char *word) {
+    const int32_t specialCount = UPRV_LENGTHOF(gSpecialReorderCodes);
+    int32_t k = 0;
+    while(k < specialCount && uprv_stricmp(word, gSpecialReorderCodes[k]) != 0) {
+        ++k;
+    }
+    if(k < specialCount) {
+        return UCOL_REORDER_CODE_FIRST + k;
+    }
+    const int32_t scriptCode = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
+    if(scriptCode >= 0) {
+        return scriptCode;
+    }
+    // UCOL_REORDER_CODE_OTHERS is the same as Zzzz = USCRIPT_UNKNOWN.
+    return uprv_stricmp(word, "others") == 0 ? UCOL_REORDER_CODE_OTHERS : -1;
+}
+
+// Maps "on" to UCOL_ON and "off" to UCOL_OFF.
+// Any other string yields UCOL_DEFAULT, which callers treat as "not valid".
+UColAttributeValue
+CollationRuleParser::getOnOffValue(const UnicodeString &s) {
+    if(s == UNICODE_STRING_SIMPLE("on")) { return UCOL_ON; }
+    if(s == UNICODE_STRING_SIMPLE("off")) { return UCOL_OFF; }
+    return UCOL_DEFAULT;
+}
+
+// Parses a UnicodeSet pattern that starts at index i, followed by optional
+// white space and the ']' that terminates the enclosing option.
+// Returns the index after that ']' on success.
+// On failure a parse error is set and the index reached so far is returned.
+int32_t
+CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
+    // Find the end of the pattern by balancing [brackets].
+    int32_t depth = 0;
+    int32_t pos = i;
+    for(;;) {
+        if(pos == rules->length()) {
+            setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
+            return pos;
+        }
+        const UChar ch = rules->charAt(pos++);
+        if(ch == 0x5d) {  // ']'
+            --depth;
+            if(depth == 0) { break; }
+        } else if(ch == 0x5b) {  // '['
+            ++depth;
+        }
+    }
+    set.applyPattern(rules->tempSubStringBetween(i, pos), errorCode);
+    if(U_FAILURE(errorCode)) {
+        // Replace the pattern error with the parser's own error code and reason.
+        errorCode = U_ZERO_ERROR;
+        setParseError("not a valid UnicodeSet pattern", errorCode);
+        return pos;
+    }
+    pos = skipWhiteSpace(pos);
+    if(pos < rules->length() && rules->charAt(pos) == 0x5d) {
+        return pos + 1;
+    }
+    setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
+    return pos;
+}
+
+// Reads words starting at index i into raw, after skipping leading white space.
+// Each run of white space between words becomes a single U+0020, and a
+// trailing space is removed.
+// Reading stops at the first syntax character other than '-' and '_';
+// the index of that character is returned.
+// Returns 0 if the end of the rules is reached first.
+int32_t
+CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
+    static const UChar space = 0x20;
+    raw.remove();
+    i = skipWhiteSpace(i);
+    while(i < rules->length()) {
+        const UChar ch = rules->charAt(i);
+        const bool isWordPunctuation = ch == 0x2d || ch == 0x5f;  // - or _
+        if(isSyntaxChar(ch) && !isWordPunctuation) {
+            if(!raw.isEmpty() && raw.endsWith(&space, 1)) {
+                // remove trailing space
+                raw.truncate(raw.length() - 1);
+            }
+            return i;
+        }
+        if(PatternProps::isWhiteSpace(ch)) {
+            raw.append(space);
+            i = skipWhiteSpace(i + 1);
+        } else {
+            raw.append(ch);
+            ++i;
+        }
+    }
+    return 0;
+}
+
+// Skips a comment: returns the index just past the next line terminator
+// at or after index i, or the end of the rules if there is none.
+int32_t
+CollationRuleParser::skipComment(int32_t i) const {
+    while(i < rules->length()) {
+        switch(rules->charAt(i++)) {
+        // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
+        // NLF (new line function) = CR or LF or CR+LF or NEL.
+        // No need to collect all of CR+LF because a following LF will be ignored anyway.
+        case 0xa:     // LF
+        case 0xc:     // FF
+        case 0xd:     // CR
+        case 0x85:    // NEL
+        case 0x2028:  // LS
+        case 0x2029:  // PS
+            return i;
+        default:
+            break;
+        }
+    }
+    return i;
+}
+
+void
+CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return; }
+ // Error code consistent with the old parser (from ca. 2001),
+ // rather than U_PARSE_ERROR;
+ errorCode = U_INVALID_FORMAT_ERROR;
+ errorReason = reason;
+ if(parseError != NULL) { setErrorContext(); }
+}
+
+// Fills *parseError (if there is one) with the offset ruleIndex and with the
+// rule text before and after ruleIndex, each limited to
+// U_PARSE_CONTEXT_LEN - 1 code units plus a terminating NUL.
+void
+CollationRuleParser::setErrorContext() {
+    if(parseError == NULL) { return; }
+
+    // Note: This relies on the calling code maintaining the ruleIndex
+    // at a position that is useful for debugging.
+    // For example, at the beginning of a reset or relation etc.
+    parseError->offset = ruleIndex;
+    parseError->line = 0;  // We are not counting line numbers.
+
+    const int32_t maxContext = U_PARSE_CONTEXT_LEN - 1;
+
+    // Text before ruleIndex; do not start in the middle of a surrogate pair.
+    int32_t preStart = ruleIndex - maxContext;
+    if(preStart <= 0) {
+        preStart = 0;
+    } else if(U16_IS_TRAIL(rules->charAt(preStart))) {
+        ++preStart;
+    }
+    const int32_t preLength = ruleIndex - preStart;
+    rules->extract(preStart, preLength, parseError->preContext);
+    parseError->preContext[preLength] = 0;
+
+    // Text starting from ruleIndex; do not end in the middle of a surrogate pair.
+    int32_t postLength = rules->length() - ruleIndex;
+    if(postLength > maxContext) {
+        postLength = maxContext;
+        if(U16_IS_LEAD(rules->charAt(ruleIndex + postLength - 1))) {
+            --postLength;
+        }
+    }
+    rules->extract(ruleIndex, postLength, parseError->postContext);
+    parseError->postContext[postLength] = 0;
+}
+
+// Returns true for the printable ASCII characters U+0021..U+007E
+// that are neither digits nor letters.
+UBool
+CollationRuleParser::isSyntaxChar(UChar32 c) {
+    const bool printableAscii = 0x21 <= c && c <= 0x7e;
+    const bool digit = 0x30 <= c && c <= 0x39;
+    const bool upperCase = 0x41 <= c && c <= 0x5a;
+    const bool lowerCase = 0x61 <= c && c <= 0x7a;
+    return printableAscii && !(digit || upperCase || lowerCase);
+}
+
+// Returns the index of the first character at or after index i that is not
+// Pattern_White_Space, or the length of the rules if there is none.
+int32_t
+CollationRuleParser::skipWhiteSpace(int32_t i) const {
+    for(; i < rules->length(); ++i) {
+        if(!PatternProps::isWhiteSpace(rules->charAt(i))) { break; }
+    }
+    return i;
+}
+
+U_NAMESPACE_END
+
+#endif // !UCONFIG_NO_COLLATION