diff options
author | neksard <neksard@yandex-team.ru> | 2022-02-10 16:45:23 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:23 +0300 |
commit | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (patch) | |
tree | 83bf5c8c8047c42d8475e6095df90ccdc3d1b57f /contrib/libs/icu/i18n/rbt_pars.h | |
parent | d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (diff) | |
download | ydb-8f7cf138264e0caa318144bf8a2c950e0b0a8593.tar.gz |
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/i18n/rbt_pars.h')
-rw-r--r-- | contrib/libs/icu/i18n/rbt_pars.h | 712 |
1 files changed, 356 insertions, 356 deletions
diff --git a/contrib/libs/icu/i18n/rbt_pars.h b/contrib/libs/icu/i18n/rbt_pars.h index 61ce9727e0..214152077d 100644 --- a/contrib/libs/icu/i18n/rbt_pars.h +++ b/contrib/libs/icu/i18n/rbt_pars.h @@ -1,357 +1,357 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -********************************************************************** -* Copyright (C) 1999-2011, International Business Machines Corporation -* and others. All Rights Reserved. -********************************************************************** -* Date Name Description -* 11/17/99 aliu Creation. -********************************************************************** -*/ -#ifndef RBT_PARS_H -#define RBT_PARS_H - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_TRANSLITERATION -#ifdef __cplusplus - -#include "unicode/uobject.h" -#include "unicode/parseerr.h" -#include "unicode/unorm.h" -#include "rbt.h" -#include "hash.h" -#include "uvector.h" - -U_NAMESPACE_BEGIN - -class TransliterationRuleData; -class UnicodeFunctor; -class ParseData; -class RuleHalf; -class ParsePosition; -class StringMatcher; - -class TransliteratorParser : public UMemory { - - public: - - /** - * A Vector of TransliterationRuleData objects, one for each discrete group - * of rules in the rule set - */ - UVector dataVector; - - /** - * PUBLIC data member. - * A Vector of UnicodeStrings containing all of the ID blocks in the rule set - */ - UVector idBlockVector; - - /** - * PUBLIC data member containing the parsed compound filter, if any. - */ - UnicodeSet* compoundFilter; - - private: - - /** - * The current data object for which we are parsing rules - */ - TransliterationRuleData* curData; - - UTransDirection direction; - - /** - * Parse error information. - */ - UParseError parseError; - - /** - * Temporary symbol table used during parsing. - */ - ParseData* parseData; - - /** - * Temporary vector of matcher variables. 
When parsing is complete, this - * is copied into the array data.variables. As with data.variables, - * element 0 corresponds to character data.variablesBase. - */ - UVector variablesVector; - - /** - * Temporary table of variable names. When parsing is complete, this is - * copied into data.variableNames. - */ - Hashtable variableNames; - - /** - * String of standins for segments. Used during the parsing of a single - * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds - * to StringMatcher object segmentObjects.elementAt(0), etc. - */ - UnicodeString segmentStandins; - - /** - * Vector of StringMatcher objects for segments. Used during the - * parsing of a single rule. - * segmentStandins.charAt(0) is the standin for "$1" and corresponds - * to StringMatcher object segmentObjects.elementAt(0), etc. - */ - UVector segmentObjects; - - /** - * The next available stand-in for variables. This starts at some point in - * the private use area (discovered dynamically) and increments up toward - * <code>variableLimit</code>. At any point during parsing, available - * variables are <code>variableNext..variableLimit-1</code>. - */ - UChar variableNext; - - /** - * The last available stand-in for variables. This is discovered - * dynamically. At any point during parsing, available variables are - * <code>variableNext..variableLimit-1</code>. - */ - UChar variableLimit; - - /** - * When we encounter an undefined variable, we do not immediately signal - * an error, in case we are defining this variable, e.g., "$a = [a-z];". - * Instead, we save the name of the undefined variable, and substitute - * in the placeholder char variableLimit - 1, and decrement - * variableLimit. - */ - UnicodeString undefinedVariableName; - - /** - * The stand-in character for the 'dot' set, represented by '.' in - * patterns. This is allocated the first time it is needed, and - * reused thereafter. - */ - UChar dotStandIn; - -public: - - /** - * Constructor. 
- */ - TransliteratorParser(UErrorCode &statusReturn); - - /** - * Destructor. - */ - ~TransliteratorParser(); - - /** - * Parse the given string as a sequence of rules, separated by newline - * characters ('\n'), and cause this object to implement those rules. Any - * previous rules are discarded. Typically this method is called exactly - * once after construction. - * - * Parse the given rules, in the given direction. After this call - * returns, query the public data members for results. The caller - * owns the 'data' and 'compoundFilter' data members after this - * call returns. - * @param rules rules, separated by ';' - * @param direction either FORWARD or REVERSE. - * @param pe Struct to recieve information on position - * of error if an error is encountered - * @param ec Output param set to success/failure code. - */ - void parse(const UnicodeString& rules, - UTransDirection direction, - UParseError& pe, - UErrorCode& ec); - - /** - * Return the compound filter parsed by parse(). Caller owns result. - * @return the compound filter parsed by parse(). - */ - UnicodeSet* orphanCompoundFilter(); - -private: - - /** - * Return a representation of this transliterator as source rules. - * @param rules Output param to receive the rules. - * @param direction either FORWARD or REVERSE. - */ - void parseRules(const UnicodeString& rules, - UTransDirection direction, - UErrorCode& status); - - /** - * MAIN PARSER. Parse the next rule in the given rule string, starting - * at pos. Return the index after the last character parsed. Do not - * parse characters at or after limit. - * - * Important: The character at pos must be a non-whitespace character - * that is not the comment character. - * - * This method handles quoting, escaping, and whitespace removal. It - * parses the end-of-rule character. It recognizes context and cursor - * indicators. Once it does a lexical breakdown of the rule at pos, it - * creates a rule object and adds it to our rule list. 
- * @param rules Output param to receive the rules. - * @param pos the starting position. - * @param limit pointer past the last character of the rule. - * @return the index after the last character parsed. - */ - int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); - - /** - * Set the variable range to [start, end] (inclusive). - * @param start the start value of the range. - * @param end the end value of the range. - */ - void setVariableRange(int32_t start, int32_t end, UErrorCode& status); - - /** - * Assert that the given character is NOT within the variable range. - * If it is, return FALSE. This is neccesary to ensure that the - * variable range does not overlap characters used in a rule. - * @param ch the given character. - * @return True, if the given character is NOT within the variable range. - */ - UBool checkVariableRange(UChar32 ch) const; - - /** - * Set the maximum backup to 'backup', in response to a pragma - * statement. - * @param backup the new value to be set. - */ - void pragmaMaximumBackup(int32_t backup); - - /** - * Begin normalizing all rules using the given mode, in response - * to a pragma statement. - * @param mode the given mode. - */ - void pragmaNormalizeRules(UNormalizationMode mode); - - /** - * Return true if the given rule looks like a pragma. - * @param pos offset to the first non-whitespace character - * of the rule. - * @param limit pointer past the last character of the rule. - * @return true if the given rule looks like a pragma. - */ - static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); - - /** - * Parse a pragma. This method assumes resemblesPragma() has - * already returned true. - * @param pos offset to the first non-whitespace character - * of the rule. - * @param limit pointer past the last character of the rule. - * @return the position index after the final ';' of the pragma, - * or -1 on failure. 
- */ - int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); - - /** - * Called by main parser upon syntax error. Search the rule string - * for the probable end of the rule. Of course, if the error is that - * the end of rule marker is missing, then the rule end will not be found. - * In any case the rule start will be correctly reported. - * @param parseErrorCode error code. - * @param msg error description. - * @param start position of first character of current rule. - * @return start position of first character of current rule. - */ - int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, - UErrorCode& status); - - /** - * Parse a UnicodeSet out, store it, and return the stand-in character - * used to represent it. - * - * @param rule the rule for UnicodeSet. - * @param pos the position in pattern at which to start parsing. - * @return the stand-in character used to represent it. - */ - UChar parseSet(const UnicodeString& rule, - ParsePosition& pos, - UErrorCode& status); - - /** - * Generate and return a stand-in for a new UnicodeFunctor. Store - * the matcher (adopt it). - * @param adopted the UnicodeFunctor to be adopted. - * @return a stand-in for a new UnicodeFunctor. - */ - UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); - - /** - * Return the standin for segment seg (1-based). - * @param seg the given segment. - * @return the standIn character for the given segment. - */ - UChar getSegmentStandin(int32_t seg, UErrorCode& status); - - /** - * Set the object for segment seg (1-based). - * @param seg the given segment. - * @param adopted the StringMatcher to be adopted. - */ - void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); - - /** - * Return the stand-in for the dot set. It is allocated the first - * time and reused thereafter. - * @return the stand-in for the dot set. 
- */ - UChar getDotStandIn(UErrorCode& status); - - /** - * Append the value of the given variable name to the given - * UnicodeString. - * @param name the variable name to be appended. - * @param buf the given UnicodeString to append to. - */ - void appendVariableDef(const UnicodeString& name, - UnicodeString& buf, - UErrorCode& status); - - /** - * Glue method to get around access restrictions in C++. - */ - /*static Transliterator* createBasicInstance(const UnicodeString& id, - const UnicodeString* canonID);*/ - - friend class RuleHalf; - - // Disallowed methods; no impl. - /** - * Copy constructor - */ - TransliteratorParser(const TransliteratorParser&); - - /** - * Assignment operator - */ - TransliteratorParser& operator=(const TransliteratorParser&); -}; - -U_NAMESPACE_END - -#endif /* #ifdef __cplusplus */ - -/** - * Strip/convert the following from the transliterator rules: - * comments - * newlines - * white space at the beginning and end of a line - * unescape \u notation - * - * The target must be equal in size as the source. - * @internal - */ -U_CAPI int32_t -utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); - -#endif /* #if !UCONFIG_NO_TRANSLITERATION */ - -#endif +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 1999-2011, International Business Machines Corporation +* and others. All Rights Reserved. +********************************************************************** +* Date Name Description +* 11/17/99 aliu Creation. 
+********************************************************************** +*/ +#ifndef RBT_PARS_H +#define RBT_PARS_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_TRANSLITERATION +#ifdef __cplusplus + +#include "unicode/uobject.h" +#include "unicode/parseerr.h" +#include "unicode/unorm.h" +#include "rbt.h" +#include "hash.h" +#include "uvector.h" + +U_NAMESPACE_BEGIN + +class TransliterationRuleData; +class UnicodeFunctor; +class ParseData; +class RuleHalf; +class ParsePosition; +class StringMatcher; + +class TransliteratorParser : public UMemory { + + public: + + /** + * A Vector of TransliterationRuleData objects, one for each discrete group + * of rules in the rule set + */ + UVector dataVector; + + /** + * PUBLIC data member. + * A Vector of UnicodeStrings containing all of the ID blocks in the rule set + */ + UVector idBlockVector; + + /** + * PUBLIC data member containing the parsed compound filter, if any. + */ + UnicodeSet* compoundFilter; + + private: + + /** + * The current data object for which we are parsing rules + */ + TransliterationRuleData* curData; + + UTransDirection direction; + + /** + * Parse error information. + */ + UParseError parseError; + + /** + * Temporary symbol table used during parsing. + */ + ParseData* parseData; + + /** + * Temporary vector of matcher variables. When parsing is complete, this + * is copied into the array data.variables. As with data.variables, + * element 0 corresponds to character data.variablesBase. + */ + UVector variablesVector; + + /** + * Temporary table of variable names. When parsing is complete, this is + * copied into data.variableNames. + */ + Hashtable variableNames; + + /** + * String of standins for segments. Used during the parsing of a single + * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds + * to StringMatcher object segmentObjects.elementAt(0), etc. + */ + UnicodeString segmentStandins; + + /** + * Vector of StringMatcher objects for segments. 
Used during the + * parsing of a single rule. + * segmentStandins.charAt(0) is the standin for "$1" and corresponds + * to StringMatcher object segmentObjects.elementAt(0), etc. + */ + UVector segmentObjects; + + /** + * The next available stand-in for variables. This starts at some point in + * the private use area (discovered dynamically) and increments up toward + * <code>variableLimit</code>. At any point during parsing, available + * variables are <code>variableNext..variableLimit-1</code>. + */ + UChar variableNext; + + /** + * The last available stand-in for variables. This is discovered + * dynamically. At any point during parsing, available variables are + * <code>variableNext..variableLimit-1</code>. + */ + UChar variableLimit; + + /** + * When we encounter an undefined variable, we do not immediately signal + * an error, in case we are defining this variable, e.g., "$a = [a-z];". + * Instead, we save the name of the undefined variable, and substitute + * in the placeholder char variableLimit - 1, and decrement + * variableLimit. + */ + UnicodeString undefinedVariableName; + + /** + * The stand-in character for the 'dot' set, represented by '.' in + * patterns. This is allocated the first time it is needed, and + * reused thereafter. + */ + UChar dotStandIn; + +public: + + /** + * Constructor. + */ + TransliteratorParser(UErrorCode &statusReturn); + + /** + * Destructor. + */ + ~TransliteratorParser(); + + /** + * Parse the given string as a sequence of rules, separated by newline + * characters ('\n'), and cause this object to implement those rules. Any + * previous rules are discarded. Typically this method is called exactly + * once after construction. + * + * Parse the given rules, in the given direction. After this call + * returns, query the public data members for results. The caller + * owns the 'data' and 'compoundFilter' data members after this + * call returns. 
+ * @param rules rules, separated by ';' + * @param direction either FORWARD or REVERSE. + * @param pe Struct to receive information on position + * of error if an error is encountered + * @param ec Output param set to success/failure code. + */ + void parse(const UnicodeString& rules, + UTransDirection direction, + UParseError& pe, + UErrorCode& ec); + + /** + * Return the compound filter parsed by parse(). Caller owns result. + * @return the compound filter parsed by parse(). + */ + UnicodeSet* orphanCompoundFilter(); + +private: + + /** + * Parse the given rules, in the given direction. + * @param rules the rules to parse. + * @param direction either FORWARD or REVERSE. + */ + void parseRules(const UnicodeString& rules, + UTransDirection direction, + UErrorCode& status); + + /** + * MAIN PARSER. Parse the next rule in the given rule string, starting + * at pos. Return the index after the last character parsed. Do not + * parse characters at or after limit. + * + * Important: The character at pos must be a non-whitespace character + * that is not the comment character. + * + * This method handles quoting, escaping, and whitespace removal. It + * parses the end-of-rule character. It recognizes context and cursor + * indicators. Once it does a lexical breakdown of the rule at pos, it + * creates a rule object and adds it to our rule list. + * @param rule the rule string to parse. + * @param pos the starting position. + * @param limit pointer past the last character of the rule. + * @return the index after the last character parsed. + */ + int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); + + /** + * Set the variable range to [start, end] (inclusive). + * @param start the start value of the range. + * @param end the end value of the range. 
+ */ + void setVariableRange(int32_t start, int32_t end, UErrorCode& status); + + /** + * Assert that the given character is NOT within the variable range. + * If it is, return FALSE. This is necessary to ensure that the + * variable range does not overlap characters used in a rule. + * @param ch the given character. + * @return True, if the given character is NOT within the variable range. + */ + UBool checkVariableRange(UChar32 ch) const; + + /** + * Set the maximum backup to 'backup', in response to a pragma + * statement. + * @param backup the new value to be set. + */ + void pragmaMaximumBackup(int32_t backup); + + /** + * Begin normalizing all rules using the given mode, in response + * to a pragma statement. + * @param mode the given mode. + */ + void pragmaNormalizeRules(UNormalizationMode mode); + + /** + * Return true if the given rule looks like a pragma. + * @param pos offset to the first non-whitespace character + * of the rule. + * @param limit pointer past the last character of the rule. + * @return true if the given rule looks like a pragma. + */ + static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); + + /** + * Parse a pragma. This method assumes resemblesPragma() has + * already returned true. + * @param pos offset to the first non-whitespace character + * of the rule. + * @param limit pointer past the last character of the rule. + * @return the position index after the final ';' of the pragma, + * or -1 on failure. + */ + int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); + + /** + * Called by main parser upon syntax error. Search the rule string + * for the probable end of the rule. Of course, if the error is that + * the end of rule marker is missing, then the rule end will not be found. + * In any case the rule start will be correctly reported. + * @param parseErrorCode error code. + * @param msg error description. 
+ * @param start position of first character of current rule. + * @return start position of first character of current rule. + */ + int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, + UErrorCode& status); + + /** + * Parse a UnicodeSet out, store it, and return the stand-in character + * used to represent it. + * + * @param rule the rule for UnicodeSet. + * @param pos the position in pattern at which to start parsing. + * @return the stand-in character used to represent it. + */ + UChar parseSet(const UnicodeString& rule, + ParsePosition& pos, + UErrorCode& status); + + /** + * Generate and return a stand-in for a new UnicodeFunctor. Store + * the matcher (adopt it). + * @param adopted the UnicodeFunctor to be adopted. + * @return a stand-in for a new UnicodeFunctor. + */ + UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); + + /** + * Return the standin for segment seg (1-based). + * @param seg the given segment. + * @return the standIn character for the given segment. + */ + UChar getSegmentStandin(int32_t seg, UErrorCode& status); + + /** + * Set the object for segment seg (1-based). + * @param seg the given segment. + * @param adopted the StringMatcher to be adopted. + */ + void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); + + /** + * Return the stand-in for the dot set. It is allocated the first + * time and reused thereafter. + * @return the stand-in for the dot set. + */ + UChar getDotStandIn(UErrorCode& status); + + /** + * Append the value of the given variable name to the given + * UnicodeString. + * @param name the variable name to be appended. + * @param buf the given UnicodeString to append to. + */ + void appendVariableDef(const UnicodeString& name, + UnicodeString& buf, + UErrorCode& status); + + /** + * Glue method to get around access restrictions in C++. 
+ */ + /*static Transliterator* createBasicInstance(const UnicodeString& id, + const UnicodeString* canonID);*/ + + friend class RuleHalf; + + // Disallowed methods; no impl. + /** + * Copy constructor + */ + TransliteratorParser(const TransliteratorParser&); + + /** + * Assignment operator + */ + TransliteratorParser& operator=(const TransliteratorParser&); +}; + +U_NAMESPACE_END + +#endif /* #ifdef __cplusplus */ + +/** + * Strip/convert the following from the transliterator rules: + * comments + * newlines + * white space at the beginning and end of a line + * unescape \u notation + * + * The target must be equal in size to the source. + * @internal + */ +U_CAPI int32_t +utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); + +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ + +#endif |