diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/poco/Foundation/include/Poco/Unicode.h | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/poco/Foundation/include/Poco/Unicode.h')
-rw-r--r-- | contrib/libs/poco/Foundation/include/Poco/Unicode.h | 327 |
1 files changed, 327 insertions, 0 deletions
diff --git a/contrib/libs/poco/Foundation/include/Poco/Unicode.h b/contrib/libs/poco/Foundation/include/Poco/Unicode.h new file mode 100644 index 0000000000..b6d027685a --- /dev/null +++ b/contrib/libs/poco/Foundation/include/Poco/Unicode.h @@ -0,0 +1,327 @@ +// +// Unicode.h +// +// Library: Foundation +// Package: Text +// Module: Unicode +// +// Definition of the Unicode class. +// +// Copyright (c) 2007, Applied Informatics Software Engineering GmbH. +// and Contributors. +// +// SPDX-License-Identifier: BSL-1.0 +// + + +#ifndef Foundation_Unicode_INCLUDED +#define Foundation_Unicode_INCLUDED + + +#include "Poco/Foundation.h" + + +namespace Poco { + + +class Foundation_API Unicode + /// This class contains enumerations and static + /// utility functions for dealing with Unicode characters + /// and their properties. + /// + /// For more information on Unicode, see <http://www.unicode.org>. + /// + /// The implementation is based on the Unicode support + /// functions in PCRE. +{ +public: + // Implementation note: the following definitions must be kept + // in sync with those from ucp.h (PCRE). + enum CharacterCategory + /// Unicode character categories. + { + UCP_OTHER, + UCP_LETTER, + UCP_MARK, + UCP_NUMBER, + UCP_PUNCTUATION, + UCP_SYMBOL, + UCP_SEPARATOR + }; + + enum CharacterType + /// Unicode character types. + { + UCP_CONTROL, + UCP_FORMAT, + UCP_UNASSIGNED, + UCP_PRIVATE_USE, + UCP_SURROGATE, + UCP_LOWER_CASE_LETTER, + UCP_MODIFIER_LETTER, + UCP_OTHER_LETTER, + UCP_TITLE_CASE_LETTER, + UCP_UPPER_CASE_LETTER, + UCP_SPACING_MARK, + UCP_ENCLOSING_MARK, + UCP_NON_SPACING_MARK, + UCP_DECIMAL_NUMBER, + UCP_LETTER_NUMBER, + UCP_OTHER_NUMBER, + UCP_CONNECTOR_PUNCTUATION, + UCP_DASH_PUNCTUATION, + UCP_CLOSE_PUNCTUATION, + UCP_FINAL_PUNCTUATION, + UCP_INITIAL_PUNCTUATION, + UCP_OTHER_PUNCTUATION, + UCP_OPEN_PUNCTUATION, + UCP_CURRENCY_SYMBOL, + UCP_MODIFIER_SYMBOL, + UCP_MATHEMATICAL_SYMBOL, + UCP_OTHER_SYMBOL, + UCP_LINE_SEPARATOR, + UCP_PARAGRAPH_SEPARATOR, + UCP_SPACE_SEPARATOR + }; + + enum Script + /// Unicode 7.0 script identifiers. + { + UCP_ARABIC, + UCP_ARMENIAN, + UCP_BENGALI, + UCP_BOPOMOFO, + UCP_BRAILLE, + UCP_BUGINESE, + UCP_BUHID, + UCP_CANADIAN_ABORIGINAL, + UCP_CHEROKEE, + UCP_COMMON, + UCP_COPTIC, + UCP_CYPRIOT, + UCP_CYRILLIC, + UCP_DESERET, + UCP_DEVANAGARI, + UCP_ETHIOPIC, + UCP_GEORGIAN, + UCP_GLAGOLITIC, + UCP_GOTHIC, + UCP_GREEK, + UCP_GUJARATI, + UCP_GURMUKHI, + UCP_HAN, + UCP_HANGUL, + UCP_HANUNOO, + UCP_HEBREW, + UCP_HIRAGANA, + UCP_INHERITED, + UCP_KANNADA, + UCP_KATAKANA, + UCP_KHAROSHTHI, + UCP_KHMER, + UCP_LAO, + UCP_LATIN, + UCP_LIMBU, + UCP_LINEAR_B, + UCP_MALAYALAM, + UCP_MONGOLIAN, + UCP_MYANMAR, + UCP_NEW_TAI_LUE, + UCP_OGHAM, + UCP_OLD_ITALIC, + UCP_OLD_PERSIAN, + UCP_ORIYA, + UCP_OSMANYA, + UCP_RUNIC, + UCP_SHAVIAN, + UCP_SINHALA, + UCP_SYLOTI_NAGRI, + UCP_SYRIAC, + UCP_TAGALOG, + UCP_TAGBANWA, + UCP_TAI_LE, + UCP_TAMIL, + UCP_TELUGU, + UCP_THAANA, + UCP_THAI, + UCP_TIBETAN, + UCP_TIFINAGH, + UCP_UGARITIC, + UCP_YI, + // Unicode 5.0 + UCP_BALINESE, + UCP_CUNEIFORM, + UCP_NKO, + UCP_PHAGS_PA, + UCP_PHOENICIAN, + // Unicode 5.1 + UCP_CARIAN, + UCP_CHAM, + UCP_KAYAH_LI, + UCP_LEPCHA, + UCP_LYCIAN, + UCP_LYDIAN, + UCP_OL_CHIKI, + UCP_REJANG, + UCP_SAURASHTRA, + UCP_SUNDANESE, + UCP_VAI, + // Unicode 5.2 + UCP_AVESTAN, + UCP_BAMUM, + UCP_EGYPTIAN_HIEROGLYPHS, + UCP_IMPERIAL_ARAMAIC, + UCP_INSCRIPTIONAL_PAHLAVI, + UCP_INSCRIPTIONAL_PARTHIAN, + UCP_JAVANESE, + UCP_KAITHI, + UCP_LISU, + UCP_MEETEI_MAYEK, + UCP_OLD_SOUTH_ARABIAN, + UCP_OLD_TURKIC, + UCP_SAMARITAN, + UCP_TAI_THAM, + UCP_TAI_VIET, + // Unicode 6.0 + UCP_BATAK, + UCP_BRAHMI, + UCP_MANDAIC, + // Unicode 6.1 + UCP_CHAKMA, + UCP_MEROITIC_CURSIVE, + UCP_MEROITIC_HIEROGLYPHS, + UCP_MIAO, + UCP_SHARADA, + UCP_SORA_SOMPENG, + UCP_TAKRI, + // Unicode 7.0 + UCP_BASSA_VAH, + UCP_CAUCASIAN_ALBANIAN, + UCP_DUPLOYAN, + UCP_ELBASAN, + UCP_GRANTHA, + UCP_KHOJKI, + UCP_KHUDAWADI, + UCP_LINEAR_A, + UCP_MAHAJANI, + UCP_MANICHAEAN, + UCP_MENDE_KIKAKUI, + UCP_MODI, + UCP_MRO, + UCP_NABATAEAN, + UCP_OLD_NORTH_ARABIAN, + UCP_OLD_PERMIC, + UCP_PAHAWH_HMONG, + UCP_PALMYRENE, + UCP_PSALTER_PAHLAVI, + UCP_PAU_CIN_HAU, + UCP_SIDDHAM, + UCP_TIRHUTA, + UCP_WARANG_CITI + }; + + enum + { + UCP_MAX_CODEPOINT = 0x10FFFF + }; + + struct CharacterProperties + /// This structure holds the character properties + /// of an Unicode character. + { + CharacterCategory category; + CharacterType type; + Script script; + }; + + static void properties(int ch, CharacterProperties& props); + /// Return the Unicode character properties for the + /// character with the given Unicode value. + + static bool isSpace(int ch); + /// Returns true iff the given character is a separator. + + static bool isDigit(int ch); + /// Returns true iff the given character is a numeric character. + + static bool isPunct(int ch); + /// Returns true iff the given character is a punctuation character. + + static bool isAlpha(int ch); + /// Returns true iff the given character is a letter. + + static bool isLower(int ch); + /// Returns true iff the given character is a lowercase + /// character. + + static bool isUpper(int ch); + /// Returns true iff the given character is an uppercase + /// character. + + static int toLower(int ch); + /// If the given character is an uppercase character, + /// return its lowercase counterpart, otherwise return + /// the character. + + static int toUpper(int ch); + /// If the given character is a lowercase character, + /// return its uppercase counterpart, otherwise return + /// the character. +}; + + +// +// inlines +// +inline bool Unicode::isSpace(int ch) +{ + CharacterProperties props; + properties(ch, props); + return props.category == UCP_SEPARATOR; +} + + +inline bool Unicode::isDigit(int ch) +{ + CharacterProperties props; + properties(ch, props); + return props.category == UCP_NUMBER; +} + + +inline bool Unicode::isPunct(int ch) +{ + CharacterProperties props; + properties(ch, props); + return props.category == UCP_PUNCTUATION; +} + + +inline bool Unicode::isAlpha(int ch) +{ + CharacterProperties props; + properties(ch, props); + return props.category == UCP_LETTER; +} + + +inline bool Unicode::isLower(int ch) +{ + CharacterProperties props; + properties(ch, props); + return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER; +} + + +inline bool Unicode::isUpper(int ch) +{ + CharacterProperties props; + properties(ch, props); + return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER; +} + + +} // namespace Poco + + +#endif // Foundation_Unicode_INCLUDED |