aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/poco/Foundation/include/Poco/Unicode.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/poco/Foundation/include/Poco/Unicode.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/poco/Foundation/include/Poco/Unicode.h')
-rw-r--r--contrib/libs/poco/Foundation/include/Poco/Unicode.h327
1 files changed, 327 insertions, 0 deletions
diff --git a/contrib/libs/poco/Foundation/include/Poco/Unicode.h b/contrib/libs/poco/Foundation/include/Poco/Unicode.h
new file mode 100644
index 0000000000..b6d027685a
--- /dev/null
+++ b/contrib/libs/poco/Foundation/include/Poco/Unicode.h
@@ -0,0 +1,327 @@
+//
+// Unicode.h
+//
+// Library: Foundation
+// Package: Text
+// Module: Unicode
+//
+// Definition of the Unicode class.
+//
+// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
+// and Contributors.
+//
+// SPDX-License-Identifier: BSL-1.0
+//
+
+
+#ifndef Foundation_Unicode_INCLUDED
+#define Foundation_Unicode_INCLUDED
+
+
+#include "Poco/Foundation.h"
+
+
+namespace Poco {
+
+
+class Foundation_API Unicode
+ /// This class contains enumerations and static
+ /// utility functions for dealing with Unicode characters
+ /// and their properties.
+ ///
+ /// For more information on Unicode, see <http://www.unicode.org>.
+ ///
+ /// The implementation is based on the Unicode support
+ /// functions in PCRE.
+{
+public:
+ // Implementation note: the following definitions must be kept
+ // in sync with those from ucp.h (PCRE).
+ enum CharacterCategory
+ /// Unicode character categories.
+ {
+ UCP_OTHER,
+ UCP_LETTER,
+ UCP_MARK,
+ UCP_NUMBER,
+ UCP_PUNCTUATION,
+ UCP_SYMBOL,
+ UCP_SEPARATOR
+ };
+
+ enum CharacterType
+ /// Unicode character types.
+ {
+ UCP_CONTROL,
+ UCP_FORMAT,
+ UCP_UNASSIGNED,
+ UCP_PRIVATE_USE,
+ UCP_SURROGATE,
+ UCP_LOWER_CASE_LETTER,
+ UCP_MODIFIER_LETTER,
+ UCP_OTHER_LETTER,
+ UCP_TITLE_CASE_LETTER,
+ UCP_UPPER_CASE_LETTER,
+ UCP_SPACING_MARK,
+ UCP_ENCLOSING_MARK,
+ UCP_NON_SPACING_MARK,
+ UCP_DECIMAL_NUMBER,
+ UCP_LETTER_NUMBER,
+ UCP_OTHER_NUMBER,
+ UCP_CONNECTOR_PUNCTUATION,
+ UCP_DASH_PUNCTUATION,
+ UCP_CLOSE_PUNCTUATION,
+ UCP_FINAL_PUNCTUATION,
+ UCP_INITIAL_PUNCTUATION,
+ UCP_OTHER_PUNCTUATION,
+ UCP_OPEN_PUNCTUATION,
+ UCP_CURRENCY_SYMBOL,
+ UCP_MODIFIER_SYMBOL,
+ UCP_MATHEMATICAL_SYMBOL,
+ UCP_OTHER_SYMBOL,
+ UCP_LINE_SEPARATOR,
+ UCP_PARAGRAPH_SEPARATOR,
+ UCP_SPACE_SEPARATOR
+ };
+
+ enum Script
+ /// Unicode 7.0 script identifiers.
+ {
+ UCP_ARABIC,
+ UCP_ARMENIAN,
+ UCP_BENGALI,
+ UCP_BOPOMOFO,
+ UCP_BRAILLE,
+ UCP_BUGINESE,
+ UCP_BUHID,
+ UCP_CANADIAN_ABORIGINAL,
+ UCP_CHEROKEE,
+ UCP_COMMON,
+ UCP_COPTIC,
+ UCP_CYPRIOT,
+ UCP_CYRILLIC,
+ UCP_DESERET,
+ UCP_DEVANAGARI,
+ UCP_ETHIOPIC,
+ UCP_GEORGIAN,
+ UCP_GLAGOLITIC,
+ UCP_GOTHIC,
+ UCP_GREEK,
+ UCP_GUJARATI,
+ UCP_GURMUKHI,
+ UCP_HAN,
+ UCP_HANGUL,
+ UCP_HANUNOO,
+ UCP_HEBREW,
+ UCP_HIRAGANA,
+ UCP_INHERITED,
+ UCP_KANNADA,
+ UCP_KATAKANA,
+ UCP_KHAROSHTHI,
+ UCP_KHMER,
+ UCP_LAO,
+ UCP_LATIN,
+ UCP_LIMBU,
+ UCP_LINEAR_B,
+ UCP_MALAYALAM,
+ UCP_MONGOLIAN,
+ UCP_MYANMAR,
+ UCP_NEW_TAI_LUE,
+ UCP_OGHAM,
+ UCP_OLD_ITALIC,
+ UCP_OLD_PERSIAN,
+ UCP_ORIYA,
+ UCP_OSMANYA,
+ UCP_RUNIC,
+ UCP_SHAVIAN,
+ UCP_SINHALA,
+ UCP_SYLOTI_NAGRI,
+ UCP_SYRIAC,
+ UCP_TAGALOG,
+ UCP_TAGBANWA,
+ UCP_TAI_LE,
+ UCP_TAMIL,
+ UCP_TELUGU,
+ UCP_THAANA,
+ UCP_THAI,
+ UCP_TIBETAN,
+ UCP_TIFINAGH,
+ UCP_UGARITIC,
+ UCP_YI,
+ // Unicode 5.0
+ UCP_BALINESE,
+ UCP_CUNEIFORM,
+ UCP_NKO,
+ UCP_PHAGS_PA,
+ UCP_PHOENICIAN,
+ // Unicode 5.1
+ UCP_CARIAN,
+ UCP_CHAM,
+ UCP_KAYAH_LI,
+ UCP_LEPCHA,
+ UCP_LYCIAN,
+ UCP_LYDIAN,
+ UCP_OL_CHIKI,
+ UCP_REJANG,
+ UCP_SAURASHTRA,
+ UCP_SUNDANESE,
+ UCP_VAI,
+ // Unicode 5.2
+ UCP_AVESTAN,
+ UCP_BAMUM,
+ UCP_EGYPTIAN_HIEROGLYPHS,
+ UCP_IMPERIAL_ARAMAIC,
+ UCP_INSCRIPTIONAL_PAHLAVI,
+ UCP_INSCRIPTIONAL_PARTHIAN,
+ UCP_JAVANESE,
+ UCP_KAITHI,
+ UCP_LISU,
+ UCP_MEETEI_MAYEK,
+ UCP_OLD_SOUTH_ARABIAN,
+ UCP_OLD_TURKIC,
+ UCP_SAMARITAN,
+ UCP_TAI_THAM,
+ UCP_TAI_VIET,
+ // Unicode 6.0
+ UCP_BATAK,
+ UCP_BRAHMI,
+ UCP_MANDAIC,
+ // Unicode 6.1
+ UCP_CHAKMA,
+ UCP_MEROITIC_CURSIVE,
+ UCP_MEROITIC_HIEROGLYPHS,
+ UCP_MIAO,
+ UCP_SHARADA,
+ UCP_SORA_SOMPENG,
+ UCP_TAKRI,
+ // Unicode 7.0
+ UCP_BASSA_VAH,
+ UCP_CAUCASIAN_ALBANIAN,
+ UCP_DUPLOYAN,
+ UCP_ELBASAN,
+ UCP_GRANTHA,
+ UCP_KHOJKI,
+ UCP_KHUDAWADI,
+ UCP_LINEAR_A,
+ UCP_MAHAJANI,
+ UCP_MANICHAEAN,
+ UCP_MENDE_KIKAKUI,
+ UCP_MODI,
+ UCP_MRO,
+ UCP_NABATAEAN,
+ UCP_OLD_NORTH_ARABIAN,
+ UCP_OLD_PERMIC,
+ UCP_PAHAWH_HMONG,
+ UCP_PALMYRENE,
+ UCP_PSALTER_PAHLAVI,
+ UCP_PAU_CIN_HAU,
+ UCP_SIDDHAM,
+ UCP_TIRHUTA,
+ UCP_WARANG_CITI
+ };
+
+ enum
+ {
+ UCP_MAX_CODEPOINT = 0x10FFFF
+ };
+
+ struct CharacterProperties
+ /// This structure holds the character properties
+ /// of an Unicode character.
+ {
+ CharacterCategory category;
+ CharacterType type;
+ Script script;
+ };
+
+ static void properties(int ch, CharacterProperties& props);
+ /// Return the Unicode character properties for the
+ /// character with the given Unicode value.
+
+ static bool isSpace(int ch);
+ /// Returns true iff the given character is a separator.
+
+ static bool isDigit(int ch);
+ /// Returns true iff the given character is a numeric character.
+
+ static bool isPunct(int ch);
+ /// Returns true iff the given character is a punctuation character.
+
+ static bool isAlpha(int ch);
+ /// Returns true iff the given character is a letter.
+
+ static bool isLower(int ch);
+ /// Returns true iff the given character is a lowercase
+ /// character.
+
+ static bool isUpper(int ch);
+ /// Returns true iff the given character is an uppercase
+ /// character.
+
+ static int toLower(int ch);
+ /// If the given character is an uppercase character,
+ /// return its lowercase counterpart, otherwise return
+ /// the character.
+
+ static int toUpper(int ch);
+ /// If the given character is a lowercase character,
+ /// return its uppercase counterpart, otherwise return
+ /// the character.
+};
+
+
+//
+// inlines
+//
+inline bool Unicode::isSpace(int ch)
+{
+ CharacterProperties props;
+ properties(ch, props);
+ return props.category == UCP_SEPARATOR;
+}
+
+
+inline bool Unicode::isDigit(int ch)
+{
+ CharacterProperties props;
+ properties(ch, props);
+ return props.category == UCP_NUMBER;
+}
+
+
+inline bool Unicode::isPunct(int ch)
+{
+ CharacterProperties props;
+ properties(ch, props);
+ return props.category == UCP_PUNCTUATION;
+}
+
+
+inline bool Unicode::isAlpha(int ch)
+{
+ CharacterProperties props;
+ properties(ch, props);
+ return props.category == UCP_LETTER;
+}
+
+
+inline bool Unicode::isLower(int ch)
+{
+ CharacterProperties props;
+ properties(ch, props);
+ return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
+}
+
+
+inline bool Unicode::isUpper(int ch)
+{
+ CharacterProperties props;
+ properties(ch, props);
+ return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
+}
+
+
+} // namespace Poco
+
+
+#endif // Foundation_Unicode_INCLUDED