diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /util/charset/unidata.h | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'util/charset/unidata.h')
-rw-r--r-- | util/charset/unidata.h | 190 |
1 files changed, 95 insertions, 95 deletions
diff --git a/util/charset/unidata.h b/util/charset/unidata.h index 400d314186..422c83d8ce 100644 --- a/util/charset/unidata.h +++ b/util/charset/unidata.h @@ -1,71 +1,71 @@ #pragma once -#include "unicode_table.h" - -#include <util/system/defaults.h> // wchar32, ui64, ULL() - -enum WC_TYPE { // TODO move no NUnicode - Lu_UPPER = 1, // 'Ъ' - Ll_LOWER = 2, // 'ъ' - Lt_TITLE = 3, // 'Ъ' - Lm_EXTENDER = 4, // '-' - Lm_LETTER = 5, // 'ъ' - Lo_OTHER = 6, // '?' - Lo_IDEOGRAPH = 7, // '?' - Lo_KATAKANA = 8, // '?' - Lo_HIRAGANA = 9, // '?' - Lo_LEADING = 10, // '?' - Lo_VOWEL = 11, // '?' - Lo_TRAILING = 12, // '?' - - Mn_NONSPACING = 13, // '`' - Me_ENCLOSING = 14, // '`' - Mc_SPACING = 15, // '`' - - Nd_DIGIT = 16, // '9' // convert to digit - Nl_LETTER = 17, // 'X' // X,V,C,L,I ... +#include "unicode_table.h" + +#include <util/system/defaults.h> // wchar32, ui64, ULL() + +enum WC_TYPE { // TODO move no NUnicode + Lu_UPPER = 1, // 'Ъ' + Ll_LOWER = 2, // 'ъ' + Lt_TITLE = 3, // 'Ъ' + Lm_EXTENDER = 4, // '-' + Lm_LETTER = 5, // 'ъ' + Lo_OTHER = 6, // '?' + Lo_IDEOGRAPH = 7, // '?' + Lo_KATAKANA = 8, // '?' + Lo_HIRAGANA = 9, // '?' + Lo_LEADING = 10, // '?' + Lo_VOWEL = 11, // '?' + Lo_TRAILING = 12, // '?' + + Mn_NONSPACING = 13, // '`' + Me_ENCLOSING = 14, // '`' + Mc_SPACING = 15, // '`' + + Nd_DIGIT = 16, // '9' // convert to digit + Nl_LETTER = 17, // 'X' // X,V,C,L,I ... Nl_IDEOGRAPH = 18, // '?' - No_OTHER = 19, // '9' + No_OTHER = 19, // '9' - Zs_SPACE = 20, // ' ' [\40\240] SPACE ... NO-BREAK SPACE (00A0) - Zs_ZWSPACE = 21, // ' ' // nothing ? - Zl_LINE = 22, // '\n' + Zs_SPACE = 20, // ' ' [\40\240] SPACE ... NO-BREAK SPACE (00A0) + Zs_ZWSPACE = 21, // ' ' // nothing ? + Zl_LINE = 22, // '\n' Zp_PARAGRAPH = 23, // '\n' - Cc_ASCII = 24, // '\x1A' // can not happen - Cc_SPACE = 25, // '\x1A' // can not happen + Cc_ASCII = 24, // '\x1A' // can not happen + Cc_SPACE = 25, // '\x1A' // can not happen Cc_SEPARATOR = 26, // '\x1A' // can not happen - Cf_FORMAT = 27, // '\x1A' // nothing ? - Cf_JOIN = 28, // '\x1A' // nothing ? - Cf_BIDI = 29, // '\x1A' // nothing ? - Cf_ZWNBSP = 30, // '\x1A' // nothing ? - - Cn_UNASSIGNED = 0, // '?' - Co_PRIVATE = 0, // '?' - Cs_LOW = 31, // '?' - Cs_HIGH = 32, // '?' - - Pd_DASH = 33, // '-' - Pd_HYPHEN = 34, // '-' [-] HYPHEN-MINUS - Ps_START = 35, // '(' [([{] LEFT PARENTHESIS ... LEFT CURLY BRACKET - Ps_QUOTE = 36, // '"' - Pe_END = 37, // ')' [)]}] RIGHT PARENTHESIS ... RIGHT CURLY BRACKET - Pe_QUOTE = 38, // '"' - Pi_QUOTE = 39, // '"' - Pf_QUOTE = 40, // '"' + Cf_FORMAT = 27, // '\x1A' // nothing ? + Cf_JOIN = 28, // '\x1A' // nothing ? + Cf_BIDI = 29, // '\x1A' // nothing ? + Cf_ZWNBSP = 30, // '\x1A' // nothing ? + + Cn_UNASSIGNED = 0, // '?' + Co_PRIVATE = 0, // '?' + Cs_LOW = 31, // '?' + Cs_HIGH = 32, // '?' + + Pd_DASH = 33, // '-' + Pd_HYPHEN = 34, // '-' [-] HYPHEN-MINUS + Ps_START = 35, // '(' [([{] LEFT PARENTHESIS ... LEFT CURLY BRACKET + Ps_QUOTE = 36, // '"' + Pe_END = 37, // ')' [)]}] RIGHT PARENTHESIS ... RIGHT CURLY BRACKET + Pe_QUOTE = 38, // '"' + Pi_QUOTE = 39, // '"' + Pf_QUOTE = 40, // '"' Pc_CONNECTOR = 41, // '_' [_] LOW LINE - Po_OTHER = 42, // '*' [#%&*/@\] NUMBER SIGN ... REVERSE SOLIDUS - Po_QUOTE = 43, // '"' ["] QUOTATION MARK - Po_TERMINAL = 44, // '.' [!,.:;?] EXCLAMATION MARK ... QUESTION MARK - Po_EXTENDER = 45, // '-' [№] MIDDLE DOT (00B7) - Po_HYPHEN = 46, // '-' - - Sm_MATH = 47, // '=' [+<=>|~] PLUS SIGN ... TILDE - Sm_MINUS = 48, // '-' - Sc_CURRENCY = 49, // '$' [$] DOLLAR SIGN - Sk_MODIFIER = 50, // '`' [^`] CIRCUMFLEX ACCENT ... GRAVE ACCENT - So_OTHER = 51, // '°' [°] DEGREE SIGN (00B0) + Po_OTHER = 42, // '*' [#%&*/@\] NUMBER SIGN ... REVERSE SOLIDUS + Po_QUOTE = 43, // '"' ["] QUOTATION MARK + Po_TERMINAL = 44, // '.' [!,.:;?] EXCLAMATION MARK ... QUESTION MARK + Po_EXTENDER = 45, // '-' [№] MIDDLE DOT (00B7) + Po_HYPHEN = 46, // '-' + + Sm_MATH = 47, // '=' [+<=>|~] PLUS SIGN ... TILDE + Sm_MINUS = 48, // '-' + Sc_CURRENCY = 49, // '$' [$] DOLLAR SIGN + Sk_MODIFIER = 50, // '`' [^`] CIRCUMFLEX ACCENT ... GRAVE ACCENT + So_OTHER = 51, // '°' [°] DEGREE SIGN (00B0) Ps_SINGLE_QUOTE = 52, // '\'' ['] OPENING SINGLE QUOTE Pe_SINGLE_QUOTE = 53, // '\'' ['] CLOSING SINGLE QUOTE @@ -73,8 +73,8 @@ enum WC_TYPE { // TODO move no NUnicode Pf_SINGLE_QUOTE = 55, // '\'' ['] FINAL SINGLE QUOTE Po_SINGLE_QUOTE = 56, // '\'' ['] APOSTROPHE and PRIME - CCL_NUM = 57, - CCL_MASK = 0x3F, + CCL_NUM = 57, + CCL_MASK = 0x3F, IS_ASCII_XDIGIT = 1 << 6, IS_DIGIT = 1 << 7, @@ -90,13 +90,13 @@ enum WC_TYPE { // TODO move no NUnicode NFKD_QC = 1 << 14, NFKC_QC = 1 << 15, - BIDI_OFFSET = 16, - SVAL_OFFSET = 22, + BIDI_OFFSET = 16, + SVAL_OFFSET = 22, }; const size_t DEFCHAR_BUF = 58; // CCL_NUM + 1 -#define SHIFT(i) (ULL(1) << (i)) +#define SHIFT(i) (ULL(1) << (i)) namespace NUnicode { using TCombining = ui8; @@ -114,7 +114,7 @@ namespace NUnicode { using TUnidataTable = NUnicodeTable::TTable<NUnicodeTable::TSubtable<NUnicodeTable::UNICODE_TABLE_SHIFT, NUnicodeTable::TValues<TProperty>>>; const TUnidataTable& UnidataTable(); - + inline const TProperty& CharProperty(wchar32 ch) { return UnidataTable().Get(ch, DEFAULT_KEY); } @@ -126,7 +126,7 @@ namespace NUnicode { inline bool IsBidi(wchar32 ch, ui32 type) { return ((NUnicode::NPrivate::CharInfo(ch) >> BIDI_OFFSET) & 15) == type; } - } + } inline size_t UnicodeInstancesLimit() { return NPrivate::UnidataTable().Size(); @@ -142,10 +142,10 @@ namespace NUnicode { inline bool CharHasType(wchar32 ch, ui64 type_bits) { return (SHIFT(NUnicode::CharType(ch)) & type_bits) != 0; } -} +} // all usefull properties - + inline bool IsComposed(wchar32 ch) { return NUnicode::NPrivate::CharInfo(ch) & (IS_COMPAT | IS_CANON); } @@ -176,8 +176,8 @@ inline bool IsIgnorableCntrl(wchar32 ch) { } inline bool IsCntrl(wchar32 ch) { return NUnicode::CharHasType(ch, - SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) | - SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR)); + SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) | + SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR)); } inline bool IsZerowidth(wchar32 ch) { return NUnicode::CharHasType(ch, SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) | SHIFT(Zs_ZWSPACE)); @@ -196,15 +196,15 @@ inline bool IsHyphen(wchar32 ch) { } inline bool IsQuotation(wchar32 ch) { return NUnicode::CharHasType(ch, - SHIFT(Po_QUOTE) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | SHIFT(Pi_QUOTE) | - SHIFT(Pf_QUOTE) | SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | - SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE)); + SHIFT(Po_QUOTE) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | SHIFT(Pi_QUOTE) | + SHIFT(Pf_QUOTE) | SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | + SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE)); } inline bool IsSingleQuotation(wchar32 ch) { return NUnicode::CharHasType(ch, - SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) | - SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE)); + SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) | + SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE)); } inline bool IsTerminal(wchar32 ch) { @@ -212,9 +212,9 @@ inline bool IsTerminal(wchar32 ch) { } inline bool IsPairedPunct(wchar32 ch) { return NUnicode::CharHasType(ch, - SHIFT(Ps_START) | SHIFT(Pe_END) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | - SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | - SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE)); + SHIFT(Ps_START) | SHIFT(Pe_END) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | + SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | + SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE)); } inline bool IsLeftPunct(wchar32 ch) { return NUnicode::CharHasType(ch, SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Ps_SINGLE_QUOTE)); @@ -230,7 +230,7 @@ inline bool IsNonspacing(wchar32 ch) { } inline bool IsAlphabetic(wchar32 ch) { return NUnicode::CharHasType(ch, - SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_EXTENDER) | SHIFT(Lm_LETTER) | SHIFT(Lo_OTHER) | SHIFT(Nl_LETTER)); + SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_EXTENDER) | SHIFT(Lm_LETTER) | SHIFT(Lo_OTHER) | SHIFT(Nl_LETTER)); } inline bool IsIdeographic(wchar32 ch) { return NUnicode::CharHasType(ch, SHIFT(Lo_IDEOGRAPH) | SHIFT(Nl_IDEOGRAPH)); @@ -286,9 +286,9 @@ inline bool IsUnassigned(wchar32 ch) { inline bool IsPrivateHighSurrogate(wchar32 ch) { return NUnicode::CharHasType(ch, SHIFT(Cs_HIGH)) && (NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE); } - + // transformations - + inline wchar32 ToLower(wchar32 ch) { return static_cast<wchar32>(ch + NUnicode::NPrivate::CharProperty(ch).Lower); } @@ -303,9 +303,9 @@ inline int ToDigit(wchar32 ch) { ui32 i = NUnicode::NPrivate::CharInfo(ch); return (i & IS_DIGIT) ? static_cast<int>(i >> SVAL_OFFSET) : -1; } - + // BIDI properties - + inline bool IsBidiLeft(wchar32 ch) { return NUnicode::NPrivate::IsBidi(ch, 1); } @@ -342,7 +342,7 @@ inline bool IsBidiNeutral(wchar32 ch) { inline bool IsBidiNotappl(wchar32 ch) { return NUnicode::NPrivate::IsBidi(ch, 0); } - + inline bool IsSpace(wchar32 ch) { return IsWhitespace(ch); } @@ -357,23 +357,23 @@ inline bool IsTitle(wchar32 ch) { } inline bool IsAlpha(wchar32 ch) { return NUnicode::CharHasType(ch, - SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) | - SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) | - SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING)); + SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) | + SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) | + SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING)); } inline bool IsAlnum(wchar32 ch) { return NUnicode::CharHasType(ch, - SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) | - SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) | - SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING) | - SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER)); + SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) | + SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) | + SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING) | + SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER)); } inline bool IsPunct(wchar32 ch) { return NUnicode::CharHasType(ch, - SHIFT(Pd_DASH) | - SHIFT(Pd_HYPHEN) | SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Pe_END) | SHIFT(Pe_QUOTE) | SHIFT(Pc_CONNECTOR) | - SHIFT(Po_OTHER) | SHIFT(Po_QUOTE) | SHIFT(Po_TERMINAL) | SHIFT(Po_EXTENDER) | SHIFT(Po_HYPHEN) | - SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE)); + SHIFT(Pd_DASH) | + SHIFT(Pd_HYPHEN) | SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Pe_END) | SHIFT(Pe_QUOTE) | SHIFT(Pc_CONNECTOR) | + SHIFT(Po_OTHER) | SHIFT(Po_QUOTE) | SHIFT(Po_TERMINAL) | SHIFT(Po_EXTENDER) | SHIFT(Po_HYPHEN) | + SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE)); } inline bool IsXdigit(wchar32 ch) { return IsHexdigit(ch); |