about | summary | refs | log | tree | commit | diff | stats
path: root/util/charset/unidata.h
diff options
context:
space:
mode:
author: Anton Samokhvalov <pg83@yandex.ru> 2022-02-10 16:45:15 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:45:15 +0300
commit: 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch)
tree: da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /util/charset/unidata.h
parent: 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff)
download: ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'util/charset/unidata.h')
-rw-r--r-- util/charset/unidata.h 190
1 files changed, 95 insertions, 95 deletions
diff --git a/util/charset/unidata.h b/util/charset/unidata.h
index 400d314186..422c83d8ce 100644
--- a/util/charset/unidata.h
+++ b/util/charset/unidata.h
@@ -1,71 +1,71 @@
#pragma once
-#include "unicode_table.h"
-
-#include <util/system/defaults.h> // wchar32, ui64, ULL()
-
-enum WC_TYPE { // TODO move no NUnicode
- Lu_UPPER = 1, // 'Ъ'
- Ll_LOWER = 2, // 'ъ'
- Lt_TITLE = 3, // 'Ъ'
- Lm_EXTENDER = 4, // '-'
- Lm_LETTER = 5, // 'ъ'
- Lo_OTHER = 6, // '?'
- Lo_IDEOGRAPH = 7, // '?'
- Lo_KATAKANA = 8, // '?'
- Lo_HIRAGANA = 9, // '?'
- Lo_LEADING = 10, // '?'
- Lo_VOWEL = 11, // '?'
- Lo_TRAILING = 12, // '?'
-
- Mn_NONSPACING = 13, // '`'
- Me_ENCLOSING = 14, // '`'
- Mc_SPACING = 15, // '`'
-
- Nd_DIGIT = 16, // '9' // convert to digit
- Nl_LETTER = 17, // 'X' // X,V,C,L,I ...
+#include "unicode_table.h"
+
+#include <util/system/defaults.h> // wchar32, ui64, ULL()
+
+enum WC_TYPE { // TODO move no NUnicode
+ Lu_UPPER = 1, // 'Ъ'
+ Ll_LOWER = 2, // 'ъ'
+ Lt_TITLE = 3, // 'Ъ'
+ Lm_EXTENDER = 4, // '-'
+ Lm_LETTER = 5, // 'ъ'
+ Lo_OTHER = 6, // '?'
+ Lo_IDEOGRAPH = 7, // '?'
+ Lo_KATAKANA = 8, // '?'
+ Lo_HIRAGANA = 9, // '?'
+ Lo_LEADING = 10, // '?'
+ Lo_VOWEL = 11, // '?'
+ Lo_TRAILING = 12, // '?'
+
+ Mn_NONSPACING = 13, // '`'
+ Me_ENCLOSING = 14, // '`'
+ Mc_SPACING = 15, // '`'
+
+ Nd_DIGIT = 16, // '9' // convert to digit
+ Nl_LETTER = 17, // 'X' // X,V,C,L,I ...
Nl_IDEOGRAPH = 18, // '?'
- No_OTHER = 19, // '9'
+ No_OTHER = 19, // '9'
- Zs_SPACE = 20, // ' ' [\40\240] SPACE ... NO-BREAK SPACE (00A0)
- Zs_ZWSPACE = 21, // ' ' // nothing ?
- Zl_LINE = 22, // '\n'
+ Zs_SPACE = 20, // ' ' [\40\240] SPACE ... NO-BREAK SPACE (00A0)
+ Zs_ZWSPACE = 21, // ' ' // nothing ?
+ Zl_LINE = 22, // '\n'
Zp_PARAGRAPH = 23, // '\n'
- Cc_ASCII = 24, // '\x1A' // can not happen
- Cc_SPACE = 25, // '\x1A' // can not happen
+ Cc_ASCII = 24, // '\x1A' // can not happen
+ Cc_SPACE = 25, // '\x1A' // can not happen
Cc_SEPARATOR = 26, // '\x1A' // can not happen
- Cf_FORMAT = 27, // '\x1A' // nothing ?
- Cf_JOIN = 28, // '\x1A' // nothing ?
- Cf_BIDI = 29, // '\x1A' // nothing ?
- Cf_ZWNBSP = 30, // '\x1A' // nothing ?
-
- Cn_UNASSIGNED = 0, // '?'
- Co_PRIVATE = 0, // '?'
- Cs_LOW = 31, // '?'
- Cs_HIGH = 32, // '?'
-
- Pd_DASH = 33, // '-'
- Pd_HYPHEN = 34, // '-' [-] HYPHEN-MINUS
- Ps_START = 35, // '(' [([{] LEFT PARENTHESIS ... LEFT CURLY BRACKET
- Ps_QUOTE = 36, // '"'
- Pe_END = 37, // ')' [)]}] RIGHT PARENTHESIS ... RIGHT CURLY BRACKET
- Pe_QUOTE = 38, // '"'
- Pi_QUOTE = 39, // '"'
- Pf_QUOTE = 40, // '"'
+ Cf_FORMAT = 27, // '\x1A' // nothing ?
+ Cf_JOIN = 28, // '\x1A' // nothing ?
+ Cf_BIDI = 29, // '\x1A' // nothing ?
+ Cf_ZWNBSP = 30, // '\x1A' // nothing ?
+
+ Cn_UNASSIGNED = 0, // '?'
+ Co_PRIVATE = 0, // '?'
+ Cs_LOW = 31, // '?'
+ Cs_HIGH = 32, // '?'
+
+ Pd_DASH = 33, // '-'
+ Pd_HYPHEN = 34, // '-' [-] HYPHEN-MINUS
+ Ps_START = 35, // '(' [([{] LEFT PARENTHESIS ... LEFT CURLY BRACKET
+ Ps_QUOTE = 36, // '"'
+ Pe_END = 37, // ')' [)]}] RIGHT PARENTHESIS ... RIGHT CURLY BRACKET
+ Pe_QUOTE = 38, // '"'
+ Pi_QUOTE = 39, // '"'
+ Pf_QUOTE = 40, // '"'
Pc_CONNECTOR = 41, // '_' [_] LOW LINE
- Po_OTHER = 42, // '*' [#%&*/@\] NUMBER SIGN ... REVERSE SOLIDUS
- Po_QUOTE = 43, // '"' ["] QUOTATION MARK
- Po_TERMINAL = 44, // '.' [!,.:;?] EXCLAMATION MARK ... QUESTION MARK
- Po_EXTENDER = 45, // '-' [№] MIDDLE DOT (00B7)
- Po_HYPHEN = 46, // '-'
-
- Sm_MATH = 47, // '=' [+<=>|~] PLUS SIGN ... TILDE
- Sm_MINUS = 48, // '-'
- Sc_CURRENCY = 49, // '$' [$] DOLLAR SIGN
- Sk_MODIFIER = 50, // '`' [^`] CIRCUMFLEX ACCENT ... GRAVE ACCENT
- So_OTHER = 51, // '°' [°] DEGREE SIGN (00B0)
+ Po_OTHER = 42, // '*' [#%&*/@\] NUMBER SIGN ... REVERSE SOLIDUS
+ Po_QUOTE = 43, // '"' ["] QUOTATION MARK
+ Po_TERMINAL = 44, // '.' [!,.:;?] EXCLAMATION MARK ... QUESTION MARK
+ Po_EXTENDER = 45, // '-' [№] MIDDLE DOT (00B7)
+ Po_HYPHEN = 46, // '-'
+
+ Sm_MATH = 47, // '=' [+<=>|~] PLUS SIGN ... TILDE
+ Sm_MINUS = 48, // '-'
+ Sc_CURRENCY = 49, // '$' [$] DOLLAR SIGN
+ Sk_MODIFIER = 50, // '`' [^`] CIRCUMFLEX ACCENT ... GRAVE ACCENT
+ So_OTHER = 51, // '°' [°] DEGREE SIGN (00B0)
Ps_SINGLE_QUOTE = 52, // '\'' ['] OPENING SINGLE QUOTE
Pe_SINGLE_QUOTE = 53, // '\'' ['] CLOSING SINGLE QUOTE
@@ -73,8 +73,8 @@ enum WC_TYPE { // TODO move no NUnicode
Pf_SINGLE_QUOTE = 55, // '\'' ['] FINAL SINGLE QUOTE
Po_SINGLE_QUOTE = 56, // '\'' ['] APOSTROPHE and PRIME
- CCL_NUM = 57,
- CCL_MASK = 0x3F,
+ CCL_NUM = 57,
+ CCL_MASK = 0x3F,
IS_ASCII_XDIGIT = 1 << 6,
IS_DIGIT = 1 << 7,
@@ -90,13 +90,13 @@ enum WC_TYPE { // TODO move no NUnicode
NFKD_QC = 1 << 14,
NFKC_QC = 1 << 15,
- BIDI_OFFSET = 16,
- SVAL_OFFSET = 22,
+ BIDI_OFFSET = 16,
+ SVAL_OFFSET = 22,
};
const size_t DEFCHAR_BUF = 58; // CCL_NUM + 1
-#define SHIFT(i) (ULL(1) << (i))
+#define SHIFT(i) (ULL(1) << (i))
namespace NUnicode {
using TCombining = ui8;
@@ -114,7 +114,7 @@ namespace NUnicode {
using TUnidataTable = NUnicodeTable::TTable<NUnicodeTable::TSubtable<NUnicodeTable::UNICODE_TABLE_SHIFT, NUnicodeTable::TValues<TProperty>>>;
const TUnidataTable& UnidataTable();
-
+
inline const TProperty& CharProperty(wchar32 ch) {
return UnidataTable().Get(ch, DEFAULT_KEY);
}
@@ -126,7 +126,7 @@ namespace NUnicode {
inline bool IsBidi(wchar32 ch, ui32 type) {
return ((NUnicode::NPrivate::CharInfo(ch) >> BIDI_OFFSET) & 15) == type;
}
- }
+ }
inline size_t UnicodeInstancesLimit() {
return NPrivate::UnidataTable().Size();
@@ -142,10 +142,10 @@ namespace NUnicode {
inline bool CharHasType(wchar32 ch, ui64 type_bits) {
return (SHIFT(NUnicode::CharType(ch)) & type_bits) != 0;
}
-}
+}
// all usefull properties
-
+
inline bool IsComposed(wchar32 ch) {
return NUnicode::NPrivate::CharInfo(ch) & (IS_COMPAT | IS_CANON);
}
@@ -176,8 +176,8 @@ inline bool IsIgnorableCntrl(wchar32 ch) {
}
inline bool IsCntrl(wchar32 ch) {
return NUnicode::CharHasType(ch,
- SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) |
- SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR));
+ SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) |
+ SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR));
}
inline bool IsZerowidth(wchar32 ch) {
return NUnicode::CharHasType(ch, SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP) | SHIFT(Zs_ZWSPACE));
@@ -196,15 +196,15 @@ inline bool IsHyphen(wchar32 ch) {
}
inline bool IsQuotation(wchar32 ch) {
return NUnicode::CharHasType(ch,
- SHIFT(Po_QUOTE) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | SHIFT(Pi_QUOTE) |
- SHIFT(Pf_QUOTE) | SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) |
- SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
+ SHIFT(Po_QUOTE) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | SHIFT(Pi_QUOTE) |
+ SHIFT(Pf_QUOTE) | SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) |
+ SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
}
inline bool IsSingleQuotation(wchar32 ch) {
return NUnicode::CharHasType(ch,
- SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) |
- SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
+ SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) |
+ SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
}
inline bool IsTerminal(wchar32 ch) {
@@ -212,9 +212,9 @@ inline bool IsTerminal(wchar32 ch) {
}
inline bool IsPairedPunct(wchar32 ch) {
return NUnicode::CharHasType(ch,
- SHIFT(Ps_START) | SHIFT(Pe_END) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) |
- SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) |
- SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
+ SHIFT(Ps_START) | SHIFT(Pe_END) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) |
+ SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) |
+ SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE));
}
inline bool IsLeftPunct(wchar32 ch) {
return NUnicode::CharHasType(ch, SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Ps_SINGLE_QUOTE));
@@ -230,7 +230,7 @@ inline bool IsNonspacing(wchar32 ch) {
}
inline bool IsAlphabetic(wchar32 ch) {
return NUnicode::CharHasType(ch,
- SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_EXTENDER) | SHIFT(Lm_LETTER) | SHIFT(Lo_OTHER) | SHIFT(Nl_LETTER));
+ SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_EXTENDER) | SHIFT(Lm_LETTER) | SHIFT(Lo_OTHER) | SHIFT(Nl_LETTER));
}
inline bool IsIdeographic(wchar32 ch) {
return NUnicode::CharHasType(ch, SHIFT(Lo_IDEOGRAPH) | SHIFT(Nl_IDEOGRAPH));
@@ -286,9 +286,9 @@ inline bool IsUnassigned(wchar32 ch) {
inline bool IsPrivateHighSurrogate(wchar32 ch) {
return NUnicode::CharHasType(ch, SHIFT(Cs_HIGH)) && (NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE);
}
-
+
// transformations
-
+
inline wchar32 ToLower(wchar32 ch) {
return static_cast<wchar32>(ch + NUnicode::NPrivate::CharProperty(ch).Lower);
}
@@ -303,9 +303,9 @@ inline int ToDigit(wchar32 ch) {
ui32 i = NUnicode::NPrivate::CharInfo(ch);
return (i & IS_DIGIT) ? static_cast<int>(i >> SVAL_OFFSET) : -1;
}
-
+
// BIDI properties
-
+
inline bool IsBidiLeft(wchar32 ch) {
return NUnicode::NPrivate::IsBidi(ch, 1);
}
@@ -342,7 +342,7 @@ inline bool IsBidiNeutral(wchar32 ch) {
inline bool IsBidiNotappl(wchar32 ch) {
return NUnicode::NPrivate::IsBidi(ch, 0);
}
-
+
inline bool IsSpace(wchar32 ch) {
return IsWhitespace(ch);
}
@@ -357,23 +357,23 @@ inline bool IsTitle(wchar32 ch) {
}
inline bool IsAlpha(wchar32 ch) {
return NUnicode::CharHasType(ch,
- SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) |
- SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
- SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING));
+ SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) |
+ SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
+ SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING));
}
inline bool IsAlnum(wchar32 ch) {
return NUnicode::CharHasType(ch,
- SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) |
- SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
- SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING) |
- SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER));
+ SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE) | SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER) |
+ SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
+ SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING) |
+ SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER));
}
inline bool IsPunct(wchar32 ch) {
return NUnicode::CharHasType(ch,
- SHIFT(Pd_DASH) |
- SHIFT(Pd_HYPHEN) | SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Pe_END) | SHIFT(Pe_QUOTE) | SHIFT(Pc_CONNECTOR) |
- SHIFT(Po_OTHER) | SHIFT(Po_QUOTE) | SHIFT(Po_TERMINAL) | SHIFT(Po_EXTENDER) | SHIFT(Po_HYPHEN) |
- SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE));
+ SHIFT(Pd_DASH) |
+ SHIFT(Pd_HYPHEN) | SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Pe_END) | SHIFT(Pe_QUOTE) | SHIFT(Pc_CONNECTOR) |
+ SHIFT(Po_OTHER) | SHIFT(Po_QUOTE) | SHIFT(Po_TERMINAL) | SHIFT(Po_EXTENDER) | SHIFT(Po_HYPHEN) |
+ SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE));
}
inline bool IsXdigit(wchar32 ch) {
return IsHexdigit(ch);