aboutsummaryrefslogblamecommitdiffstats
path: root/util/charset/unidata.h
blob: 400d3141868e311ed2b85aa155d5f4d725729874 (plain) (tree)
1
2
3
4
5
6
            
 

                                                         
 











                                              
 

                              
 
                                                           
                             
                             
 

                                                                       
                              
                                                         
                                                         


                                                 
 


                             
 






                                                                                 
                                                



                                                                          
 



                                                                         





                                                            
                    
 
                             
                         
 
                        
 
                        
 


                      
 
                     
  
                                             
                                
                    
                           








                                 
                                        
                                                                                                                                                     
                                            
 
                                                          
                                                       






















                                                                                    
     
 
                         
 
                                                                     
 
                                                       
 
                                                        
 

                                                                                                                                   
 
                                                                                              
 
                                                     
 
                                                     
 
                                                       
 
                                                                                                            
 
                                    
                                                                                                        
 
                                                                                                                                
 
                                                     
 
                                                          
 
                                                                                          
 
                                                                          
 
                                     
                                    

                                                                                                               
 

                                           
                                                                                                           
 
                                                         
 
                                       
                                    

                                                                                                               
 
                                                                                                 
 
                                                                                               
 
                                                                                                     
 
                                                                                 
 
                                      
                                    
                                                                                                                                                                   
 
                                                                                
 
                                                         
 
                                                         
 
                                                        
 
                                                      
 
                                                         
 
                                    
                                                              
 
                                                      
 
                                                                                                                 
 
                                                         
 
                                                     
 
                                                                                                                                   
 
                                                    
 
                                                     
 
                                                          
 
                                                                                                         
 
                                                                                             
 
                                                                                                        
 
 
                  
 
                                                                                 
 
                                                                                 
 
                                                                                 
 

                                              
                                                                    
 
 














































                                                      

                                                      
                                    

                                                                                                                              

                                    


                                                                                                                              

                                    


                                                                                                                                                   





                                  







                                                                                 








                                                                                        
















                                                                                    
            
#pragma once

#include "unicode_table.h"

#include <util/system/defaults.h> // wchar32, ui64, ULL()

// Character classes and property flags packed into the per-character info word
// (NUnicode::NPrivate::TProperty::Info).
//
// Enumerators 0..56 are mutually exclusive character classes kept in the low six
// bits of the info word (extract them with CCL_MASK). The IS_* and *_QC
// enumerators are single-bit flags above those bits, and the *_OFFSET enumerators
// are bit positions of further fields inside the same word.
// The quoted character that starts most trailing comments appears to be a sample
// or stand-in glyph for the class; bracketed lists name example members.
enum WC_TYPE {        // TODO move to NUnicode
    Lu_UPPER = 1,     // 'Ъ'
    Ll_LOWER = 2,     // 'ъ'
    Lt_TITLE = 3,     // 'Ъ'
    Lm_EXTENDER = 4,  // '-'
    Lm_LETTER = 5,    // 'ъ'
    Lo_OTHER = 6,     // '?'
    Lo_IDEOGRAPH = 7, // '?'
    Lo_KATAKANA = 8,  // '?'
    Lo_HIRAGANA = 9,  // '?'
    Lo_LEADING = 10,  // '?'
    Lo_VOWEL = 11,    // '?'
    Lo_TRAILING = 12, // '?'

    Mn_NONSPACING = 13, // '`'
    Me_ENCLOSING = 14,  // '`'
    Mc_SPACING = 15,    // '`'

    Nd_DIGIT = 16,     // '9'           // convert to digit
    Nl_LETTER = 17,    // 'X'           // X,V,C,L,I ...
    Nl_IDEOGRAPH = 18, // '?'
    No_OTHER = 19,     // '9'

    Zs_SPACE = 20,     // ' ' [\40\240] SPACE ... NO-BREAK SPACE (00A0)
    Zs_ZWSPACE = 21,   // ' '           // nothing ?
    Zl_LINE = 22,      // '\n'
    Zp_PARAGRAPH = 23, // '\n'

    Cc_ASCII = 24,     // '\x1A'        // can not happen
    Cc_SPACE = 25,     // '\x1A'        // can not happen
    Cc_SEPARATOR = 26, // '\x1A'        // can not happen

    Cf_FORMAT = 27, // '\x1A'        // nothing ?
    Cf_JOIN = 28,   // '\x1A'        // nothing ?
    Cf_BIDI = 29,   // '\x1A'        // nothing ?
    Cf_ZWNBSP = 30, // '\x1A'        // nothing ?

    // Unassigned and private-use characters share class value 0; IsPrivate() and
    // IsUnassigned() tell them apart through the IS_PRIVATE flag.
    Cn_UNASSIGNED = 0, // '?'
    Co_PRIVATE = 0,    // '?'
    Cs_LOW = 31,       // '?'
    Cs_HIGH = 32,      // '?'

    Pd_DASH = 33,      // '-'
    Pd_HYPHEN = 34,    // '-' [-]       HYPHEN-MINUS
    Ps_START = 35,     // '(' [([{]     LEFT PARENTHESIS ... LEFT CURLY BRACKET
    Ps_QUOTE = 36,     // '"'
    Pe_END = 37,       // ')' [)]}]     RIGHT PARENTHESIS ... RIGHT CURLY BRACKET
    Pe_QUOTE = 38,     // '"'
    Pi_QUOTE = 39,     // '"'
    Pf_QUOTE = 40,     // '"'
    Pc_CONNECTOR = 41, // '_' [_]       LOW LINE
    Po_OTHER = 42,     // '*' [#%&*/@\] NUMBER SIGN ... REVERSE SOLIDUS
    Po_QUOTE = 43,     // '"' ["]       QUOTATION MARK
    Po_TERMINAL = 44,  // '.' [!,.:;?]  EXCLAMATION MARK ... QUESTION MARK
    Po_EXTENDER = 45,  // '-' [·]       MIDDLE DOT (00B7)
    Po_HYPHEN = 46,    // '-'

    Sm_MATH = 47,     // '=' [+<=>|~]  PLUS SIGN ... TILDE
    Sm_MINUS = 48,    // '-'
    Sc_CURRENCY = 49, // '$' [$]       DOLLAR SIGN
    Sk_MODIFIER = 50, // '`' [^`]      CIRCUMFLEX ACCENT ... GRAVE ACCENT
    So_OTHER = 51,    // '°' [°]       DEGREE SIGN (00B0)

    Ps_SINGLE_QUOTE = 52, // '\'' [']   OPENING SINGLE QUOTE
    Pe_SINGLE_QUOTE = 53, // '\'' [']   CLOSING SINGLE QUOTE
    Pi_SINGLE_QUOTE = 54, // '\'' [']   INITIAL SINGLE QUOTE
    Pf_SINGLE_QUOTE = 55, // '\'' [']   FINAL SINGLE QUOTE
    Po_SINGLE_QUOTE = 56, // '\'' [']   APOSTROPHE and PRIME

    // One past the largest class value above.
    CCL_NUM = 57,
    // Mask selecting the class from the info word (see NUnicode::CharType()).
    CCL_MASK = 0x3F,

    // Single-bit flags stored above the class bits.
    // IS_ASCII_XDIGIT is tested by IsHexdigit().
    IS_ASCII_XDIGIT = 1 << 6,
    // IS_DIGIT marks characters whose digit value is stored at SVAL_OFFSET (see ToDigit()).
    IS_DIGIT = 1 << 7,
    // IS_NONBREAK is tested by IsNonbreak().
    IS_NONBREAK = 1 << 8,

    // IS_PRIVATE is tested by IsPrivate(), IsUnassigned() and IsPrivateHighSurrogate().
    IS_PRIVATE = 1 << 9,

    // Tested by IsCompatComposed() / IsCanonComposed() / IsComposed().
    IS_COMPAT = 1 << 10,
    IS_CANON = 1 << 11,

    // Not read anywhere in this header; presumably normalization quick-check
    // flags for NFD/NFC/NFKD/NFKC — confirm against the table generator.
    NFD_QC = 1 << 12,
    NFC_QC = 1 << 13,
    NFKD_QC = 1 << 14,
    NFKC_QC = 1 << 15,

    // Bit position of the bidi class read by NUnicode::NPrivate::IsBidi().
    BIDI_OFFSET = 16,
    // Bit position of the digit value read by ToDigit().
    SVAL_OFFSET = 22,
};

// Kept numerically equal to CCL_NUM + 1; update both together.
// NOTE(review): presumably the size of a per-class buffer of default characters — confirm at the use sites.
const size_t DEFCHAR_BUF = 58; // CCL_NUM + 1

// Produces a 64-bit value with only bit `i` set. Used below to build sets of
// WC_TYPE classes for NUnicode::CharHasType(); #undef'ed at the end of this header.
#define SHIFT(i) (ULL(1) << (i))

namespace NUnicode {
    // Type of the per-character value returned by DecompositionCombining().
    using TCombining = ui8;

    namespace NPrivate {
        // One record of the character property table.
        struct TProperty {
            ui32 Info;            // packed WC_TYPE class, flags, bidi class and digit value
            i32 Lower;            // offset added to the code point by ::ToLower()
            i32 Upper;            // offset added to the code point by ::ToUpper()
            i32 Title;            // offset added to the code point by ::ToTitle()
            TCombining Combining; // value returned by DecompositionCombining()
        };

        // Second argument of every table lookup done by CharProperty(); defined elsewhere.
        extern const size_t DEFAULT_KEY;

        using TUnidataTable = NUnicodeTable::TTable<NUnicodeTable::TSubtable<NUnicodeTable::UNICODE_TABLE_SHIFT, NUnicodeTable::TValues<TProperty>>>;
        // Accessor for the property table; defined elsewhere.
        const TUnidataTable& UnidataTable();

        // Fetches the property record for ch from the table.
        inline const TProperty& CharProperty(wchar32 ch) {
            const TUnidataTable& table = UnidataTable();
            return table.Get(ch, DEFAULT_KEY);
        }

        // Returns the packed info word of ch.
        inline ui32 CharInfo(wchar32 ch) {
            const TProperty& property = CharProperty(ch);
            return property.Info;
        }

        // Compares the four bits of the info word starting at BIDI_OFFSET with `type`.
        inline bool IsBidi(wchar32 ch, ui32 type) {
            const ui32 bidiClass = (CharInfo(ch) >> BIDI_OFFSET) & 15;
            return bidiClass == type;
        }
    }

    // Reports Size() of the property table.
    inline size_t UnicodeInstancesLimit() {
        const NPrivate::TUnidataTable& table = NPrivate::UnidataTable();
        return table.Size();
    }

    // Returns the Combining field of the property record of ch.
    inline TCombining DecompositionCombining(wchar32 ch) {
        const NPrivate::TProperty& property = NPrivate::CharProperty(ch);
        return property.Combining;
    }

    // Extracts the character class (low CCL_MASK bits of the info word) of ch.
    inline WC_TYPE CharType(wchar32 ch) {
        const ui32 classBits = NPrivate::CharInfo(ch) & CCL_MASK;
        return static_cast<WC_TYPE>(classBits);
    }

    // True when the class of ch is a member of the set `type_bits`,
    // where bit N of `type_bits` stands for the class with value N.
    inline bool CharHasType(wchar32 ch, ui64 type_bits) {
        const ui64 ownBit = SHIFT(CharType(ch));
        return (ownBit & type_bits) != 0;
    }
}

// all useful properties

// True when the info word of ch has the IS_COMPAT or the IS_CANON flag set.
inline bool IsComposed(wchar32 ch) {
    const auto info = NUnicode::NPrivate::CharInfo(ch);
    return (info & (IS_COMPAT | IS_CANON)) != 0;
}
// True when the info word of ch has the IS_CANON flag set.
inline bool IsCanonComposed(wchar32 ch) {
    const auto info = NUnicode::NPrivate::CharInfo(ch);
    return (info & IS_CANON) != 0;
}
// True when the info word of ch has the IS_COMPAT flag set.
inline bool IsCompatComposed(wchar32 ch) {
    const auto info = NUnicode::NPrivate::CharInfo(ch);
    return (info & IS_COMPAT) != 0;
}

// True when the class of ch is Cc_SPACE, Zs_SPACE, Zs_ZWSPACE, Zl_LINE or Zp_PARAGRAPH.
inline bool IsWhitespace(wchar32 ch) {
    const auto spaces = SHIFT(Cc_SPACE) | SHIFT(Zs_SPACE) | SHIFT(Zs_ZWSPACE);
    const auto separators = SHIFT(Zl_LINE) | SHIFT(Zp_PARAGRAPH);
    return NUnicode::CharHasType(ch, spaces | separators);
}
// True when the class of ch is one of the Cc_* classes (Cc_ASCII, Cc_SPACE, Cc_SEPARATOR).
inline bool IsAsciiCntrl(wchar32 ch) {
    const auto controlClasses = SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR);
    return NUnicode::CharHasType(ch, controlClasses);
}
// True when the class of ch is Cf_BIDI.
inline bool IsBidiCntrl(wchar32 ch) {
    const auto bidiControl = SHIFT(Cf_BIDI);
    return NUnicode::CharHasType(ch, bidiControl);
}
// True when the class of ch is Cf_JOIN.
inline bool IsJoinCntrl(wchar32 ch) {
    const auto joinControl = SHIFT(Cf_JOIN);
    return NUnicode::CharHasType(ch, joinControl);
}
// True when the class of ch is Cf_FORMAT (the other Cf_* classes do not count here).
inline bool IsFormatCntrl(wchar32 ch) {
    const auto formatControl = SHIFT(Cf_FORMAT);
    return NUnicode::CharHasType(ch, formatControl);
}
// True when the class of ch is any of the Cf_* classes:
// Cf_FORMAT, Cf_JOIN, Cf_BIDI or Cf_ZWNBSP.
inline bool IsIgnorableCntrl(wchar32 ch) {
    const auto formatClasses = SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP);
    return NUnicode::CharHasType(ch, formatClasses);
}
// True when the class of ch is any Cf_* class (FORMAT, JOIN, BIDI, ZWNBSP)
// or any Cc_* class (ASCII, SPACE, SEPARATOR).
inline bool IsCntrl(wchar32 ch) {
    const auto formatClasses = SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP);
    const auto controlClasses = SHIFT(Cc_ASCII) | SHIFT(Cc_SPACE) | SHIFT(Cc_SEPARATOR);
    return NUnicode::CharHasType(ch, formatClasses | controlClasses);
}
// True when the class of ch is any Cf_* class (FORMAT, JOIN, BIDI, ZWNBSP) or Zs_ZWSPACE.
inline bool IsZerowidth(wchar32 ch) {
    const auto formatClasses = SHIFT(Cf_FORMAT) | SHIFT(Cf_JOIN) | SHIFT(Cf_BIDI) | SHIFT(Cf_ZWNBSP);
    const auto zeroWidthSpace = SHIFT(Zs_ZWSPACE);
    return NUnicode::CharHasType(ch, formatClasses | zeroWidthSpace);
}
// True when the class of ch is Zl_LINE.
inline bool IsLineSep(wchar32 ch) {
    const auto lineSeparator = SHIFT(Zl_LINE);
    return NUnicode::CharHasType(ch, lineSeparator);
}
// True when the class of ch is Zp_PARAGRAPH.
inline bool IsParaSep(wchar32 ch) {
    const auto paragraphSeparator = SHIFT(Zp_PARAGRAPH);
    return NUnicode::CharHasType(ch, paragraphSeparator);
}
// True when the class of ch is Pd_DASH, Pd_HYPHEN or Sm_MINUS.
inline bool IsDash(wchar32 ch) {
    const auto dashLike = SHIFT(Pd_DASH) | SHIFT(Pd_HYPHEN) | SHIFT(Sm_MINUS);
    return NUnicode::CharHasType(ch, dashLike);
}
// True when the class of ch is Pd_HYPHEN or Po_HYPHEN.
inline bool IsHyphen(wchar32 ch) {
    const auto hyphens = SHIFT(Pd_HYPHEN) | SHIFT(Po_HYPHEN);
    return NUnicode::CharHasType(ch, hyphens);
}
// True when the class of ch is any *_QUOTE or *_SINGLE_QUOTE class
// (Po, Ps, Pe, Pi and Pf variants of each).
inline bool IsQuotation(wchar32 ch) {
    const auto quotes = SHIFT(Po_QUOTE) | SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE);
    const auto singleQuotes = SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) |
                              SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE);
    return NUnicode::CharHasType(ch, quotes | singleQuotes);
}

// True when the class of ch is one of the five *_SINGLE_QUOTE classes (Po, Ps, Pe, Pi, Pf).
inline bool IsSingleQuotation(wchar32 ch) {
    const auto singleQuotes = SHIFT(Po_SINGLE_QUOTE) | SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) |
                              SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE);
    return NUnicode::CharHasType(ch, singleQuotes);
}

// True when the class of ch is Po_TERMINAL.
inline bool IsTerminal(wchar32 ch) {
    const auto terminal = SHIFT(Po_TERMINAL);
    return NUnicode::CharHasType(ch, terminal);
}
// True when the class of ch is Ps_START / Pe_END or any Ps/Pe/Pi/Pf variant of
// *_QUOTE and *_SINGLE_QUOTE. The Po_* quote classes are not part of this set.
inline bool IsPairedPunct(wchar32 ch) {
    const auto brackets = SHIFT(Ps_START) | SHIFT(Pe_END);
    const auto quotes = SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE);
    const auto singleQuotes = SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) |
                              SHIFT(Pi_SINGLE_QUOTE) | SHIFT(Pf_SINGLE_QUOTE);
    return NUnicode::CharHasType(ch, brackets | quotes | singleQuotes);
}
// True when the class of ch is one of the Ps_* classes: Ps_START, Ps_QUOTE or Ps_SINGLE_QUOTE.
inline bool IsLeftPunct(wchar32 ch) {
    const auto openers = SHIFT(Ps_START) | SHIFT(Ps_QUOTE) | SHIFT(Ps_SINGLE_QUOTE);
    return NUnicode::CharHasType(ch, openers);
}
// True when the class of ch is one of the Pe_* classes: Pe_END, Pe_QUOTE or Pe_SINGLE_QUOTE.
inline bool IsRightPunct(wchar32 ch) {
    const auto closers = SHIFT(Pe_END) | SHIFT(Pe_QUOTE) | SHIFT(Pe_SINGLE_QUOTE);
    return NUnicode::CharHasType(ch, closers);
}
// True when the class of ch is Mc_SPACING, Mn_NONSPACING or Me_ENCLOSING.
inline bool IsCombining(wchar32 ch) {
    const auto marks = SHIFT(Mc_SPACING) | SHIFT(Mn_NONSPACING) | SHIFT(Me_ENCLOSING);
    return NUnicode::CharHasType(ch, marks);
}
// True when the class of ch is Mn_NONSPACING or Me_ENCLOSING (Mc_SPACING is excluded).
inline bool IsNonspacing(wchar32 ch) {
    const auto nonspacingMarks = SHIFT(Mn_NONSPACING) | SHIFT(Me_ENCLOSING);
    return NUnicode::CharHasType(ch, nonspacingMarks);
}
// True when the class of ch is Lu_UPPER, Ll_LOWER, Lt_TITLE, Lm_EXTENDER, Lm_LETTER,
// Lo_OTHER or Nl_LETTER. Unlike IsAlpha(), the remaining Lo_* classes are not
// accepted and Nl_LETTER is.
inline bool IsAlphabetic(wchar32 ch) {
    const auto casedLetters = SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE);
    const auto modifierLetters = SHIFT(Lm_EXTENDER) | SHIFT(Lm_LETTER);
    const auto otherLetters = SHIFT(Lo_OTHER) | SHIFT(Nl_LETTER);
    return NUnicode::CharHasType(ch, casedLetters | modifierLetters | otherLetters);
}
// True when the class of ch is Lo_IDEOGRAPH or Nl_IDEOGRAPH.
inline bool IsIdeographic(wchar32 ch) {
    const auto ideographs = SHIFT(Lo_IDEOGRAPH) | SHIFT(Nl_IDEOGRAPH);
    return NUnicode::CharHasType(ch, ideographs);
}
// True when the class of ch is Lo_KATAKANA.
inline bool IsKatakana(wchar32 ch) {
    const auto katakana = SHIFT(Lo_KATAKANA);
    return NUnicode::CharHasType(ch, katakana);
}
// True when the class of ch is Lo_HIRAGANA.
inline bool IsHiragana(wchar32 ch) {
    const auto hiragana = SHIFT(Lo_HIRAGANA);
    return NUnicode::CharHasType(ch, hiragana);
}
// True when the class of ch is Lo_LEADING.
inline bool IsHangulLeading(wchar32 ch) {
    const auto leading = SHIFT(Lo_LEADING);
    return NUnicode::CharHasType(ch, leading);
}
// True when the class of ch is Lo_VOWEL.
inline bool IsHangulVowel(wchar32 ch) {
    const auto vowel = SHIFT(Lo_VOWEL);
    return NUnicode::CharHasType(ch, vowel);
}
// True when the class of ch is Lo_TRAILING.
inline bool IsHangulTrailing(wchar32 ch) {
    const auto trailing = SHIFT(Lo_TRAILING);
    return NUnicode::CharHasType(ch, trailing);
}
// True when the info word of ch has the IS_ASCII_XDIGIT flag set.
inline bool IsHexdigit(wchar32 ch) {
    const auto info = NUnicode::NPrivate::CharInfo(ch);
    return (info & IS_ASCII_XDIGIT) != 0;
}
// True when the class of ch is Nd_DIGIT.
inline bool IsDecdigit(wchar32 ch) {
    const auto decimalDigit = SHIFT(Nd_DIGIT);
    return NUnicode::CharHasType(ch, decimalDigit);
}
// True when the class of ch is Nd_DIGIT, Nl_LETTER, Nl_IDEOGRAPH or No_OTHER.
inline bool IsNumeric(wchar32 ch) {
    const auto numbers = SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER);
    return NUnicode::CharHasType(ch, numbers);
}
// True when the class of ch is Sc_CURRENCY.
inline bool IsCurrency(wchar32 ch) {
    const auto currency = SHIFT(Sc_CURRENCY);
    return NUnicode::CharHasType(ch, currency);
}
// True when the class of ch is Sm_MATH (Sm_MINUS is not accepted here).
inline bool IsMath(wchar32 ch) {
    const auto math = SHIFT(Sm_MATH);
    return NUnicode::CharHasType(ch, math);
}
// True when the class of ch is Sm_MATH, Sm_MINUS, Sc_CURRENCY, Sk_MODIFIER or So_OTHER.
inline bool IsSymbol(wchar32 ch) {
    const auto mathSymbols = SHIFT(Sm_MATH) | SHIFT(Sm_MINUS);
    const auto otherSymbols = SHIFT(Sc_CURRENCY) | SHIFT(Sk_MODIFIER) | SHIFT(So_OTHER);
    return NUnicode::CharHasType(ch, mathSymbols | otherSymbols);
}
// True when the class of ch is Cs_LOW.
inline bool IsLowSurrogate(wchar32 ch) {
    const auto lowSurrogate = SHIFT(Cs_LOW);
    return NUnicode::CharHasType(ch, lowSurrogate);
}
// True when the class of ch is Cs_HIGH.
inline bool IsHighSurrogate(wchar32 ch) {
    const auto highSurrogate = SHIFT(Cs_HIGH);
    return NUnicode::CharHasType(ch, highSurrogate);
}
// True when the info word of ch has the IS_NONBREAK flag set.
inline bool IsNonbreak(wchar32 ch) {
    const auto info = NUnicode::NPrivate::CharInfo(ch);
    return (info & IS_NONBREAK) != 0;
}
// True when ch carries the IS_PRIVATE flag and its class is not Cs_HIGH
// (flagged Cs_HIGH characters are reported by IsPrivateHighSurrogate() instead).
inline bool IsPrivate(wchar32 ch) {
    if (!(NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE)) {
        return false;
    }
    return !NUnicode::CharHasType(ch, SHIFT(Cs_HIGH));
}
// True when the class of ch is 0 and the IS_PRIVATE flag is clear.
// Class 0 is shared by Cn_UNASSIGNED and Co_PRIVATE, hence the extra flag test.
inline bool IsUnassigned(wchar32 ch) {
    if (NUnicode::CharType(ch) != 0) {
        return false;
    }
    return (NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE) == 0;
}
// True when the class of ch is Cs_HIGH and the IS_PRIVATE flag is set.
inline bool IsPrivateHighSurrogate(wchar32 ch) {
    if (!NUnicode::CharHasType(ch, SHIFT(Cs_HIGH))) {
        return false;
    }
    return (NUnicode::NPrivate::CharInfo(ch) & IS_PRIVATE) != 0;
}

// transformations

// Returns ch shifted by the Lower offset stored in its property record.
inline wchar32 ToLower(wchar32 ch) {
    const auto& property = NUnicode::NPrivate::CharProperty(ch);
    return static_cast<wchar32>(ch + property.Lower);
}
// Returns ch shifted by the Upper offset stored in its property record.
inline wchar32 ToUpper(wchar32 ch) {
    const auto& property = NUnicode::NPrivate::CharProperty(ch);
    return static_cast<wchar32>(ch + property.Upper);
}
// Returns ch shifted by the Title offset stored in its property record.
inline wchar32 ToTitle(wchar32 ch) {
    const auto& property = NUnicode::NPrivate::CharProperty(ch);
    return static_cast<wchar32>(ch + property.Title);
}

// Returns the value stored in the info word from bit SVAL_OFFSET upwards when ch
// has the IS_DIGIT flag, and -1 otherwise.
inline int ToDigit(wchar32 ch) {
    const ui32 info = NUnicode::NPrivate::CharInfo(ch);
    if (!(info & IS_DIGIT)) {
        return -1;
    }
    return static_cast<int>(info >> SVAL_OFFSET);
}

// BIDI properties

// True when the bidi class stored for ch equals 1.
inline bool IsBidiLeft(wchar32 ch) {
    constexpr unsigned bidiClass = 1;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 2.
inline bool IsBidiRight(wchar32 ch) {
    constexpr unsigned bidiClass = 2;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 3.
inline bool IsBidiEuronum(wchar32 ch) {
    constexpr unsigned bidiClass = 3;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 4.
inline bool IsBidiEurosep(wchar32 ch) {
    constexpr unsigned bidiClass = 4;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 5.
inline bool IsBidiEuroterm(wchar32 ch) {
    constexpr unsigned bidiClass = 5;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 6.
inline bool IsBidiArabnum(wchar32 ch) {
    constexpr unsigned bidiClass = 6;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 7.
inline bool IsBidiCommsep(wchar32 ch) {
    constexpr unsigned bidiClass = 7;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 8.
inline bool IsBidiBlocksep(wchar32 ch) {
    constexpr unsigned bidiClass = 8;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 9.
inline bool IsBidiSegmsep(wchar32 ch) {
    constexpr unsigned bidiClass = 9;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 10.
inline bool IsBidiSpace(wchar32 ch) {
    constexpr unsigned bidiClass = 10;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 11.
inline bool IsBidiNeutral(wchar32 ch) {
    constexpr unsigned bidiClass = 11;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}
// True when the bidi class stored for ch equals 0.
inline bool IsBidiNotappl(wchar32 ch) {
    constexpr unsigned bidiClass = 0;
    return NUnicode::NPrivate::IsBidi(ch, bidiClass);
}

// Alias of IsWhitespace().
inline bool IsSpace(wchar32 ch) {
    const bool whitespace = IsWhitespace(ch);
    return whitespace;
}
// True when the class of ch is Ll_LOWER.
inline bool IsLower(wchar32 ch) {
    const auto lowercase = SHIFT(Ll_LOWER);
    return NUnicode::CharHasType(ch, lowercase);
}
// True when the class of ch is Lu_UPPER.
inline bool IsUpper(wchar32 ch) {
    const auto uppercase = SHIFT(Lu_UPPER);
    return NUnicode::CharHasType(ch, uppercase);
}
// True when the class of ch is Lt_TITLE.
inline bool IsTitle(wchar32 ch) {
    const auto titlecase = SHIFT(Lt_TITLE);
    return NUnicode::CharHasType(ch, titlecase);
}
// True when the class of ch is any letter class: Lu_UPPER, Ll_LOWER, Lt_TITLE,
// both Lm_* classes and all seven Lo_* classes.
inline bool IsAlpha(wchar32 ch) {
    const auto casedLetters = SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE);
    const auto modifierLetters = SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER);
    const auto otherLetters = SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
                              SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING);
    return NUnicode::CharHasType(ch, casedLetters | modifierLetters | otherLetters);
}
// True when the class of ch is any letter class (Lu, Ll, Lt, Lm_*, Lo_*) or any
// number class (Nd_DIGIT, Nl_LETTER, Nl_IDEOGRAPH, No_OTHER).
inline bool IsAlnum(wchar32 ch) {
    const auto casedLetters = SHIFT(Lu_UPPER) | SHIFT(Ll_LOWER) | SHIFT(Lt_TITLE);
    const auto modifierLetters = SHIFT(Lm_LETTER) | SHIFT(Lm_EXTENDER);
    const auto otherLetters = SHIFT(Lo_OTHER) | SHIFT(Lo_IDEOGRAPH) | SHIFT(Lo_KATAKANA) | SHIFT(Lo_HIRAGANA) |
                              SHIFT(Lo_LEADING) | SHIFT(Lo_VOWEL) | SHIFT(Lo_TRAILING);
    const auto numbers = SHIFT(Nd_DIGIT) | SHIFT(Nl_LETTER) | SHIFT(Nl_IDEOGRAPH) | SHIFT(No_OTHER);
    return NUnicode::CharHasType(ch, casedLetters | modifierLetters | otherLetters | numbers);
}
// True when the class of ch is any punctuation class: the Pd_* dashes, Ps_START /
// Pe_END, Pc_CONNECTOR, every *_QUOTE class, every *_SINGLE_QUOTE class and the
// remaining Po_* classes (OTHER, TERMINAL, EXTENDER, HYPHEN).
//
// The five *_SINGLE_QUOTE classes are part of the set so that IsPunct() agrees
// with IsQuotation(), IsSingleQuotation() and IsPairedPunct(), which already
// accept them. Without them, characters of those classes were reported as
// non-punctuation here and therefore also failed IsGraph() and IsPrint().
inline bool IsPunct(wchar32 ch) {
    const auto dashes = SHIFT(Pd_DASH) | SHIFT(Pd_HYPHEN);
    const auto brackets = SHIFT(Ps_START) | SHIFT(Pe_END);
    const auto quotes = SHIFT(Ps_QUOTE) | SHIFT(Pe_QUOTE) | SHIFT(Pi_QUOTE) | SHIFT(Pf_QUOTE) | SHIFT(Po_QUOTE);
    const auto singleQuotes = SHIFT(Ps_SINGLE_QUOTE) | SHIFT(Pe_SINGLE_QUOTE) | SHIFT(Pi_SINGLE_QUOTE) |
                              SHIFT(Pf_SINGLE_QUOTE) | SHIFT(Po_SINGLE_QUOTE);
    const auto others = SHIFT(Pc_CONNECTOR) | SHIFT(Po_OTHER) | SHIFT(Po_TERMINAL) | SHIFT(Po_EXTENDER) |
                        SHIFT(Po_HYPHEN);
    return NUnicode::CharHasType(ch, dashes | brackets | quotes | singleQuotes | others);
}
// Alias of IsHexdigit().
inline bool IsXdigit(wchar32 ch) {
    const bool hexDigit = IsHexdigit(ch);
    return hexDigit;
}
// Alias of IsDecdigit().
inline bool IsDigit(wchar32 ch) {
    const bool decimalDigit = IsDecdigit(ch);
    return decimalDigit;
}

// True only for the ASCII digits '0'..'9'.
inline bool IsCommonDigit(wchar32 ch) {
    // IsDigit() also accepts exotic digits such as "VAI DIGIT TWO" (U+A622),
    // so it is not safe in front of FromString() converters; this check is.
    const wchar32 lowest = '0';
    const wchar32 highest = '9';
    if (ch < lowest) {
        return false;
    }
    return ch <= highest;
}

// True when ch satisfies IsAlnum(), IsPunct() or IsSymbol(), tested in that order.
inline bool IsGraph(wchar32 ch) {
    if (IsAlnum(ch)) {
        return true;
    }
    if (IsPunct(ch)) {
        return true;
    }
    return IsSymbol(ch);
}
// True when the class of ch is Zs_SPACE or Zs_ZWSPACE, or when ch is a TAB ('\t').
inline bool IsBlank(wchar32 ch) {
    if (NUnicode::CharHasType(ch, SHIFT(Zs_SPACE) | SHIFT(Zs_ZWSPACE))) {
        return true;
    }
    return ch == '\t';
}
// True when ch satisfies IsAlnum(), IsPunct(), IsSymbol() or IsBlank(), tested in that order.
inline bool IsPrint(wchar32 ch) {
    const bool graphic = IsAlnum(ch) || IsPunct(ch) || IsSymbol(ch);
    return graphic || IsBlank(ch);
}

// True for characters usable as Roman numerals:
//  - characters of class Nl_LETTER inside U+2160..U+2188, or
//  - code points below 127 whose ::ToLower() form is one of i, v, x, l, c, d, m.
inline bool IsRomanDigit(wchar32 ch) {
    const bool numberForm = NUnicode::CharHasType(ch, SHIFT(Nl_LETTER)) && 0x2160 <= ch && ch <= 0x2188;
    if (numberForm) {
        return true;
    }
    if (!(ch < 127)) {
        return false;
    }
    const char lowered = static_cast<char>(::ToLower(ch));
    return lowered == 'i' || lowered == 'v' || lowered == 'x' || lowered == 'l' ||
           lowered == 'c' || lowered == 'd' || lowered == 'm';
}

// SHIFT is only a helper for the predicates above; remove it so that it does not
// leak into files that include this header.
#undef SHIFT