diff options
author | mcheshkov <mcheshkov@yandex-team.ru> | 2022-02-10 16:46:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:15 +0300 |
commit | e9d19cec64684c9c1e6b0c98297e5b895cf904fe (patch) | |
tree | 2768b1223e96a8a0610a93d18425d9647c1123c8 /contrib/libs/icu/i18n/regexst.cpp | |
parent | 60040c91ffe701a84689b2c6310ff845e65cff42 (diff) | |
download | ydb-e9d19cec64684c9c1e6b0c98297e5b895cf904fe.tar.gz |
Restoring authorship annotation for <mcheshkov@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/i18n/regexst.cpp')
-rw-r--r-- | contrib/libs/icu/i18n/regexst.cpp | 136 |
1 files changed, 68 insertions, 68 deletions
diff --git a/contrib/libs/icu/i18n/regexst.cpp b/contrib/libs/icu/i18n/regexst.cpp index 97e417ab5a..d117a80e9b 100644 --- a/contrib/libs/icu/i18n/regexst.cpp +++ b/contrib/libs/icu/i18n/regexst.cpp @@ -1,4 +1,4 @@ -// © 2016 and later: Unicode, Inc. and others. +// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // regexst.h @@ -37,99 +37,99 @@ U_NAMESPACE_BEGIN -// "Rule Char" Characters are those with special meaning, and therefore -// need to be escaped to appear as literals in a regexp. -constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\."; +// "Rule Char" Characters are those with special meaning, and therefore +// need to be escaped to appear as literals in a regexp. +constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\."; // -// The backslash escape characters that ICU's unescape() function will handle. +// The backslash escape characters that ICU's unescape() function will handle. // -constexpr char16_t const *gUnescapeChars = u"acefnrtuUx"; +constexpr char16_t const *gUnescapeChars = u"acefnrtuUx"; // -// Unicode Set pattern for Regular Expression \w +// Unicode Set pattern for Regular Expression \w // -constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]"; +constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]"; // // Unicode Set Definitions for Regular Expression \s // -constexpr char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]"; +constexpr char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]"; // // UnicodeSets used in implementation of Grapheme Cluster detection, \X // -constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]"; -constexpr char16_t const *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]"; -constexpr char16_t const *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]"; -constexpr char16_t const *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]"; -constexpr char16_t const *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]"; -constexpr char16_t const *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]"; -constexpr char16_t const *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]"; - - -RegexStaticSets *RegexStaticSets::gStaticSets = nullptr; -UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER; - - -RegexStaticSets::RegexStaticSets(UErrorCode *status) { - // Initialize the shared static sets to their correct values. - fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze(); - fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze(); - fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze(); - fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status).freeze(); - fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(TRUE, gGC_ControlPattern, -1), *status).freeze(); - fPropSets[URX_GC_L].applyPattern(UnicodeString(TRUE, gGC_LPattern, -1), *status).freeze(); - fPropSets[URX_GC_V].applyPattern(UnicodeString(TRUE, gGC_VPattern, -1), *status).freeze(); - fPropSets[URX_GC_T].applyPattern(UnicodeString(TRUE, gGC_TPattern, -1), *status).freeze(); - fPropSets[URX_GC_LV].applyPattern(UnicodeString(TRUE, gGC_LVPattern, -1), *status).freeze(); - fPropSets[URX_GC_LVT].applyPattern(UnicodeString(TRUE, gGC_LVTPattern, -1), *status).freeze(); +constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]"; +constexpr char16_t const *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]"; +constexpr char16_t const *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]"; +constexpr char16_t const *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]"; +constexpr char16_t const *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]"; +constexpr char16_t const *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]"; +constexpr char16_t const *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]"; + + +RegexStaticSets *RegexStaticSets::gStaticSets = nullptr; +UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER; + + +RegexStaticSets::RegexStaticSets(UErrorCode *status) { + // Initialize the shared static sets to their correct values. + fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze(); + fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze(); + fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze(); + fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status).freeze(); + fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(TRUE, gGC_ControlPattern, -1), *status).freeze(); + fPropSets[URX_GC_L].applyPattern(UnicodeString(TRUE, gGC_LPattern, -1), *status).freeze(); + fPropSets[URX_GC_V].applyPattern(UnicodeString(TRUE, gGC_VPattern, -1), *status).freeze(); + fPropSets[URX_GC_T].applyPattern(UnicodeString(TRUE, gGC_TPattern, -1), *status).freeze(); + fPropSets[URX_GC_LV].applyPattern(UnicodeString(TRUE, gGC_LVPattern, -1), *status).freeze(); + fPropSets[URX_GC_LVT].applyPattern(UnicodeString(TRUE, gGC_LVTPattern, -1), *status).freeze(); // // "Normal" is the set of characters that don't need special handling // when finding grapheme cluster boundaries. // - fPropSets[URX_GC_NORMAL].complement(); - fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4); - fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]); - fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]); - fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]); - fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]); - fPropSets[URX_GC_NORMAL].freeze(); + fPropSets[URX_GC_NORMAL].complement(); + fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4); + fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]); + fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]); + fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]); + fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]); + fPropSets[URX_GC_NORMAL].freeze(); // Initialize the 8-bit fast bit sets from the parallel full // UnicodeSets. - // - // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping? - // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x" - // This runs in exponential time, making it easy to adjust the time for - // convenient measuring. - // - // This 8 bit optimization dates from the early days of ICU, - // with a less optimized UnicodeSet. At the time, the difference - // was substantial. - - for (int32_t i=0; i<URX_LAST_SET; i++) { - fPropSets8[i].init(&fPropSets[i]); + // + // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping? + // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x" + // This runs in exponential time, making it easy to adjust the time for + // convenient measuring. + // + // This 8 bit optimization dates from the early days of ICU, + // with a less optimized UnicodeSet. At the time, the difference + // was substantial. + + for (int32_t i=0; i<URX_LAST_SET; i++) { + fPropSets8[i].init(&fPropSets[i]); } // Sets used while parsing rules, but not referenced from the parse state table - fRuleSets[kRuleSet_rule_char-128] - .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze(); - - fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze(); - fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze(); + fRuleSets[kRuleSet_rule_char-128] + .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze(); + + fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze(); + fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze(); fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128]; - // Finally, initialize an empty UText string for utility purposes - fEmptyText = utext_openUChars(nullptr, nullptr, 0, status); + // Finally, initialize an empty UText string for utility purposes + fEmptyText = utext_openUChars(nullptr, nullptr, 0, status); } RegexStaticSets::~RegexStaticSets() { - fRuleDigitsAlias = nullptr; + fRuleDigitsAlias = nullptr; utext_close(fEmptyText); } @@ -144,21 +144,21 @@ RegexStaticSets::~RegexStaticSets() { U_CDECL_BEGIN static UBool U_CALLCONV regex_cleanup(void) { - delete RegexStaticSets::gStaticSets; - RegexStaticSets::gStaticSets = nullptr; - gStaticSetsInitOnce.reset(); - return TRUE; + delete RegexStaticSets::gStaticSets; + RegexStaticSets::gStaticSets = nullptr; + gStaticSetsInitOnce.reset(); + return TRUE; } static void U_CALLCONV initStaticSets(UErrorCode &status) { - U_ASSERT(RegexStaticSets::gStaticSets == nullptr); + U_ASSERT(RegexStaticSets::gStaticSets == nullptr); ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup); RegexStaticSets::gStaticSets = new RegexStaticSets(&status); if (U_FAILURE(status)) { delete RegexStaticSets::gStaticSets; - RegexStaticSets::gStaticSets = nullptr; + RegexStaticSets::gStaticSets = nullptr; } - if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) { + if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; } } |