diff options
author | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-12-02 11:31:25 +0300 |
commit | b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806 (patch) | |
tree | 2a23209faf0fea5586a6d4b9cee60d1b318d29fe /library/cpp/unicode | |
parent | 559174a9144de40d6bb3997ea4073c82289b4974 (diff) | |
download | ydb-b1f4ffc9c8abff3ba58dc1ec9a9f92d2f0de6806.tar.gz |
remove kikimr/driver DEPENDS
Diffstat (limited to 'library/cpp/unicode')
-rw-r--r-- | library/cpp/unicode/folding/fold.cpp | 78 | ||||
-rw-r--r-- | library/cpp/unicode/folding/fold.h | 141 | ||||
-rw-r--r-- | library/cpp/unicode/folding/fold_impl.rl6 | 635 |
3 files changed, 0 insertions, 854 deletions
diff --git a/library/cpp/unicode/folding/fold.cpp b/library/cpp/unicode/folding/fold.cpp deleted file mode 100644 index 47a42a80b2..0000000000 --- a/library/cpp/unicode/folding/fold.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include "fold.h" - -namespace NUF { - TNormalizer::TNormalizer(ELanguage lmain, ELanguage laux) - : DoRenyxa() - , DoLowerCase() - , DoSimpleCyr() - , FillOffsets() - { - Reset(); - SetLanguages(lmain, laux); - } - - TNormalizer::TNormalizer(const TLanguages& langs) - : DoRenyxa() - , DoLowerCase() - , DoSimpleCyr() - , FillOffsets() - { - Reset(); - SetLanguages(langs); - } - - void TNormalizer::SetLanguages(ELanguage lmain, ELanguage laux) { - Languages.reset(); - Scripts.reset(); - Languages.set(lmain); - Languages.set(laux); - Scripts.set(ScriptByLanguage(lmain)); - Scripts.set(ScriptByLanguage(laux)); - } - - void TNormalizer::SetLanguages(const TLanguages& langs) { - Languages = langs; - Scripts.reset(); - - for (ui32 i = 0; i < langs.size(); ++i) { - if (langs.test(i)) - Scripts.set(ScriptByLanguage(ELanguage(i))); - } - } - - void TNormalizer::SetDoRenyxa(bool da) { - DoRenyxa = da; - } - - void TNormalizer::SetDoLowerCase(bool da) { - DoLowerCase = da; - } - - void TNormalizer::SetDoSimpleCyr(bool da) { - DoSimpleCyr = da; - } - - void TNormalizer::SetFillOffsets(bool da) { - FillOffsets = da; - } - - void TNormalizer::Reset() { - CDBuf.clear(); - OutBuf.clear(); - CDOffsets.clear(); - TmpBuf.clear(); - p = p0 = pe = eof = ts = te = ret = nullptr; - cs = act = 0; - } - - void TNormalizer::SetInput(TWtringBuf b) { - Reset(); - CDBuf.reserve(2 * b.size()); - OutBuf.reserve(2 * b.size()); - - Decomposer.Normalize(b.data(), b.size(), CDBuf); - p = p0 = CDBuf.begin(); - pe = eof = CDBuf.end(); - } - -} diff --git a/library/cpp/unicode/folding/fold.h b/library/cpp/unicode/folding/fold.h deleted file mode 100644 index 516c9962c0..0000000000 --- a/library/cpp/unicode/folding/fold.h +++ /dev/null @@ -1,141 +0,0 @@ -#pragma once - -#include <library/cpp/unicode/normalization/normalization.h> -#include <library/cpp/langs/langs.h> -#include <util/generic/strbuf.h> -#include <util/generic/vector.h> - -#include <bitset> - -namespace NUF { - using TLanguages = std::bitset<LANG_MAX>; - using TScripts = std::bitset<SCRIPT_MAX>; - - /* language-sensitive - * insignificant diacritics are removed - * significant diacritics are either left in place or turned into diftongs (i.e. umlauts in german) - * ligatures and special symbols are decomposed - * all control and space characters are made spaces and duplicates are collapsed - * all dash characters are made dashes - * all invisible characters (shy, zwspaces) are removed - * all other characters are left intact - * designed to be more robust and aggressive than lemmer normalization - * MAY CONTAIN INCORRECT DATA OR DISCONTAIN SOME IMPORTANT DATA! - * - * TODO: make a tool to generate rules automatically on ICU and lemmer data - * - * @maintainer: velavokr - */ - - using TOffsets = TVector<size_t>; - class TNormalizer { - TLanguages Languages; - TScripts Scripts; - - TVector<wchar16> CDBuf; - TVector<wchar16> OutBuf; - TVector<wchar16> TmpBuf; - TOffsets CDOffsets; - - NUnicode::TNormalizer<NUnicode::NFD> Decomposer; - NUnicode::TNormalizer<NUnicode::NFC> Recomposer; - - const wchar16* p; - const wchar16* p0; - const wchar16* pe; - const wchar16* eof; - const wchar16* ts; - const wchar16* te; - const wchar16* ret; - int cs; - int act; - - bool DoRenyxa; - bool DoLowerCase; - bool DoSimpleCyr; - bool FillOffsets; - - public: - TNormalizer(ELanguage lmain = LANG_UNK, ELanguage laux = LANG_UNK); - TNormalizer(const TLanguages& langs); - - void SetDoRenyxa(bool); - void SetDoLowerCase(bool); - void SetDoSimpleCyr(bool); - void SetFillOffsets(bool); - void SetLanguages(ELanguage lmain, ELanguage laux = LANG_UNK); - void SetLanguages(const TLanguages& langs); - - void Reset(); - void SetInput(TWtringBuf b); - - TWtringBuf GetOutput() const { - return TWtringBuf(OutBuf.data(), OutBuf.size()); - } - - TWtringBuf GetCanonDenormalizedInput() const { - return TWtringBuf(CDBuf.data(), CDBuf.size()); - } - - const TOffsets& GetOffsetsInCanonDenormalizedInput() const { - return CDOffsets; - } - - void DoNormalize(); - - protected: - static const ui64 ZERO_WIDTH = - (ULL(1) << (Cf_FORMAT)) | (ULL(1) << (Cf_JOIN)) | (ULL(1) << (Cf_BIDI)) | (ULL(1) << (Cf_ZWNBSP)) | (ULL(1) << (Zs_ZWSPACE)) | (ULL(1) << (Mc_SPACING)) | (ULL(1) << (Mn_NONSPACING)) | (ULL(1) << (Me_ENCLOSING)); - - static const ui64 SPACE = - (ULL(1) << (Cc_SPACE)) | (ULL(1) << (Zs_SPACE)) | (ULL(1) << (Zl_LINE)) | (ULL(1) << (Zp_PARAGRAPH)) | (ULL(1) << (Cc_ASCII)) | (ULL(1) << (Cc_SEPARATOR)) | (ULL(1) << (Cn_UNASSIGNED)) | (ULL(1) << (Co_PRIVATE)); - - bool Is(ELanguage lang) const { - return Languages.test(lang); - } - bool Is(EScript scr) const { - return Scripts.test(scr); - } - - bool IsSpace() const { - return NUnicode::CharHasType(*p, SPACE); - } - bool IsNothing() const { - return NUnicode::CharHasType(*p, ZERO_WIDTH) || wchar16(0xAD) /*shy*/ == *p; - } - bool IsDash() const { - return ::IsDash(*p); - } - - void Emit(wchar16 c, size_t off = 0) { - OutBuf.push_back(c); - if (FillOffsets) - CDOffsets.push_back(ts - p0 + off); - } - - void EmitUpper(wchar16 c, size_t off = 0) { - if (DoLowerCase) - Emit(ToLower(c), off); - else - Emit(c, off); - } - - void EmitRenyxa(wchar16 c, size_t off = 0) { - if (DoRenyxa) - EmitUpper(c, off); - else - EmitUpper(*ts, off); - } - - void EmitSimpleCyr(wchar16 c, size_t off = 0) { - if (DoSimpleCyr) - EmitUpper(c, off); - else - EmitUpper(*ts, off); - } - - wchar16 Last() const { - return OutBuf.empty() ? 0 : OutBuf.back(); - } - }; -} diff --git a/library/cpp/unicode/folding/fold_impl.rl6 b/library/cpp/unicode/folding/fold_impl.rl6 deleted file mode 100644 index 5f62e1c01d..0000000000 --- a/library/cpp/unicode/folding/fold_impl.rl6 +++ /dev/null @@ -1,635 +0,0 @@ -#if defined(__GNUC__) -# pragma GCC diagnostic ignored "-Wsign-compare" -#endif - -#include <library/cpp/unicode/folding/fold.h> - -namespace NUF { - -void TNormalizer::DoNormalize() { -#if 0 -%%{ -machine Normalizer; -alphtype unsigned short; - -action H { Hold(); } -action R { Ret(); } - -dia = 0x300 .. 0x36F; - -main := |* - -############################ -## cyr -> lat renyxization -############################ - -## і -0x456 { EmitRenyxa('i'); }; -0x406 { EmitRenyxa('I'); }; - -## ј -0x458 { EmitRenyxa('j'); }; -0x408 { EmitRenyxa('J'); }; - -## с -0x441 { EmitRenyxa('c'); }; -0x421 { EmitRenyxa('C'); }; - -## - -############################ -## cyr simplification -############################ - -## ә -> а -0x4D9 { EmitSimpleCyr(0x430); }; -0x4D8 { EmitSimpleCyr(0x410); }; - -## Һ -> х -0x4BB { EmitSimpleCyr(0x445); }; -0x4BA { EmitSimpleCyr(0x425); }; - -## Ԧ -> Һ / х -0x0527 { - if (DoSimpleCyr) - EmitSimpleCyr(0x445); - else - Emit(0x04bb); -}; - -0x0526 { - if (DoSimpleCyr) - EmitSimpleCyr(0x420); - else - EmitUpper(0x04ba); -}; - -## є -> е -0x454 { EmitSimpleCyr(0x435); }; -0x404 { EmitSimpleCyr(0x415); }; - -## э -> е -0x44d { - if (Is(LANG_BEL)) - Emit(0x435); - else - Emit(0x44d); -}; -0x42d { - if (Is(LANG_BEL)) - EmitUpper(0x415); - else - EmitUpper(0x42d); -}; - -## ун -> вн -0x443 dia* 0x43d { - if (Is(LANG_BEL) || Is(LANG_UKR)) - Emit(0x432); - else - Emit(0x443); - - Emit(0x43d, te - ts - 1); -}; -0x423 dia* 0x43d { - if (Is(LANG_BEL) || Is(LANG_UKR)) - EmitUpper(0x412); - else - EmitUpper(0x423); - - Emit(0x43d, te - ts - 1); -}; - -## сьн -> сн -(0x441 | 'c') 0x44c 0x43d { - if (DoRenyxa) - Emit('c'); - else - Emit(0x441); - if (!Is(LANG_BEL)) - Emit(0x44c); - Emit(0x43d, te - ts - 1); -}; -(0x421 | 'C') 0x44c 0x43d { - if (DoRenyxa) - EmitUpper('C'); - else - EmitUpper(0x421); - if (!Is(LANG_BEL)) - Emit(0x44c); - Emit(0x43d, te - ts - 1); -}; - -############################ -## cyr diacritic fix -############################ - -## ҿ, ҽ -> е -0x04bf | 0x04bd { Emit(0x435); }; -0x04be | 0x04bc { EmitUpper(0x415); }; - -## ґ, ғ, ҕ, ӷ, ӻ -> г -0x491 | 0x493 | 0x0495 | 0x04f7 | 0x04fb { Emit(0x433); }; -0x490 | 0x492 | 0x0494 | 0x04f6 | 0x04fa { EmitUpper(0x413); }; - -## җ -> ж -0x497 { Emit(0x436); }; -0x496 { EmitUpper(0x416); }; - -## ҙ -> з -0x0499 { Emit(0x0437); }; -0x0498 { Emit(0x0417); }; - -## ӣ, й, ӥ, ҋ, ї -> й -0x438 (0x304 | 0x306 | 0x308) | 0x456 0x308 | 0x048b { Emit(0x439); }; -0x418 (0x304 | 0x306 | 0x308) | 0x406 0x308 | 0x048a { EmitUpper(0x419); }; - -## қ, ҝ, ҟ, ҡ, ӄ, ԟ -> к -0x49B | 0x049d | 0x049f | 0x04a1 | 0x04c4 | 0x051f { Emit(0x43A); }; -0x49A | 0x049c | 0x049e | 0x04a0 | 0x04c3 | 0x051e { EmitUpper(0x41A); }; - -## ӆ, ԓ, ԡ -> л -0x04c6 | 0x0513 | 0x0521 { Emit(0x043b); }; -0x04c5 | 0x0512 | 0x0520 { EmitUpper(0x041b); }; - -## ӎ -> м -0x04ce { Emit(0x043c); }; -0x04cd { EmitUpper(0x041c); }; - -## ң, ӈ, ӊ, ԣ -> н -0x4A3 | 0x04c8 | 0x04ca | 0x0523 { Emit(0x43D); }; -0x4A2 | 0x04c7 | 0x04c9 | 0x0522 { EmitUpper(0x41D); }; - -## ө -> о -0x4E9 { Emit(0x43E); }; -0x4E8 { EmitUpper(0x41E); }; - -## ҧ, ԥ -> п -0x04a7 | 0x0525 { Emit(0x043f); }; -0x04a6 | 0x0524 { EmitUpper(0x041f); }; - -## ҏ -0x048f { Emit(0x0440); }; -0x048e { EmitUpper(0x0420); }; - -## ҫ -0x04ab { Emit(0x0441); }; -0x04aa { EmitUpper(0x0421); }; - -## ҭ -0x04ad { Emit(0x0442); }; -0x04ac { EmitUpper(0x0422); }; - -## ұ, ү -> у -0x4B1 | 0x4AF { Emit(0x443); }; -0x4B0 | 0x4AE { EmitUpper(0x423); }; - -## ҳ, ӽ, ӿ -> х -0x04b3 | 0x04fd | 0x04ff { Emit(0x0445); }; -0x04b2 | 0x04fc | 0x04fe { EmitUpper(0x0425); }; - -## ҷ, ҹ, ӌ -> ч -0x04b7 | 0x04b9 | 0x04cc { Emit(0x0447); }; -0x04b6 | 0x04b8 | 0x04cb { EmitUpper(0x0427); }; - -## ҍ -> ь -0x048d { Emit(0x044c); }; -0x048c { EmitUpper(0x042c); }; - -############################ -## lat diacritic fix -############################ - -## Ⱥ, ɑ -0x0251 { Emit('a'); }; -0x023a { EmitUpper('A'); }; - -## ƀ, ƃ, ɓ -0x0180 | 0x183 | 0x0253 { Emit('b'); }; -0x0181 | 0x182 | 0x0243 { EmitUpper('B'); }; - -## ƈ, ȼ, ɕ -0x188 | 0x023c | 0x0255 { Emit('c'); }; -0x187 | 0x023b { EmitUpper('C'); }; - -## ð, đ, ƌ, Ɖ, Ɗ, ȡ, ɖ, ɗ -0x00f0 | 0x0111 | 0x18c | 0x221 | 0x256 | 0x257 { Emit('d'); }; -0x00d0 | 0x0110 | 0x189 | 0x18a | 0x18b { EmitUpper('D'); }; - -## ɛ, ɇ, ə, ɚ -0x0259 | 0x025a | 0x25b | 0x0247 { Emit('e'); }; -0x0190 | 0x0246 { EmitUpper('E'); }; - -## ƒ -0x192 { Emit('f'); }; -0x191 { EmitUpper('F'); }; - -## ǥ, ɠ -0x01e5 | 0x0260 | 0x0261 { Emit('g'); }; -0x01e4 | 0x0193 { EmitUpper('G'); }; - -## ħ, ɦ, ꜧ, ɧ -0x0127 | 0x0266 | 0x0267 | 0xa727 { Emit('h'); }; -0x0126 { EmitUpper('H'); }; - -## ɨ -0x0268 { Emit('i'); }; -0x0197 { EmitUpper('I'); }; - -## ɉ, ȷ -0x0249 | 0x0237 | 0x025f | 0x0284 | 0x029d { Emit('j'); }; -0x0248 { EmitUpper('J'); }; - -## ƙ -0x199 { Emit('k'); }; -0x198 { EmitUpper('K'); }; - -## ł, ƚ, ɫ, ɬ, ɭ, ȴ -0x0142 | 0x019a | 0x234 | 0x026b | 0x026c | 0x026d { Emit('l'); }; -0x0141 | 0x023d { EmitUpper('L'); }; - -## ɱ -0x0271 { Emit('m'); }; - -## ƞ, ȵ, ɲ, ɳ -0x019e | 0x0220 | 0x0272 | 0x0273 { Emit('n'); }; -0x019d | 0x0235 { EmitUpper('N'); }; - -## ɵ -0x0275 { Emit('o'); }; -0x019f { EmitUpper('O'); }; - -## ƥ -0x01a5 { Emit('p'); }; -0x01a4 { EmitUpper('P'); }; - -## ɋ, ʠ -0x024b | 0x02a0 { Emit('q'); }; - -## ɍ, ɼ, ɽ, ɾ -0x024d | 0x027c | 0x027d | 0x027e { Emit('r'); }; -0x024c { EmitUpper('R'); }; - -## ȿ, ʂ, ʆ, ʃ -0x023f | 0x0282 | 0x0283 | 0x0286 { Emit('s'); }; - -## ŧ, ƫ, ƭ, ȶ, ʈ, Ⱦ -0x0167 | 0x01ab | 0x01ad | 0x0236 | 0x0288 { Emit('t'); }; -0x0166 | 0x01ac | 0x01ae | 0x023e { EmitUpper('T'); }; - -## ʉ -0x0289 { Emit('u'); }; -0x0244 { EmitUpper('U'); }; - -## ʋ -0x028b { Emit('v'); }; -0x01b2 { EmitUpper('V'); }; - -## ƴ, ɏ -0x01b4 | 0x024f { Emit('y'); }; -0x01b3 | 0x024e { EmitUpper('Y'); }; - -## ƶ, ɀ, ʐ, ʓ, ʒ -0x01ba | 0x01b6 | 0x225 | 0x0240 | 0x0290 | 0x0291 | 0x0292 | 0x0293 { Emit('z'); }; -0x01b5 | 0x224 { EmitUpper('Z'); }; - -## İ -'I' 0x307 { - if (DoRenyxa) { - EmitUpper('I'); - } else if (Is(LANG_TUR)) { - if (DoLowerCase) - Emit(0x131); - else - Emit(0x130); - } else { - EmitUpper('I'); - } -}; - -## ı -0x131 { - if (Is(LANG_TUR)) - EmitRenyxa('i'); - else - Emit('i'); -}; - -## ø -0xf8 { - Emit('o'); - if (Is(LANG_NOR)) - Emit('e', 1); -}; - -0xd8 { - EmitUpper('O'); - if (Is(LANG_NOR)) - Emit('e', 1); -}; - -## å -'a' 0x30A { - Emit('a'); - if (Is(LANG_DAN) || Is(LANG_NOR)) - Emit('a', 1); -}; - -'A' 0x30A { - EmitUpper('A'); - if (Is(LANG_DAN) || Is(LANG_NOR)) - Emit('a', 1); -}; - -## ä -'a' 0x308 { - Emit('a'); - if (Is(LANG_GER) || Is(LANG_HUN) || Is(LANG_NOR)) - Emit('e', 1); -}; - -'A' 0x308 { - EmitUpper('A'); - if (Is(LANG_GER) || Is(LANG_HUN) || Is(LANG_NOR)) - Emit('e', 1); -}; - -## ö -'o' 0x308 { - Emit('o'); - if (Is(LANG_GER) || Is(LANG_HUN) || Is(LANG_NOR)) - Emit('e', 1); -}; - -'O' 0x308 { - EmitUpper('O'); - if (Is(LANG_GER) || Is(LANG_HUN) || Is(LANG_NOR)) - Emit('e', 1); -}; - -## ü -'u' 0x308 { - Emit('u'); - if (Is(LANG_GER) || Is(LANG_HUN) || Is(LANG_NOR)) - Emit('e', 1); -}; - -'U' 0x308 { - EmitUpper('U'); - if (Is(LANG_GER) || Is(LANG_HUN) || Is(LANG_NOR)) - Emit('e', 1); -}; - -############################ -## cyr ligature fix -############################ - -## ӕ -0x4D5 { Emit(0x430); Emit(0x435); }; -0x4D4 { EmitUpper(0x410); EmitUpper(0x415); }; - -## ԕ -0x0515 { Emit(0x043b); Emit(0x0445); }; -0x0514 { EmitUpper(0x041b); EmitUpper(0x0425); }; - -## љ -0x0459 { Emit(0x043b); Emit(0x044c); }; -0x0409 { EmitUpper(0x041b); EmitUpper(0x042c); }; - -## ҥ -0x04a5 { Emit(0x043d); Emit(0x0433); }; -0x04a4 { EmitUpper(0x041d); EmitUpper(0x0413); }; - -## њ -0x045a { Emit(0x043d); Emit(0x044c); }; -0x040a { EmitUpper(0x041d); EmitUpper(0x042c); }; - -## ԗ -0x0517 { Emit(0x0440); Emit(0x0445); }; -0x0516 { EmitUpper(0x0420); EmitUpper(0x0425); }; - -## ҵ -0x04b5 { Emit(0x0442); Emit(0x0446); }; -0x04b4 { EmitUpper(0x0422); EmitUpper(0x0426); }; - -## ԙ -0x0519 { Emit(0x044f); Emit(0x0435); }; -0x0518 { EmitUpper(0x042f); EmitUpper(0x0415); }; - -## ѹ -0x0478 { EmitUpper(0x41e); Emit(0x443); }; -0x0479 { Emit(0x43e); Emit(0x443); }; - -############################ -## lat ligature fix -############################ - -## ꜳ -0xa733 { Emit('a'); Emit('a'); }; -0xa732 { EmitUpper('A'); EmitUpper('A'); }; - -## æ -0xe6 { Emit('a'); Emit('e'); }; -0xc6 { EmitUpper('A'); EmitUpper('E'); }; - -## ꜵ -0xa735 { Emit('a'); Emit('o'); }; -0xa734 { EmitUpper('A'); EmitUpper('O'); }; - -## ꜷ -0xa737 { Emit('a'); Emit('u'); }; -0xa736 { EmitUpper('A'); EmitUpper('U'); }; - -## ꜹ, ꜻ -0xa739 | 0xa73b { Emit('a'); Emit('v'); }; -0xa738 | 0xa73a { EmitUpper('A'); EmitUpper('V'); }; - -## ꜽ -0xa73d { Emit('a'); Emit('y'); }; -0xa73c { EmitUpper('A'); EmitUpper('Y'); }; - -## ȸ -0x238 { Emit('d'); Emit('b'); }; - -## dz, dž, ʤ, ʥ -0x1F1 | 0x1C4 { EmitUpper('D'); EmitUpper('Z'); }; -0x1F2 | 0x1C5 { EmitUpper('D'); Emit('z'); }; -0x1F3 | 0x1C6 | 0x02a4 | 0x2a5 { Emit('d'); Emit('z'); }; - -## ff -0xfb00 { Emit('f'); Emit('f'); }; - -## fi -0xfb01 { Emit('f'); Emit('i'); }; - -## fl -0xfb02 { Emit('f'); Emit('l'); }; - -## ʩ -0x02a9 { Emit('f'); Emit('n'); Emit('g'); }; - -## ƕ -0x0195 { Emit('h'); Emit('v'); }; - -## ij -0x133 { Emit('i'); Emit('j'); }; -0x132 { Emit('I'); Emit('J'); }; - -## lj -0x1C7 { EmitUpper('L'); EmitUpper('J'); }; -0x1C8 { EmitUpper('L'); Emit('j'); }; -0x1C9 { Emit('l'); Emit('j'); }; - -## ʪ -0x02aa { Emit('l'); Emit('s'); }; - -## ʫ, ɮ -0x02ab | 0x026e { Emit('l'); Emit('z'); }; - -## nj -0x1CA { EmitUpper('N'); EmitUpper('J'); }; -0x1CB { EmitUpper('N'); Emit('j'); }; -0x1CC { Emit('n'); Emit('j'); }; - -## ŋ -0x14b { Emit('n'); Emit('g'); }; -0x14a { EmitUpper('N'); EmitUpper('G'); }; - -## œ -0x153 { Emit('o'); Emit('e'); }; -0x152 { EmitUpper('O'); EmitUpper('E'); }; - -## ƣ -0x1a3 { Emit('o'); Emit('i'); }; -0x1a2 { EmitUpper('O'); EmitUpper('I'); }; - -## ꝏ -0xa74f { Emit('o'); Emit('o'); }; -0xa74e { EmitUpper('O'); EmitUpper('O'); }; - -## ȹ -0x239 { Emit('q'); Emit('p'); }; - -## ß -0xdf { Emit('s'); Emit('s'); }; -0x1e9e { EmitUpper('S'); EmitUpper('S'); }; - -## st -0xfb06 { Emit('s'); Emit('t'); }; - -## ʦ, ʧ -0x02a6 | 0x02a7 { Emit('t'); Emit('s'); }; - -## ᵫ -0x1d6b { Emit('u'); Emit('e'); }; - -## ffi -0xfb03 { Emit('f'); Emit('f'); Emit('i'); }; - -## ffl -0xfb04 { Emit('f'); Emit('f'); Emit('l'); }; - -## ʨ -0x2a8 { Emit('t'); Emit('c'); }; - -## ᵺ -0x1d7a { Emit('t'); Emit('h'); }; - -############################ -## other symbols -############################ -# todo: check which letters need disambiguating accents and rewrite this - -## w -'w' { - if (Is(LANG_SWE)) - Emit('v'); - else - Emit('w'); -}; - -'W' { - if (Is(LANG_SWE)) - EmitUpper('V'); - else - EmitUpper('W'); -}; - -## disambiguating acute accent -## 0x301 { -## // if (Is(LANG_DAN) || Is(LANG_NOR) || Is(LANG_SPA) || Is(LANG_GRE)) -## // Emit(0x301); -## }; - -## disambiguating grave accent -## 0x300 { -## // if (Is(LANG_FRE) || Is(LANG_ITA) || Is(LANG_NOR) || Is(LANG_RUM) || Is(LANG_CAT)) -## // Emit(0x300); -## }; - -## disambiguating circumflex accent -## 0x302 { -## // if (Is(LANG_NOR)) -## // Emit(0x302); -## }; - -## single quotes and apostrophes -0x2b9 | 0x2bb | 0x2bc | 0x2c8 | 0x55A | 0x2018 | 0x2019 | 0x201b | 0x2032 | '`' { - Emit('\''); -}; - -## slashes ⁄ ∕ -0x2044 | 0x2215 { - Emit('/'); -}; - -## left chevrons -0xab | 0x226a | 0x300a { - Emit('<'); Emit('<'); -}; - -## right chevrons -0xbb | 0x226b | 0x300b { - Emit('>'); Emit('>'); -}; - -## left angles -0x3c | 0x2039 | 0x2329 | 0x27e8 | 0x3008 { - Emit('<'); -}; - -## right angles -0x3e | 0x203a | 0x232a | 0x27e9 | 0x3009 { - Emit('>'); -}; - -## other symbols -any { - if(IsNothing()) { - // nothing - } else if (IsSpace()) { - if (' ' != Last()) - Emit(' '); - } else if (IsDash()) { - Emit('-'); - } else { - EmitUpper(*ts); - } -}; - -*|; - -}%% -#endif - -%% write data noerror nofinal; -%% write init; -%% write exec; - -Y_UNUSED(Normalizer_en_main); - TmpBuf.swap(OutBuf); - OutBuf.clear(); - Recomposer.Normalize(TmpBuf.data(), TmpBuf.size(), OutBuf); -} - -} |