aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/unicode/normalization/normalization.cpp
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/unicode/normalization/normalization.cpp
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/unicode/normalization/normalization.cpp')
-rw-r--r--library/cpp/unicode/normalization/normalization.cpp66
1 files changed, 66 insertions, 0 deletions
diff --git a/library/cpp/unicode/normalization/normalization.cpp b/library/cpp/unicode/normalization/normalization.cpp
new file mode 100644
index 0000000000..7da7211514
--- /dev/null
+++ b/library/cpp/unicode/normalization/normalization.cpp
@@ -0,0 +1,66 @@
+#include "normalization.h"
+
+static constexpr wchar32 S_BASE = 0xAC00; // code point of the first composed syllable
+static constexpr wchar32 L_BASE = 0x1100; // origin for leading-consonant (L) offsets
+static constexpr wchar32 V_BASE = 0x1161; // origin for vowel (V) offsets
+static constexpr wchar32 T_BASE = 0x11A7; // origin for trailing-consonant (T) offsets; offset 0 is never composed
+static constexpr int L_COUNT = 19; // number of valid L offsets
+static constexpr int V_COUNT = 21; // number of valid V offsets
+static constexpr int T_COUNT = 28; // code points per LV syllable run (slot 0 is the bare LV syllable)
+static constexpr int N_COUNT = V_COUNT * T_COUNT; // 588 syllables per leading consonant
+static constexpr int S_COUNT = L_COUNT * N_COUNT; // 11172 composed syllables in total
+
+// Pairwise Hangul composition: L + V -> LV syllable, LV + T -> LVT syllable.
+// Returns the composed syllable, or 0 when (lead, tail) is neither kind of pair.
+static inline wchar32 ComposeHangul(wchar32 lead, wchar32 tail) {
+    const int leadOffset = lead - L_BASE;
+    const int vowelOffset = tail - V_BASE;
+    const bool isLeadingJamo = leadOffset >= 0 && leadOffset < L_COUNT;
+    const bool isVowelJamo = vowelOffset >= 0 && vowelOffset < V_COUNT;
+    if (isLeadingJamo && isVowelJamo) {
+        // L + V: index the LV syllable; each LV starts a run of T_COUNT code points.
+        return static_cast<wchar32>(S_BASE + (leadOffset * V_COUNT + vowelOffset) * T_COUNT);
+    }
+
+    const int syllableOffset = lead - S_BASE;
+    const bool inSyllableRange = syllableOffset >= 0 && syllableOffset < S_COUNT;
+    if (!inSyllableRange || syllableOffset % T_COUNT != 0) {
+        return 0; // lead is not an LV syllable, so no trailing consonant can be attached
+    }
+
+    const int trailOffset = tail - T_BASE;
+    if (trailOffset <= 0 || trailOffset >= T_COUNT) {
+        return 0; // offset 0 (T_BASE itself) is deliberately rejected
+    }
+    // LV + T: the LVT syllable sits trailOffset code points after its LV syllable.
+    return lead + trailOffset;
+}
+
+// Builds the (lead, tail) -> composite lookup stored in Data.
+NUnicode::NPrivate::TComposition::TComposition() {
+    // Pass 1: copy pairs from RawData, keeping only those whose Lead has a zero
+    // DecompositionCombining() value.
+    for (size_t idx = 0; idx != RawDataSize; ++idx) {
+        const TRawData& entry = RawData[idx];
+        if (DecompositionCombining(entry.Lead) == 0) {
+            Data[TKey(entry.Lead, entry.Tail)] = entry.Comp;
+        }
+    }
+
+    // Pass 2: for each code point in [S_BASE, S_BASE + S_COUNT), i.e. 0xAC00..0xD7A3,
+    // fold its decomposition left to right and record every intermediate pair.
+    for (wchar32 syllable = S_BASE; syllable != S_BASE + S_COUNT; ++syllable) {
+        const wchar32* jamo = NUnicode::Decomposition<true>(syllable);
+        if (jamo == nullptr) {
+            continue; // no decomposition returned for this code point
+        }
+
+        wchar32 composed = *jamo++;
+        for (; *jamo != 0; ++jamo) {
+            const wchar32 merged = ComposeHangul(composed, *jamo);
+            Y_ASSERT(merged != 0);
+            Data[TKey(composed, *jamo)] = merged;
+            composed = merged;
+        }
+    }
+}