aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/unicode/normalization/custom_encoder.cpp
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/unicode/normalization/custom_encoder.cpp
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/unicode/normalization/custom_encoder.cpp')
-rw-r--r--library/cpp/unicode/normalization/custom_encoder.cpp83
1 files changed, 83 insertions, 0 deletions
diff --git a/library/cpp/unicode/normalization/custom_encoder.cpp b/library/cpp/unicode/normalization/custom_encoder.cpp
new file mode 100644
index 0000000000..c6f186405f
--- /dev/null
+++ b/library/cpp/unicode/normalization/custom_encoder.cpp
@@ -0,0 +1,83 @@
+#include "custom_encoder.h"
+#include "normalization.h"
+
+#include <util/string/cast.h>
+#include <util/stream/output.h>
+
+// Records that Unicode code point `ucode` is encoded as the byte `code`.
+//
+// The table is addressed by the high byte ("plane") and the low byte ("pos")
+// of `ucode`; both are obtained with truncating casts to unsigned char.
+// A plane that still points at the shared DefaultPlane receives its own
+// zero-filled 256-byte array before anything is written into it.
+//
+// A slot that already holds a non-zero code is never overwritten. In that
+// case a warning is written to Cerr when the stored code is above 127 and
+// `code` is non-zero. `target` is only used to name the code page in that
+// warning.
+void TCustomEncoder::addToTable(wchar32 ucode, unsigned char code, const CodePage* target) {
+    const unsigned char plane = static_cast<unsigned char>(ucode >> 8);
+    const unsigned char pos = static_cast<unsigned char>(ucode & 255);
+
+    // Give the plane private storage the first time it is written to, so the
+    // shared DefaultPlane itself is never modified here.
+    if (Table[plane] == DefaultPlane) {
+        Table[plane] = new char[256](); // value-initialised: all zeros
+    }
+
+    auto& slot = Table[plane][pos];
+    if (slot == 0) {
+        // Free slot: store the mapping and finish.
+        slot = code;
+        return;
+    }
+
+    // Occupied slot: keep the existing mapping and possibly report the clash.
+    Y_ASSERT(target && *target->Names);
+    const unsigned char existing = static_cast<unsigned char>(slot);
+    if (existing > 127 && code) {
+        Cerr << "WARNING: Only lower part of ASCII should have duplicate encodings "
+             << target->Names[0]
+             << " " << IntToString<16>(ucode)
+             << " " << IntToString<16>(code)
+             << " " << IntToString<16>(existing)
+             << Endl;
+    }
+}
+
+// Tells whether `decomp` is an acceptable substitute for `rune`.
+//
+// Returns true when the two code points have equal NUnicode::NPrivate::CharInfo
+// values, or when both are alphabetic, both are numeric, or both are quotation
+// characters (checked in that order); returns false otherwise.
+bool isGoodDecomp(wchar32 rune, wchar32 decomp) {
+    if (NUnicode::NPrivate::CharInfo(rune) == NUnicode::NPrivate::CharInfo(decomp)) {
+        return true;
+    }
+    if (IsAlpha(rune) && IsAlpha(decomp)) {
+        return true;
+    }
+    if (IsNumeric(rune) && IsNumeric(decomp)) {
+        return true;
+    }
+    if (IsQuotation(rune) && IsQuotation(decomp)) {
+        return true;
+    }
+    return false;
+}
+
+// Builds the encoding table for the code page `target`.
+//
+// Step 1: allocate a zero-filled DefaultPlane and point all 256 planes at it.
+// Step 2: for each byte value i in [0, 256), register target->unicode[i] -> i
+//         through addToTable, skipping entries equal to BROKEN_RUNE.
+// Step 3 (only when `extended` is true): for every code point w in
+//         [1, 65535) that still has no code, follow its decomposition chain
+//         until a character is reached that is not composed or already has
+//         a code. At each step the first decomposition character is taken,
+//         unless the decomposition has more than one character and starts
+//         with ' ' or '(', in which case the second one is taken. If the
+//         final character has a code and isGoodDecomp(w, dw) accepts the
+//         pair, w is registered with that code.
+//
+// `target` must not be null (checked with Y_ASSERT).
+void TCustomEncoder::Create(const CodePage* target, bool extended) {
+    Y_ASSERT(target);
+
+    DefaultChar = (const char*)target->DefaultChar;
+
+    DefaultPlane = new char[256];
+    memset(DefaultPlane, 0, 256 * sizeof(char));
+    for (size_t i = 0; i != 256; ++i) {
+        Table[i] = DefaultPlane;
+    }
+
+    // Direct mappings taken from the code page itself.
+    for (size_t i = 0; i != 256; ++i) {
+        const wchar32 ucode = target->unicode[i];
+        if (ucode != BROKEN_RUNE) { // entries equal to BROKEN_RUNE are not registered
+            addToTable(ucode, (unsigned char)i, target);
+        }
+    }
+
+    if (!extended) {
+        return;
+    }
+
+    // Approximate mappings derived from decompositions. Note that the upper
+    // bound is exclusive, so 65535 itself is not visited.
+    for (wchar32 w = 1; w < 65535; w++) {
+        if (Code(w) != 0) {
+            continue; // already encodable
+        }
+
+        wchar32 dw = w;
+        while (IsComposed(dw) && Code(dw) == 0) {
+            const wchar32* decomp_p = NUnicode::Decomposition<true>(dw);
+            Y_ASSERT(decomp_p != nullptr);
+            // Y_ASSERT may be compiled out in non-debug builds; stop following
+            // the chain instead of dereferencing a null pointer. `dw` then
+            // still has no code, so nothing is registered for `w`.
+            if (decomp_p == nullptr) {
+                break;
+            }
+
+            dw = decomp_p[0];
+            // Skip a leading space or opening parenthesis when something follows it.
+            if (std::char_traits<wchar32>::length(decomp_p) > 1 && (dw == (wchar32)' ' || dw == (wchar32)'(')) {
+                dw = decomp_p[1];
+            }
+        }
+
+        if (Code(dw) != 0 && isGoodDecomp(w, dw)) {
+            addToTable(w, Code(dw), target);
+        }
+    }
+}
+
+// Releases the table storage: every plane that does not point at the shared
+// DefaultPlane is freed individually, then DefaultPlane itself is freed once.
+TCustomEncoder::~TCustomEncoder() {
+    for (size_t plane = 0; plane != 256; ++plane) {
+        if (Table[plane] == DefaultPlane) {
+            continue; // shared storage, released once below
+        }
+        delete[] Table[plane];
+    }
+    delete[] DefaultPlane;
+}