diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/unicode/normalization/custom_encoder.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/unicode/normalization/custom_encoder.cpp')
-rw-r--r-- | library/cpp/unicode/normalization/custom_encoder.cpp | 83 |
1 files changed, 83 insertions, 0 deletions
diff --git a/library/cpp/unicode/normalization/custom_encoder.cpp b/library/cpp/unicode/normalization/custom_encoder.cpp new file mode 100644 index 0000000000..c6f186405f --- /dev/null +++ b/library/cpp/unicode/normalization/custom_encoder.cpp @@ -0,0 +1,83 @@ +#include "custom_encoder.h" +#include "normalization.h" + +#include <util/string/cast.h> +#include <util/stream/output.h> + +void TCustomEncoder::addToTable(wchar32 ucode, unsigned char code, const CodePage* target) { + unsigned char plane = (unsigned char)(ucode >> 8); + unsigned char pos = (unsigned char)(ucode & 255); + if (Table[plane] == DefaultPlane) { + Table[plane] = new char[256]; + memset(Table[plane], 0, 256 * sizeof(char)); + } + + if (Table[plane][pos] == 0) { + Table[plane][pos] = code; + } else { + Y_ASSERT(target && *target->Names); + if (static_cast<unsigned char>(Table[plane][pos]) > 127 && code) { + Cerr << "WARNING: Only lower part of ASCII should have duplicate encodings " + << target->Names[0] + << " " << IntToString<16>(ucode) + << " " << IntToString<16>(code) + << " " << IntToString<16>(static_cast<unsigned char>(Table[plane][pos])) + << Endl; + } + } +} + +bool isGoodDecomp(wchar32 rune, wchar32 decomp) { + if ( + (NUnicode::NPrivate::CharInfo(rune) == NUnicode::NPrivate::CharInfo(decomp)) || (IsAlpha(rune) && IsAlpha(decomp)) || (IsNumeric(rune) && IsNumeric(decomp)) || (IsQuotation(rune) && IsQuotation(decomp))) + { + return true; + } + return false; +} + +void TCustomEncoder::Create(const CodePage* target, bool extended) { + Y_ASSERT(target); + + DefaultChar = (const char*)target->DefaultChar; + + DefaultPlane = new char[256]; + + memset(DefaultPlane, 0, 256 * sizeof(char)); + for (size_t i = 0; i != 256; ++i) + Table[i] = DefaultPlane; + + for (size_t i = 0; i != 256; ++i) { + wchar32 ucode = target->unicode[i]; + if (ucode != BROKEN_RUNE) // always UNASSIGNED + addToTable(ucode, (unsigned char)i, target); + } + + if (!extended) + return; + + for (wchar32 w = 1; w < 65535; w++) { + if (Code(w) == 0) { + wchar32 dw = w; + while (IsComposed(dw) && Code(dw) == 0) { + const wchar32* decomp_p = NUnicode::Decomposition<true>(dw); + Y_ASSERT(decomp_p != nullptr); + + dw = decomp_p[0]; + if (std::char_traits<wchar32>::length(decomp_p) > 1 && (dw == (wchar32)' ' || dw == (wchar32)'(')) + dw = decomp_p[1]; + } + if (Code(dw) != 0 && isGoodDecomp(w, dw)) + addToTable(w, Code(dw), target); + } + } +} + +TCustomEncoder::~TCustomEncoder() { + for (size_t i = 0; i != 256; ++i) { + if (Table[i] != DefaultPlane) { + delete[] Table[i]; + } + } + delete[] DefaultPlane; +} |