aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/charset/decodeunknownplane.cpp
diff options
context:
space:
mode:
authorshokhor <shokhor@yandex-team.com>2023-09-19 05:45:26 +0300
committershokhor <shokhor@yandex-team.com>2023-09-19 06:02:49 +0300
commitae87be2b19d1d4e6a5c7bad6650381690b5dfe9d (patch)
treee3dd23427bd7e3eda2785c33de59cf1ac9f07e1d /library/cpp/charset/decodeunknownplane.cpp
parent1df0281596525a87ffe350f2e94b91e10fa46bca (diff)
downloadydb-ae87be2b19d1d4e6a5c7bad6650381690b5dfe9d.tar.gz
Introduce library/cpp/charset/lite which excludes functionality with dependency on libiconv
Introduce library/cpp/charset/lite which excludes functionality with dependency on libiconv. Задача: выпилить зависимость от libiconv в библиотеке клавиатуры для Android. libiconv приходит из двух мест: dict/dictutil и library/cpp/charset. В обоих случаях libiconv используется в вариантах CharToWide/WideToChar для перекодировки. Первый кейс (dictutil): клавиатура для релиза строится с ключом DICTUTIL_WITHOUT_ICONV_AND_UNICODE, который исключает зависимость; следующий шаг — сделать флаг по умолчанию для мобильных билдов. Второй кейс (library/cpp/charset): шаг 1 — выделяем библиотеку library/cpp/charset/lite без части функциональности, зависящей от libiconv (этот ПР); шаг 2 — переводим библиотеки, используемые клавиатурой, на lite-версию (следующий ПР).
Diffstat (limited to 'library/cpp/charset/decodeunknownplane.cpp')
-rw-r--r--library/cpp/charset/decodeunknownplane.cpp121
1 files changed, 121 insertions, 0 deletions
diff --git a/library/cpp/charset/decodeunknownplane.cpp b/library/cpp/charset/decodeunknownplane.cpp
new file mode 100644
index 0000000000..201a25cd54
--- /dev/null
+++ b/library/cpp/charset/decodeunknownplane.cpp
@@ -0,0 +1,121 @@
+#include "ci_string.h"
+#include "codepage.h"
+#include "recyr.hh"
+
+#include <util/system/hi_lo.h>
+#include <util/generic/vector.h>
+
+// Reads one rune starting at `s` from a buffer in which raw UTF-8 bytes are
+// stored one per code unit as 0xF0xx (the byte lives in the low 8 bits).
+//
+//   rune     - receives the decoded value; written only when RECODE_OK is
+//              returned (it may alias the input buffer, so it is assigned
+//              only after all input units have been read).
+//   rune_len - receives the number of input units consumed; set to 0 on any
+//              non-OK return.
+//   s, end   - current position and one-past-the-end of the input.
+//
+// Returns RECODE_OK on success, RECODE_EOINPUT when the sequence announced by
+// the lead byte does not fit before `end`, and RECODE_BROKENSYMBOL when the
+// lead byte is invalid (length 0) or a trailing unit is not 0xF0 + a
+// 10xxxxxx continuation byte.
+template <typename TxChar>
+static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) {
+    // A unit whose bits 8..15 are not 0xF0 is not an escaped byte: pass it through.
+    const bool escapedByte = (*s & 0xFF00) == 0xF000;
+    if (!escapedByte) {
+        rune_len = 1;
+        rune = *s;
+        return RECODE_OK;
+    }
+
+    rune_len = 0;
+
+    const size_t seqLen = UTF8RuneLen((unsigned char)(*s));
+    if (s + seqLen > end) {
+        return RECODE_EOINPUT; // sequence runs past the end of input
+    }
+    if (seqLen == 0) {
+        return RECODE_BROKENSYMBOL; // invalid lead byte
+    }
+
+    wchar32 decoded = (ui8)(s[0]); // single-byte case: [00000000 0XXXXXXX]
+    if (seqLen > 1) {
+        // Strip the length marker bits from the lead byte.
+        decoded &= UTF8LeadByteMask(seqLen);
+    }
+
+    // At most three trailing units are examined (a 4-unit sequence), no matter
+    // what length the lead byte announced.
+    const size_t maxUnits = 4;
+    const size_t unitsToRead = seqLen < maxUnits ? seqLen : maxUnits;
+    for (size_t i = 1; i < unitsToRead; ++i) {
+        const wchar32 unit = s[i];
+        // Every trailing unit must be 0xF0 in the high byte plus 10xxxxxx.
+        if ((unit & 0xFFC0) != 0xF080) {
+            return RECODE_BROKENSYMBOL;
+        }
+        // Append the six payload bits.
+        decoded = (decoded << 6) | (unit & 0x3F);
+    }
+
+    rune_len = seqLen;
+    if (decoded > Max<TxChar>()) {
+        rune = ' '; // value does not fit into TxChar; maybe put sequence
+    } else {
+        rune = TxChar(decoded);
+    }
+    return RECODE_OK;
+}
+
+// Decodes the raw bytes accumulated in `buf` from charset `enc` and writes the
+// result at `d`; `e` is the end of the destination buffer. When decoding
+// fails the bytes are copied to the output as-is. `buf` is cleared afterwards.
+// Returns the advanced write position (unchanged when `buf` is empty).
+template <typename TxChar>
+static inline TxChar* FlushUnknownPlaneRun(const ECharset enc, TVector<char>& buf, TxChar* d, TxChar* e) {
+    if (buf.empty()) {
+        return d;
+    }
+
+    size_t read = 0;
+    size_t written = 0;
+    if (RecodeToUnicode(enc, buf.data(), d, buf.size(), e - d, read, written) == RECODE_OK) {
+        Y_ASSERT(read == buf.size());
+        d += written;
+    } else { // just copying broken symbols
+        // NOTE(review): char -> TxChar conversion; bytes >= 0x80 may sign-extend
+        // where `char` is signed -- confirm this is the intended fallback.
+        Y_ASSERT(buf.size() <= static_cast<size_t>(e - d));
+        Copy(buf.data(), buf.size(), d);
+        d += buf.size();
+    }
+    buf.clear();
+    return d;
+}
+
+// Decodes, in place, the code units of [str, ee) that carry a raw byte of
+// charset `enc`: units whose Hi8(Lo16(unit)) is 0xF0 hold the byte in
+// Lo8(Lo16(unit)). All other units are kept unchanged.
+//
+//   str - start of the buffer (modified in place).
+//   ee  - in: one-past-the-end of the input; out: one-past-the-end of the
+//         decoded text, which may be shorter than the input.
+//   enc - charset the escaped bytes are encoded in.
+//
+// Per-charset behavior:
+//   * single-byte code pages: each escaped byte is mapped through the code
+//     page's `unicode` table; the length does not change;
+//   * CODES_UTF8: escaped byte sequences are decoded as UTF-8; a unit that
+//     cannot be decoded is replaced with BROKEN_RUNE;
+//   * CODES_UNKNOWN: the 0xF0 marker is stripped, leaving the byte value;
+//   * any other (multi-byte) charset: each maximal run of escaped bytes is
+//     recoded with RecodeToUnicode, or copied as-is if recoding fails.
+template <typename TxChar>
+void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
+    TxChar* e = ee;
+    if (SingleByteCodepage(enc)) {
+        const CodePage* cp = CodePageByCharset(enc);
+        for (TxChar* s = str; s < e; s++) {
+            if (Hi8(Lo16(*s)) == 0xF0)
+                *s = (TxChar)cp->unicode[Lo8(Lo16(*s))]; // NOT mb compliant
+        }
+    } else if (enc == CODES_UTF8) {
+        TxChar* s;
+        TxChar* d;
+
+        // `d` (write) never overtakes `s` (read): every decoded rune consumes
+        // at least one input unit.
+        for (s = d = str; s < e;) {
+            size_t l = 0;
+
+            if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) {
+                d++, s += l;
+            } else {
+                *d++ = BROKEN_RUNE;
+                ++s;
+            }
+        }
+        e = d;
+    } else if (enc == CODES_UNKNOWN) {
+        for (TxChar* s = str; s < e; s++) {
+            if (Hi8(Lo16(*s)) == 0xF0)
+                *s = Lo8(Lo16(*s));
+        }
+    } else {
+        Y_ASSERT(!SingleByteCodepage(enc));
+
+        TxChar* d = str;
+        TVector<char> buf; // bytes of the current run of escaped units
+
+        for (TxChar* s = str; s < e; ++s) {
+            if (Hi8(Lo16(*s)) == 0xF0) {
+                buf.push_back(Lo8(Lo16(*s)));
+            } else {
+                // An ordinary unit ends the pending run: decode it first.
+                d = FlushUnknownPlaneRun(enc, buf, d, e);
+                *d++ = *s;
+            }
+        }
+        // A run that reaches the end of the input has no terminating ordinary
+        // unit, so it must be flushed here or it would be left undecoded.
+        d = FlushUnknownPlaneRun(enc, buf, d, e);
+        // Report the compacted length, as the UTF-8 branch does; otherwise
+        // stale units past `d` would remain part of the result.
+        e = d;
+    }
+    ee = e;
+}
+
+// wchar16 entry point: forwards to the DoDecodeUnknownPlane template, which
+// works on [str, ee) in place and stores the resulting end pointer in `ee`.
+void DecodeUnknownPlane(wchar16* str, wchar16*& ee, const ECharset enc) {
+    DoDecodeUnknownPlane<wchar16>(str, ee, enc);
+}
+// wchar32 entry point: forwards to the DoDecodeUnknownPlane template, which
+// works on [str, ee) in place and stores the resulting end pointer in `ee`.
+void DecodeUnknownPlane(wchar32* str, wchar32*& ee, const ECharset enc) {
+    DoDecodeUnknownPlane<wchar32>(str, ee, enc);
+}