aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/charset/wide.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/charset/wide.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/charset/wide.h')
-rw-r--r--library/cpp/charset/wide.h306
1 files changed, 306 insertions, 0 deletions
diff --git a/library/cpp/charset/wide.h b/library/cpp/charset/wide.h
new file mode 100644
index 00000000000..32d30e849e9
--- /dev/null
+++ b/library/cpp/charset/wide.h
@@ -0,0 +1,306 @@
+#pragma once
+
+#include "codepage.h"
+#include "iconv.h"
+
+#include <util/charset/recode_result.h>
+#include <util/charset/unidata.h>
+#include <util/charset/utf8.h>
+#include <util/charset/wide.h>
+#include <util/generic/string.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/yexception.h>
+#include <util/memory/tempbuf.h>
+#include <util/system/yassert.h>
+
+//! converts text from unicode to yandex codepage
+//! @attention destination buffer must be long enough to fit all characters of the text
+//! @note @c dest buffer must fit at least @c len number of characters
+template <typename TCharType>
+inline size_t WideToChar(const TCharType* text, size_t len, char* dest, ECharset enc) {
+ Y_ASSERT(SingleByteCodepage(enc));
+
+ const char* start = dest;
+
+ const Encoder* const encoder = &EncoderByCharset(enc);
+ const TCharType* const last = text + len;
+ for (const TCharType* cur = text; cur != last; ++dest) {
+ *dest = encoder->Tr(ReadSymbolAndAdvance(cur, last));
+ }
+
+ return dest - start;
+}
+
+//! converts text to unicode using a codepage object
+//! @attention destination buffer must be long enough to fit all characters of the text
+//! @note @c dest buffer must fit at least @c len number of characters;
+//! if you need convert zero terminated string you should determine length of the
+//! string using the @c strlen function and pass as the @c len parameter;
+//! it does not make sense to create an additional version of this function because
+//! it will call to @c strlen anyway in order to allocate destination buffer
+template <typename TCharType>
+inline void CharToWide(const char* text, size_t len, TCharType* dest, const CodePage& cp) {
+ const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
+ const unsigned char* const last = cur + len;
+ for (; cur != last; ++cur, ++dest) {
+ *dest = static_cast<TCharType>(cp.unicode[*cur]); // static_cast is safe as no 1char codepage contains non-BMP symbols
+ }
+}
+
+namespace NDetail {
+ namespace NBaseOps {
+ // Template interface base recoding drivers, do not perform any memory management,
+ // do not care about buffer size, so supplied @dst
+ // should have enough room for the result (with proper reserve for the worst case)
+
+ // Depending on template params, perform conversion of single-byte/multi-byte/utf8 string to/from wide string.
+
+ template <typename TCharType>
+ inline TBasicStringBuf<TCharType> RecodeSingleByteChar(const TStringBuf src, TCharType* dst, const CodePage& cp) {
+ Y_ASSERT(cp.SingleByteCodepage());
+ ::CharToWide(src.data(), src.size(), dst, cp);
+ return TBasicStringBuf<TCharType>(dst, src.size());
+ }
+
+ template <typename TCharType>
+ inline TStringBuf RecodeSingleByteChar(const TBasicStringBuf<TCharType> src, char* dst, const CodePage& cp) {
+ Y_ASSERT(cp.SingleByteCodepage());
+ ::WideToChar(src.data(), src.size(), dst, cp.CPEnum);
+ return TStringBuf(dst, src.size());
+ }
+
+ template <typename TCharType>
+ inline TBasicStringBuf<TCharType> RecodeMultiByteChar(const TStringBuf src, TCharType* dst, ECharset encoding) {
+ Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
+ size_t read = 0;
+ size_t written = 0;
+ ::NICONVPrivate::RecodeToUnicode(encoding, src.data(), dst, src.size(), src.size(), read, written);
+ return TBasicStringBuf<TCharType>(dst, written);
+ }
+
+ template <typename TCharType>
+ inline TStringBuf RecodeMultiByteChar(const TBasicStringBuf<TCharType> src, char* dst, ECharset encoding) {
+ Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
+ size_t read = 0;
+ size_t written = 0;
+ ::NICONVPrivate::RecodeFromUnicode(encoding, src.data(), dst, src.size(), src.size() * 3, read, written);
+ return TStringBuf(dst, written);
+ }
+
+ template <typename TCharType>
+ inline TBasicStringBuf<TCharType> RecodeUtf8(const TStringBuf src, TCharType* dst) {
+ size_t len = 0;
+ if (!::UTF8ToWide(src.data(), src.size(), dst, len))
+ ythrow yexception() << "Invalid UTF8: \"" << src.SubStr(0, 50) << (src.size() > 50 ? "...\"" : "\"");
+ return TBasicStringBuf<TCharType>(dst, len);
+ }
+
+ template <typename TCharType>
+ inline TStringBuf RecodeUtf8(const TBasicStringBuf<TCharType> src, char* dst) {
+ size_t len = 0;
+ ::WideToUTF8(src.data(), src.size(), dst, len);
+ return TStringBuf(dst, len);
+ }
+
+ // Select one of re-coding methods from above, based on provided @encoding
+
+ template <typename TCharFrom, typename TCharTo>
+ TBasicStringBuf<TCharTo> Recode(const TBasicStringBuf<TCharFrom> src, TCharTo* dst, ECharset encoding) {
+ if (encoding == CODES_UTF8)
+ return RecodeUtf8(src, dst);
+ else if (SingleByteCodepage(encoding))
+ return RecodeSingleByteChar(src, dst, *CodePageByCharset(encoding));
+ else
+ return RecodeMultiByteChar(src, dst, encoding);
+ }
+
+ }
+
+ template <typename TCharFrom>
+ struct TRecodeTraits;
+
+ template <>
+ struct TRecodeTraits<char> {
+ using TCharTo = wchar16;
+ using TStringBufTo = TWtringBuf;
+ using TStringTo = TUtf16String;
+ enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case
+ // Here an unicode character can be converted up to 4 bytes of UTF8
+ };
+
+ template <>
+ struct TRecodeTraits<wchar16> {
+ using TCharTo = char;
+ using TStringBufTo = TStringBuf;
+ using TStringTo = TString;
+ enum { ReserveSize = 2 }; // possible surrogate pairs ?
+ };
+
+ // Operations with destination buffer where recoded string will be written
+ template <typename TResult>
+ struct TRecodeResultOps {
+ // default implementation will work with TString and TUtf16String - 99% of usage
+ using TResultChar = typename TResult::char_type;
+
+ static inline size_t Size(const TResult& dst) {
+ return dst.size();
+ }
+
+ static inline TResultChar* Reserve(TResult& dst, size_t len) {
+ dst.ReserveAndResize(len);
+ return dst.begin();
+ }
+
+ static inline void Truncate(TResult& dst, size_t len) {
+ dst.resize(len);
+ }
+ };
+
+ // Main template interface for recoding in both directions
+
+ template <typename TCharFrom, typename TResult>
+ typename TRecodeTraits<TCharFrom>::TStringBufTo Recode(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
+ using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
+ // make enough room for re-coded string
+ TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<TCharTo>::ReserveSize);
+ // do re-coding
+ TBasicStringBuf<TCharTo> res = NBaseOps::Recode(src, dstbuf, encoding);
+ // truncate result back to proper size
+ TRecodeResultOps<TResult>::Truncate(dst, res.size());
+ return res;
+ }
+
+ // appending version of Recode()
+ template <typename TCharFrom, typename TResult>
+ typename TRecodeTraits<TCharFrom>::TStringBufTo RecodeAppend(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
+ using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
+ size_t dstOrigSize = TRecodeResultOps<TResult>::Size(dst);
+ TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, dstOrigSize + src.size() * TRecodeTraits<TCharTo>::ReserveSize);
+ TBasicStringBuf<TCharTo> appended = NBaseOps::Recode(src, dstbuf + dstOrigSize, encoding);
+ size_t dstFinalSize = dstOrigSize + appended.size();
+ TRecodeResultOps<TResult>::Truncate(dst, dstFinalSize);
+ return TBasicStringBuf<TCharTo>(dstbuf, dstFinalSize);
+ }
+
+ // special implementation for robust utf8 functions
+ template <typename TResult>
+ TWtringBuf RecodeUTF8Robust(const TStringBuf src, TResult& dst) {
+ // make enough room for re-coded string
+ wchar16* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<wchar16>::ReserveSize);
+
+ // do re-coding
+ size_t written = 0;
+ UTF8ToWide<true>(src.data(), src.size(), dstbuf, written);
+
+ // truncate result back to proper size
+ TRecodeResultOps<TResult>::Truncate(dst, written);
+ return TWtringBuf(dstbuf, written);
+ }
+
+ template <typename TCharFrom>
+ inline typename TRecodeTraits<TCharFrom>::TStringTo Recode(const TBasicStringBuf<TCharFrom> src, ECharset encoding) {
+ typename TRecodeTraits<TCharFrom>::TStringTo res;
+ Recode<TCharFrom>(src, res, encoding);
+ return res;
+ }
+}
+
+// Write result into @dst. Return string-buffer pointing to re-coded content of @dst.
+
+template <bool robust>
+inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
+ if (robust && CODES_UTF8 == encoding)
+ return ::NDetail::RecodeUTF8Robust(src, dst);
+ return ::NDetail::Recode<char>(src, dst, encoding);
+}
+
+inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
+ return ::NDetail::Recode<char>(src, dst, encoding);
+}
+
+inline TStringBuf WideToChar(const TWtringBuf src, TString& dst, ECharset encoding) {
+ return ::NDetail::Recode<wchar16>(src, dst, encoding);
+}
+
+//! calls either to @c WideToUTF8 or @c WideToChar depending on the encoding type
+inline TString WideToChar(const wchar16* text, size_t len, ECharset enc) {
+ if (NCodepagePrivate::NativeCodepage(enc)) {
+ if (enc == CODES_UTF8)
+ return WideToUTF8(text, len);
+
+ TString s = TString::Uninitialized(len);
+ s.remove(WideToChar(text, len, s.begin(), enc));
+
+ return s;
+ }
+
+ TString s = TString::Uninitialized(len * 3);
+
+ size_t read = 0;
+ size_t written = 0;
+ NICONVPrivate::RecodeFromUnicode(enc, text, s.begin(), len, s.size(), read, written);
+ s.remove(written);
+
+ return s;
+}
+
+inline TUtf16String CharToWide(const char* text, size_t len, const CodePage& cp) {
+ TUtf16String w = TUtf16String::Uninitialized(len);
+ CharToWide(text, len, w.begin(), cp);
+ return w;
+}
+
+//! calls either to @c UTF8ToWide or @c CharToWide depending on the encoding type
+template <bool robust>
+inline TUtf16String CharToWide(const char* text, size_t len, ECharset enc) {
+ if (NCodepagePrivate::NativeCodepage(enc)) {
+ if (enc == CODES_UTF8)
+ return UTF8ToWide<robust>(text, len);
+
+ return CharToWide(text, len, *CodePageByCharset(enc));
+ }
+
+ TUtf16String w = TUtf16String::Uninitialized(len * 2);
+
+ size_t read = 0;
+ size_t written = 0;
+ NICONVPrivate::RecodeToUnicode(enc, text, w.begin(), len, len, read, written);
+ w.remove(written);
+
+ return w;
+}
+
+//! converts text from UTF8 to unicode, if conversion fails it uses codepage to convert the text
+//! @param text text to be converted
+//! @param len length of the text in characters
+//! @param cp a codepage that is used in case of failed conversion from UTF8
+inline TUtf16String UTF8ToWide(const char* text, size_t len, const CodePage& cp) {
+ TUtf16String w = TUtf16String::Uninitialized(len);
+ size_t written = 0;
+ if (UTF8ToWide(text, len, w.begin(), written))
+ w.remove(written);
+ else
+ CharToWide(text, len, w.begin(), cp);
+ return w;
+}
+
+inline TString WideToChar(const TWtringBuf w, ECharset enc) {
+ return WideToChar(w.data(), w.size(), enc);
+}
+
+inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
+ return CharToWide<false>(s.data(), s.size(), enc);
+}
+
+template <bool robust>
+inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
+ return CharToWide<robust>(s.data(), s.size(), enc);
+}
+
+inline TUtf16String CharToWide(const TStringBuf s, const CodePage& cp) {
+ return CharToWide(s.data(), s.size(), cp);
+}
+
+// true if @text can be fully encoded to specified @encoding,
+// with possibility to recover exact original text after decoding
+bool CanBeEncoded(TWtringBuf text, ECharset encoding);