#pragma once #include "codepage.h" #include "iconv.h" #include <util/charset/recode_result.h> #include <util/charset/unidata.h> #include <util/charset/utf8.h> #include <util/charset/wide.h> #include <util/generic/string.h> #include <util/generic/algorithm.h> #include <util/generic/yexception.h> #include <util/memory/tempbuf.h> #include <util/system/yassert.h> //! converts text from unicode to yandex codepage //! @attention destination buffer must be long enough to fit all characters of the text //! @note @c dest buffer must fit at least @c len number of characters template <typename TCharType> inline size_t WideToChar(const TCharType* text, size_t len, char* dest, ECharset enc) { Y_ASSERT(SingleByteCodepage(enc)); const char* start = dest; const Encoder* const encoder = &EncoderByCharset(enc); const TCharType* const last = text + len; for (const TCharType* cur = text; cur != last; ++dest) { *dest = encoder->Tr(ReadSymbolAndAdvance(cur, last)); } return dest - start; } //! converts text to unicode using a codepage object //! @attention destination buffer must be long enough to fit all characters of the text //! @note @c dest buffer must fit at least @c len number of characters; //! if you need convert zero terminated string you should determine length of the //! string using the @c strlen function and pass as the @c len parameter; //! it does not make sense to create an additional version of this function because //! it will call to @c strlen anyway in order to allocate destination buffer template <typename TCharType> inline void CharToWide(const char* text, size_t len, TCharType* dest, const CodePage& cp) { const unsigned char* cur = reinterpret_cast<const unsigned char*>(text); const unsigned char* const last = cur + len; for (; cur != last; ++cur, ++dest) { *dest = static_cast<TCharType>(cp.unicode[*cur]); // static_cast is safe as no 1char codepage contains non-BMP symbols } } namespace NDetail { namespace NBaseOps { // Template interface base recoding drivers, do not perform any memory management, // do not care about buffer size, so supplied @dst // should have enough room for the result (with proper reserve for the worst case) // Depending on template params, perform conversion of single-byte/multi-byte/utf8 string to/from wide string. template <typename TCharType> inline TBasicStringBuf<TCharType> RecodeSingleByteChar(const TStringBuf src, TCharType* dst, const CodePage& cp) { Y_ASSERT(cp.SingleByteCodepage()); ::CharToWide(src.data(), src.size(), dst, cp); return TBasicStringBuf<TCharType>(dst, src.size()); } template <typename TCharType> inline TStringBuf RecodeSingleByteChar(const TBasicStringBuf<TCharType> src, char* dst, const CodePage& cp) { Y_ASSERT(cp.SingleByteCodepage()); ::WideToChar(src.data(), src.size(), dst, cp.CPEnum); return TStringBuf(dst, src.size()); } template <typename TCharType> inline TBasicStringBuf<TCharType> RecodeMultiByteChar(const TStringBuf src, TCharType* dst, ECharset encoding) { Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding)); size_t read = 0; size_t written = 0; ::NICONVPrivate::RecodeToUnicode(encoding, src.data(), dst, src.size(), src.size(), read, written); return TBasicStringBuf<TCharType>(dst, written); } template <typename TCharType> inline TStringBuf RecodeMultiByteChar(const TBasicStringBuf<TCharType> src, char* dst, ECharset encoding) { Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding)); size_t read = 0; size_t written = 0; ::NICONVPrivate::RecodeFromUnicode(encoding, src.data(), dst, src.size(), src.size() * 3, read, written); return TStringBuf(dst, written); } template <typename TCharType> inline TBasicStringBuf<TCharType> RecodeUtf8(const TStringBuf src, TCharType* dst) { size_t len = 0; if (!::UTF8ToWide(src.data(), src.size(), dst, len)) ythrow yexception() << "Invalid UTF8: \"" << src.SubStr(0, 50) << (src.size() > 50 ? "...\"" : "\""); return TBasicStringBuf<TCharType>(dst, len); } template <typename TCharType> inline TStringBuf RecodeUtf8(const TBasicStringBuf<TCharType> src, char* dst) { size_t len = 0; ::WideToUTF8(src.data(), src.size(), dst, len); return TStringBuf(dst, len); } // Select one of re-coding methods from above, based on provided @encoding template <typename TCharFrom, typename TCharTo> TBasicStringBuf<TCharTo> Recode(const TBasicStringBuf<TCharFrom> src, TCharTo* dst, ECharset encoding) { if (encoding == CODES_UTF8) return RecodeUtf8(src, dst); else if (SingleByteCodepage(encoding)) return RecodeSingleByteChar(src, dst, *CodePageByCharset(encoding)); else return RecodeMultiByteChar(src, dst, encoding); } } template <typename TCharFrom> struct TRecodeTraits; template <> struct TRecodeTraits<char> { using TCharTo = wchar16; using TStringBufTo = TWtringBuf; using TStringTo = TUtf16String; enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case // Here an unicode character can be converted up to 4 bytes of UTF8 }; template <> struct TRecodeTraits<wchar16> { using TCharTo = char; using TStringBufTo = TStringBuf; using TStringTo = TString; enum { ReserveSize = 2 }; // possible surrogate pairs ? }; // Operations with destination buffer where recoded string will be written template <typename TResult> struct TRecodeResultOps { // default implementation will work with TString and TUtf16String - 99% of usage using TResultChar = typename TResult::char_type; static inline size_t Size(const TResult& dst) { return dst.size(); } static inline TResultChar* Reserve(TResult& dst, size_t len) { dst.ReserveAndResize(len); return dst.begin(); } static inline void Truncate(TResult& dst, size_t len) { dst.resize(len); } }; // Main template interface for recoding in both directions template <typename TCharFrom, typename TResult> typename TRecodeTraits<TCharFrom>::TStringBufTo Recode(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) { using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo; // make enough room for re-coded string TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<TCharTo>::ReserveSize); // do re-coding TBasicStringBuf<TCharTo> res = NBaseOps::Recode(src, dstbuf, encoding); // truncate result back to proper size TRecodeResultOps<TResult>::Truncate(dst, res.size()); return res; } // appending version of Recode() template <typename TCharFrom, typename TResult> typename TRecodeTraits<TCharFrom>::TStringBufTo RecodeAppend(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) { using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo; size_t dstOrigSize = TRecodeResultOps<TResult>::Size(dst); TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, dstOrigSize + src.size() * TRecodeTraits<TCharTo>::ReserveSize); TBasicStringBuf<TCharTo> appended = NBaseOps::Recode(src, dstbuf + dstOrigSize, encoding); size_t dstFinalSize = dstOrigSize + appended.size(); TRecodeResultOps<TResult>::Truncate(dst, dstFinalSize); return TBasicStringBuf<TCharTo>(dstbuf, dstFinalSize); } // special implementation for robust utf8 functions template <typename TResult> TWtringBuf RecodeUTF8Robust(const TStringBuf src, TResult& dst) { // make enough room for re-coded string wchar16* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<wchar16>::ReserveSize); // do re-coding size_t written = 0; UTF8ToWide<true>(src.data(), src.size(), dstbuf, written); // truncate result back to proper size TRecodeResultOps<TResult>::Truncate(dst, written); return TWtringBuf(dstbuf, written); } template <typename TCharFrom> inline typename TRecodeTraits<TCharFrom>::TStringTo Recode(const TBasicStringBuf<TCharFrom> src, ECharset encoding) { typename TRecodeTraits<TCharFrom>::TStringTo res; Recode<TCharFrom>(src, res, encoding); return res; } } // Write result into @dst. Return string-buffer pointing to re-coded content of @dst. template <bool robust> inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) { if (robust && CODES_UTF8 == encoding) return ::NDetail::RecodeUTF8Robust(src, dst); return ::NDetail::Recode<char>(src, dst, encoding); } inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) { return ::NDetail::Recode<char>(src, dst, encoding); } inline TStringBuf WideToChar(const TWtringBuf src, TString& dst, ECharset encoding) { return ::NDetail::Recode<wchar16>(src, dst, encoding); } //! calls either to @c WideToUTF8 or @c WideToChar depending on the encoding type inline TString WideToChar(const wchar16* text, size_t len, ECharset enc) { if (NCodepagePrivate::NativeCodepage(enc)) { if (enc == CODES_UTF8) return WideToUTF8(text, len); TString s = TString::Uninitialized(len); s.remove(WideToChar(text, len, s.begin(), enc)); return s; } TString s = TString::Uninitialized(len * 3); size_t read = 0; size_t written = 0; NICONVPrivate::RecodeFromUnicode(enc, text, s.begin(), len, s.size(), read, written); s.remove(written); return s; } inline TUtf16String CharToWide(const char* text, size_t len, const CodePage& cp) { TUtf16String w = TUtf16String::Uninitialized(len); CharToWide(text, len, w.begin(), cp); return w; } //! calls either to @c UTF8ToWide or @c CharToWide depending on the encoding type template <bool robust> inline TUtf16String CharToWide(const char* text, size_t len, ECharset enc) { if (NCodepagePrivate::NativeCodepage(enc)) { if (enc == CODES_UTF8) return UTF8ToWide<robust>(text, len); return CharToWide(text, len, *CodePageByCharset(enc)); } TUtf16String w = TUtf16String::Uninitialized(len * 2); size_t read = 0; size_t written = 0; NICONVPrivate::RecodeToUnicode(enc, text, w.begin(), len, len, read, written); w.remove(written); return w; } //! converts text from UTF8 to unicode, if conversion fails it uses codepage to convert the text //! @param text text to be converted //! @param len length of the text in characters //! @param cp a codepage that is used in case of failed conversion from UTF8 inline TUtf16String UTF8ToWide(const char* text, size_t len, const CodePage& cp) { TUtf16String w = TUtf16String::Uninitialized(len); size_t written = 0; if (UTF8ToWide(text, len, w.begin(), written)) w.remove(written); else CharToWide(text, len, w.begin(), cp); return w; } inline TString WideToChar(const TWtringBuf w, ECharset enc) { return WideToChar(w.data(), w.size(), enc); } inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) { return CharToWide<false>(s.data(), s.size(), enc); } template <bool robust> inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) { return CharToWide<robust>(s.data(), s.size(), enc); } inline TUtf16String CharToWide(const TStringBuf s, const CodePage& cp) { return CharToWide(s.data(), s.size(), cp); } // true if @text can be fully encoded to specified @encoding, // with possibility to recover exact original text after decoding bool CanBeEncoded(TWtringBuf text, ECharset encoding);