#pragma once #include "doccodes.h" #include <util/charset/recode_result.h> #include <util/charset/unidata.h> // all wchar32 functions #include <util/charset/utf8.h> #include <util/generic/string.h> #include <util/generic/ylimits.h> #include <util/generic/yexception.h> #include <util/system/yassert.h> #include <util/system/defaults.h> #include <cctype> struct CodePage; struct Recoder; struct Encoder; /*****************************************************************\ * struct CodePage * \*****************************************************************/ struct CodePage { ECharset CPEnum; // int MIBEnum; const char* Names[30]; // name[0] -- preferred mime-name wchar32 unicode[256]; const char* DefaultChar; //[CCL_NUM] bool IsLower(unsigned char ch) const { return ::IsLower(unicode[ch]); } bool IsUpper(unsigned char ch) const { return ::IsUpper(unicode[ch]); } bool IsAlpha(unsigned char ch) const { return ::IsAlpha(unicode[ch]); } bool IsDigit(unsigned char ch) const { return ::IsDigit(unicode[ch]); } bool IsXdigit(unsigned char ch) const { return ::IsXdigit(unicode[ch]); } bool IsAlnum(unsigned char ch) const { return ::IsAlnum(unicode[ch]); } bool IsSpace(unsigned char ch) const { return ::IsSpace(unicode[ch]); } bool IsPunct(unsigned char ch) const { return ::IsPunct(unicode[ch]); } bool IsCntrl(unsigned char ch) const { return ::IsCntrl(unicode[ch]); } bool IsGraph(unsigned char ch) const { return ::IsGraph(unicode[ch]); } bool IsPrint(unsigned char ch) const { return ::IsPrint(unicode[ch]); } bool IsComposed(unsigned char ch) const { return ::IsComposed(unicode[ch]); } // return pointer to char after the last char char* ToLower(const char* begin, const char* end, char* to) const; char* ToLower(const char* begin, char* to) const; // return pointer to char after the last char char* ToUpper(const char* begin, const char* end, char* to) const; char* ToUpper(const char* begin, char* to) const; int stricmp(const char* s1, const char* s2) const; int strnicmp(const char* 
s1, const char* s2, size_t len) const; inline unsigned char ToUpper(unsigned char ch) const; inline unsigned char ToLower(unsigned char ch) const; inline unsigned char ToTitle(unsigned char ch) const; inline int ToDigit(unsigned char ch) const { return ::ToDigit(unicode[ch]); } static void Initialize(); inline bool SingleByteCodepage() const { return DefaultChar != nullptr; } inline bool NativeCodepage() const { return SingleByteCodepage() || CPEnum == CODES_UTF8; } }; class TCodePageHash; namespace NCodepagePrivate { class TCodepagesMap { private: static const int DataShift = 2; static const int DataSize = CODES_MAX + DataShift; const CodePage* Data[DataSize]; private: inline const CodePage* GetPrivate(ECharset e) const { Y_ASSERT(e + DataShift >= 0 && e + DataShift < DataSize); return Data[e + DataShift]; } void SetData(const CodePage* cp); public: TCodepagesMap(); inline const CodePage* Get(ECharset e) const { const CodePage* res = GetPrivate(e); if (!res->SingleByteCodepage()) { ythrow yexception() << "CodePage (" << (int)e << ") structure can only be used for single byte encodings"; } return res; } inline bool SingleByteCodepage(ECharset e) const { return GetPrivate(e)->SingleByteCodepage(); } inline bool NativeCodepage(ECharset e) const { return GetPrivate(e)->NativeCodepage(); } inline const char* NameByCharset(ECharset e) const { return GetPrivate(e)->Names[0]; } static const TCodepagesMap& Instance(); friend class ::TCodePageHash; }; inline bool NativeCodepage(ECharset e) { return ::NCodepagePrivate::TCodepagesMap::Instance().NativeCodepage(e); } } inline bool SingleByteCodepage(ECharset e) { return ::NCodepagePrivate::TCodepagesMap::Instance().SingleByteCodepage(e); } inline bool ValidCodepage(ECharset e) { return e >= 0 && e < CODES_MAX; } inline const CodePage* CodePageByCharset(ECharset e) { return ::NCodepagePrivate::TCodepagesMap::Instance().Get(e); } ECharset CharsetByName(TStringBuf name); // Same as CharsetByName, but throws yexception() if name 
is invalid ECharset CharsetByNameOrDie(TStringBuf name); inline ECharset CharsetByCodePage(const CodePage* CP) { return CP->CPEnum; } inline const char* NameByCharset(ECharset e) { return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e); } inline const char* NameByCharsetSafe(ECharset e) { if (CODES_UNKNOWN < e && e < CODES_MAX) return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e); else ythrow yexception() << "unknown encoding: " << (int)e; } inline const char* NameByCodePage(const CodePage* CP) { return CP->Names[0]; } inline const CodePage* CodePageByName(const char* name) { ECharset code = CharsetByName(name); if (code == CODES_UNKNOWN) return nullptr; return CodePageByCharset(code); } ECharset EncodingHintByName(const char* name); /*****************************************************************\ * struct Encoder * \*****************************************************************/ struct Encoder { char* Table[256]; const char* DefaultChar; inline char Code(wchar32 ch) const { if (ch > 0xFFFF) return 0; return (unsigned char)Table[(ch >> 8) & 255][ch & 255]; } inline char Tr(wchar32 ch) const { char code = Code(ch); if (code == 0 && ch != 0) code = DefaultChar[NUnicode::CharType(ch)]; Y_ASSERT(code != 0 || ch == 0); return code; } inline unsigned char operator[](wchar32 ch) const { return Tr(ch); } void Tr(const wchar32* in, char* out, size_t len) const; void Tr(const wchar32* in, char* out) const; char* DefaultPlane; }; /*****************************************************************\ * struct Recoder * \*****************************************************************/ struct Recoder { unsigned char Table[257]; void Create(const CodePage& source, const CodePage& target); void Create(const CodePage& source, const Encoder* wideTarget); void Create(const CodePage& page, wchar32 (*mapper)(wchar32)); void Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapper)(wchar32)); inline unsigned char Tr(unsigned char c) 
const { return Table[c]; } inline unsigned char operator[](unsigned char c) const { return Table[c]; } void Tr(const char* in, char* out, size_t len) const; void Tr(const char* in, char* out) const; void Tr(char* in_out, size_t len) const; void Tr(char* in_out) const; }; extern const struct Encoder& WideCharToYandex; const Encoder& EncoderByCharset(ECharset enc); namespace NCodepagePrivate { class TCodePageData { private: static const CodePage* const AllCodePages[]; static const Recoder rcdr_to_yandex[]; static const Recoder rcdr_from_yandex[]; static const Recoder rcdr_to_lower[]; static const Recoder rcdr_to_upper[]; static const Recoder rcdr_to_title[]; static const Encoder* const EncodeTo[]; friend struct ::CodePage; friend class TCodepagesMap; friend RECODE_RESULT _recodeToYandex(ECharset, const char*, char*, size_t, size_t, size_t&, size_t&); friend RECODE_RESULT _recodeFromYandex(ECharset, const char*, char*, size_t, size_t, size_t&, size_t&); friend const Encoder& ::EncoderByCharset(ECharset enc); }; } inline const Encoder& EncoderByCharset(ECharset enc) { if (!SingleByteCodepage(enc)) { ythrow yexception() << "Encoder structure can only be used for single byte encodings"; } return *NCodepagePrivate::TCodePageData::EncodeTo[enc]; } inline unsigned char CodePage::ToUpper(unsigned char ch) const { return NCodepagePrivate::TCodePageData::rcdr_to_upper[CPEnum].Table[ch]; } inline unsigned char CodePage::ToLower(unsigned char ch) const { return NCodepagePrivate::TCodePageData::rcdr_to_lower[CPEnum].Table[ch]; } inline unsigned char CodePage::ToTitle(unsigned char ch) const { return NCodepagePrivate::TCodePageData::rcdr_to_title[CPEnum].Table[ch]; } extern const CodePage& csYandex; /// these functions change (lowers) [end] position in case of utf-8 /// null character is NOT assumed or written at [*end] void DecodeUnknownPlane(wchar16* start, wchar16*& end, const ECharset enc4unk); void DecodeUnknownPlane(wchar32* start, wchar32*& end, const ECharset enc4unk); 

/// Lower-cases the `n` chars starting at `s` in place, one byte at a time,
/// using the single-byte conversion of `cp`.
inline void ToLower(char* s, size_t n, const CodePage& cp = csYandex) {
    for (size_t i = 0; i < n; ++i) {
        s[i] = cp.ToLower(s[i]);
    }
}

/// Upper-cases the `n` chars starting at `s` in place, one byte at a time,
/// using the single-byte conversion of `cp`.
inline void ToUpper(char* s, size_t n, const CodePage& cp = csYandex) {
    for (size_t i = 0; i < n; ++i) {
        s[i] = cp.ToUpper(s[i]);
    }
}

/// Returns `s` after passing `pos` and `n` to TString::Transform with a
/// callback that maps each char through cp.ToLower.
inline TString ToLower(TString s, const CodePage& cp, size_t pos = 0, size_t n = TString::npos) {
    auto lowerChar = [&cp](size_t /*index*/, char ch) {
        return cp.ToLower(ch);
    };
    s.Transform(lowerChar, pos, n);
    return s;
}

/// Returns `s` after passing `pos` and `n` to TString::Transform with a
/// callback that maps each char through cp.ToUpper.
inline TString ToUpper(TString s, const CodePage& cp, size_t pos = 0, size_t n = TString::npos) {
    auto upperChar = [&cp](size_t /*index*/, char ch) {
        return cp.ToUpper(ch);
    };
    s.Transform(upperChar, pos, n);
    return s;
}

/// Returns `s` after passing `pos` and `n` to TString::Transform with a
/// callback that applies cp.ToTitle to the char whose index equals `pos`
/// and cp.ToLower to every other char it is given.
inline TString ToTitle(TString s, const CodePage& cp, size_t pos = 0, size_t n = TString::npos) {
    auto titleChar = [pos, &cp](size_t index, char ch) {
        if (index == pos) {
            return cp.ToTitle(ch);
        }
        return cp.ToLower(ch);
    };
    s.Transform(titleChar, pos, n);
    return s;
}