aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/charset/codepage.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/charset/codepage.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/charset/codepage.h')
-rw-r--r--library/cpp/charset/codepage.h324
1 files changed, 324 insertions, 0 deletions
diff --git a/library/cpp/charset/codepage.h b/library/cpp/charset/codepage.h
new file mode 100644
index 00000000000..30a02a46106
--- /dev/null
+++ b/library/cpp/charset/codepage.h
@@ -0,0 +1,324 @@
+#pragma once
+
+#include "doccodes.h"
+
+#include <util/charset/recode_result.h>
+#include <util/charset/unidata.h> // all wchar32 functions
+#include <util/charset/utf8.h>
+#include <util/generic/string.h>
+#include <util/generic/ylimits.h>
+#include <util/generic/yexception.h>
+#include <util/system/yassert.h>
+#include <util/system/defaults.h>
+
+#include <cctype>
+
+struct CodePage; // defined below in this header
+struct Recoder; // defined below in this header
+struct Encoder; // defined below in this header
+
+/*****************************************************************\
+* struct CodePage *
+\*****************************************************************/
+struct CodePage {
+    // ---- data members (declaration order is part of the struct layout) ----
+    ECharset CPEnum;         // int MIBEnum;
+    const char* Names[30];   // Names[0] -- preferred mime-name
+    wchar32 unicode[256];    // byte value -> wchar32; every predicate below indexes this table
+    const char* DefaultChar; // [CCL_NUM]; nullptr means "not a single-byte code page" (see SingleByteCodepage)
+
+    // ---- character classification ----
+    // Each predicate maps the byte through `unicode` and forwards to the
+    // global wchar32 function of the same name.
+    bool IsLower(unsigned char ch) const { return ::IsLower(unicode[ch]); }
+    bool IsUpper(unsigned char ch) const { return ::IsUpper(unicode[ch]); }
+    bool IsAlpha(unsigned char ch) const { return ::IsAlpha(unicode[ch]); }
+    bool IsDigit(unsigned char ch) const { return ::IsDigit(unicode[ch]); }
+    bool IsXdigit(unsigned char ch) const { return ::IsXdigit(unicode[ch]); }
+    bool IsAlnum(unsigned char ch) const { return ::IsAlnum(unicode[ch]); }
+    bool IsSpace(unsigned char ch) const { return ::IsSpace(unicode[ch]); }
+    bool IsPunct(unsigned char ch) const { return ::IsPunct(unicode[ch]); }
+    bool IsCntrl(unsigned char ch) const { return ::IsCntrl(unicode[ch]); }
+    bool IsGraph(unsigned char ch) const { return ::IsGraph(unicode[ch]); }
+    bool IsPrint(unsigned char ch) const { return ::IsPrint(unicode[ch]); }
+    bool IsComposed(unsigned char ch) const { return ::IsComposed(unicode[ch]); }
+
+    // ---- buffer case conversion ----
+    // Both overloads return a pointer to the char after the last char.
+    char* ToLower(const char* begin, const char* end, char* to) const;
+    char* ToLower(const char* begin, char* to) const;
+
+    // Both overloads return a pointer to the char after the last char.
+    char* ToUpper(const char* begin, const char* end, char* to) const;
+    char* ToUpper(const char* begin, char* to) const;
+
+    // ---- case-insensitive comparison (defined outside this header) ----
+    int stricmp(const char* s1, const char* s2) const;
+    int strnicmp(const char* s1, const char* s2, size_t len) const;
+
+    // ---- single-byte case conversion; inline definitions follow TCodePageData ----
+    inline unsigned char ToUpper(unsigned char ch) const;
+    inline unsigned char ToLower(unsigned char ch) const;
+    inline unsigned char ToTitle(unsigned char ch) const;
+
+    // Forwards the mapped wchar32 of `ch` to the global ::ToDigit.
+    int ToDigit(unsigned char ch) const {
+        const wchar32 wide = unicode[ch];
+        return ::ToDigit(wide);
+    }
+
+    static void Initialize();
+
+    // A code page counts as single-byte exactly when DefaultChar is set.
+    bool SingleByteCodepage() const {
+        return nullptr != DefaultChar;
+    }
+
+    // Single-byte code pages and CODES_UTF8 are "native".
+    bool NativeCodepage() const {
+        if (SingleByteCodepage()) {
+            return true;
+        }
+        return CPEnum == CODES_UTF8;
+    }
+};
+
+class TCodePageHash; // forward declaration; named as a friend of NCodepagePrivate::TCodepagesMap below
+
+namespace NCodepagePrivate {
+    // Table of CodePage descriptors indexed by ECharset.
+    // Slots are offset by DataShift, so charset values down to -DataShift
+    // map to valid indices.
+    class TCodepagesMap {
+    private:
+        static const int DataShift = 2;
+        static const int DataSize = CODES_MAX + DataShift;
+        const CodePage* Data[DataSize];
+
+        void SetData(const CodePage* cp);
+
+        // Raw slot lookup; the index range is checked with Y_ASSERT only.
+        const CodePage* GetPrivate(ECharset e) const {
+            const auto slot = e + DataShift;
+            Y_ASSERT(slot >= 0 && slot < DataSize);
+            return Data[slot];
+        }
+
+    public:
+        TCodepagesMap();
+
+        static const TCodepagesMap& Instance();
+
+        // Returns the descriptor for `e`, throwing yexception when it is
+        // not a single-byte code page.
+        const CodePage* Get(ECharset e) const {
+            const CodePage* const page = GetPrivate(e);
+            if (page->SingleByteCodepage()) {
+                return page;
+            }
+            ythrow yexception() << "CodePage (" << static_cast<int>(e) << ") structure can only be used for single byte encodings";
+        }
+
+        // The three accessors below forward to the descriptor without the
+        // single-byte restriction that Get() enforces.
+        bool SingleByteCodepage(ECharset e) const {
+            const CodePage* const page = GetPrivate(e);
+            return page->SingleByteCodepage();
+        }
+
+        bool NativeCodepage(ECharset e) const {
+            const CodePage* const page = GetPrivate(e);
+            return page->NativeCodepage();
+        }
+
+        const char* NameByCharset(ECharset e) const {
+            const CodePage* const page = GetPrivate(e);
+            return page->Names[0];
+        }
+
+        friend class ::TCodePageHash;
+    };
+
+    // Convenience wrapper over TCodepagesMap::Instance().NativeCodepage(e).
+    inline bool NativeCodepage(ECharset e) {
+        const TCodepagesMap& codepages = ::NCodepagePrivate::TCodepagesMap::Instance();
+        return codepages.NativeCodepage(e);
+    }
+}
+
+// Asks the shared TCodepagesMap whether `e` is a single-byte code page.
+inline bool SingleByteCodepage(ECharset e) {
+    const auto& codepages = ::NCodepagePrivate::TCodepagesMap::Instance();
+    return codepages.SingleByteCodepage(e);
+}
+
+// True when `e` lies in the half-open range [0, CODES_MAX).
+inline bool ValidCodepage(ECharset e) {
+    if (e < 0) {
+        return false;
+    }
+    return e < CODES_MAX;
+}
+
+// Returns the descriptor for `e` via TCodepagesMap::Get, which throws
+// yexception for encodings that are not single-byte.
+inline const CodePage* CodePageByCharset(ECharset e) {
+    const auto& codepages = ::NCodepagePrivate::TCodepagesMap::Instance();
+    return codepages.Get(e);
+}
+
+ECharset CharsetByName(TStringBuf name); // presumably yields CODES_UNKNOWN for an unrecognized name (CodePageByName checks for that value) -- confirm in the definition
+
+// Same as CharsetByName, but throws yexception() if name is invalid
+ECharset CharsetByNameOrDie(TStringBuf name);
+
+// Reads the charset id stored in the descriptor. CP is dereferenced
+// without a null check.
+inline ECharset CharsetByCodePage(const CodePage* CP) {
+    const CodePage& page = *CP;
+    return page.CPEnum;
+}
+
+// Returns Names[0] of the descriptor registered for `e`; no range check
+// beyond the assertion inside TCodepagesMap.
+inline const char* NameByCharset(ECharset e) {
+    const auto& codepages = ::NCodepagePrivate::TCodepagesMap::Instance();
+    return codepages.NameByCharset(e);
+}
+
+// Range-checked variant of NameByCharset: throws yexception unless
+// CODES_UNKNOWN < e < CODES_MAX.
+inline const char* NameByCharsetSafe(ECharset e) {
+    const bool known = CODES_UNKNOWN < e && e < CODES_MAX;
+    if (!known) {
+        ythrow yexception() << "unknown encoding: " << static_cast<int>(e);
+    }
+    return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e);
+}
+
+// Returns the first entry of CP->Names. CP is dereferenced without a
+// null check.
+inline const char* NameByCodePage(const CodePage* CP) {
+    const char* const preferred = CP->Names[0];
+    return preferred;
+}
+
+// Resolves `name` with CharsetByName; yields nullptr when the result is
+// CODES_UNKNOWN, otherwise defers to CodePageByCharset.
+inline const CodePage* CodePageByName(const char* name) {
+    const ECharset code = CharsetByName(name);
+    return code == CODES_UNKNOWN ? nullptr : CodePageByCharset(code);
+}
+
+ECharset EncodingHintByName(const char* name); // declaration only; NOTE(review): how this differs from CharsetByName is not visible in this header
+
+/*****************************************************************\
+* struct Encoder *
+\*****************************************************************/
+struct Encoder {
+    // Two-level lookup: Table[high byte of ch][low byte of ch].
+    char* Table[256];
+    // Fallback bytes indexed by NUnicode::CharType(ch), used by Tr() when
+    // the table yields 0.
+    const char* DefaultChar;
+
+    // Raw table lookup. Anything above 0xFFFF is reported as 0.
+    char Code(wchar32 ch) const {
+        if (ch <= 0xFFFF) {
+            const char* const plane = Table[(ch >> 8) & 255];
+            return static_cast<unsigned char>(plane[ch & 255]);
+        }
+        return 0;
+    }
+
+    // Like Code(), but a zero result for a non-zero `ch` is replaced by the
+    // DefaultChar entry for the character's type.
+    char Tr(wchar32 ch) const {
+        const char direct = Code(ch);
+        const bool unmapped = direct == 0 && ch != 0;
+        const char result = unmapped ? DefaultChar[NUnicode::CharType(ch)] : direct;
+        Y_ASSERT(result != 0 || ch == 0);
+        return result;
+    }
+
+    // Same as Tr(ch), returned as unsigned char.
+    unsigned char operator[](wchar32 ch) const {
+        const char encoded = Tr(ch);
+        return static_cast<unsigned char>(encoded);
+    }
+
+    // Bulk conversions, defined outside this header.
+    void Tr(const wchar32* in, char* out, size_t len) const;
+    void Tr(const wchar32* in, char* out) const;
+
+    char* DefaultPlane;
+};
+
+/*****************************************************************\
+* struct Recoder *
+\*****************************************************************/
+struct Recoder {
+    // Byte-to-byte translation table, indexed by the source byte.
+    unsigned char Table[257];
+
+    // Table builders, defined outside this header.
+    void Create(const CodePage& source, const CodePage& target);
+    void Create(const CodePage& source, const Encoder* wideTarget);
+
+    void Create(const CodePage& page, wchar32 (*mapper)(wchar32));
+    void Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapper)(wchar32));
+
+    // Translates one byte through Table.
+    unsigned char Tr(unsigned char c) const {
+        const unsigned char mapped = Table[c];
+        return mapped;
+    }
+
+    // Same lookup as Tr(unsigned char).
+    unsigned char operator[](unsigned char c) const {
+        const unsigned char mapped = Table[c];
+        return mapped;
+    }
+
+    // Buffer translations, defined outside this header.
+    void Tr(const char* in, char* out, size_t len) const;
+    void Tr(const char* in, char* out) const;
+    void Tr(char* in_out, size_t len) const;
+    void Tr(char* in_out) const;
+};
+
+extern const struct Encoder& WideCharToYandex; // defined outside this header
+
+const Encoder& EncoderByCharset(ECharset enc); // declared early so TCodePageData can name it as a friend; the inline definition follows that class
+
+namespace NCodepagePrivate {
+ class TCodePageData { // static tables only; every member is private and reached through the friends listed below
+ private:
+ static const CodePage* const AllCodePages[];
+
+ static const Recoder rcdr_to_yandex[]; // presumably read by the _recodeToYandex friend -- confirm
+ static const Recoder rcdr_from_yandex[]; // presumably read by the _recodeFromYandex friend -- confirm
+ static const Recoder rcdr_to_lower[]; // indexed by CodePage::CPEnum in CodePage::ToLower(unsigned char)
+ static const Recoder rcdr_to_upper[]; // indexed by CodePage::CPEnum in CodePage::ToUpper(unsigned char)
+ static const Recoder rcdr_to_title[]; // indexed by CodePage::CPEnum in CodePage::ToTitle(unsigned char)
+
+ static const Encoder* const EncodeTo[]; // indexed by ECharset in ::EncoderByCharset
+
+ friend struct ::CodePage;
+ friend class TCodepagesMap;
+ friend RECODE_RESULT _recodeToYandex(ECharset, const char*, char*, size_t, size_t, size_t&, size_t&);
+ friend RECODE_RESULT _recodeFromYandex(ECharset, const char*, char*, size_t, size_t, size_t&, size_t&);
+ friend const Encoder& ::EncoderByCharset(ECharset enc);
+ };
+}
+
+// Returns the Encoder registered for `enc`; throws yexception unless `enc`
+// is a single-byte code page.
+inline const Encoder& EncoderByCharset(ECharset enc) {
+    if (SingleByteCodepage(enc)) {
+        const Encoder* const encoder = NCodepagePrivate::TCodePageData::EncodeTo[enc];
+        return *encoder;
+    }
+    ythrow yexception() << "Encoder structure can only be used for single byte encodings";
+}
+
+// Upper-cases one byte using the recoder table selected by CPEnum.
+inline unsigned char CodePage::ToUpper(unsigned char ch) const {
+    const Recoder& upper = NCodepagePrivate::TCodePageData::rcdr_to_upper[CPEnum];
+    return upper.Table[ch];
+}
+// Lower-cases one byte using the recoder table selected by CPEnum.
+inline unsigned char CodePage::ToLower(unsigned char ch) const {
+    const Recoder& lower = NCodepagePrivate::TCodePageData::rcdr_to_lower[CPEnum];
+    return lower.Table[ch];
+}
+// Title-cases one byte using the recoder table selected by CPEnum.
+inline unsigned char CodePage::ToTitle(unsigned char ch) const {
+    const Recoder& title = NCodepagePrivate::TCodePageData::rcdr_to_title[CPEnum];
+    return title.Table[ch];
+}
+
+extern const CodePage& csYandex; // default code page of the ToLower/ToUpper(char*, size_t) helpers below
+
+/// These functions change (lower) the [end] position in case of utf-8.
+/// A null character is NOT assumed or written at [*end].
+void DecodeUnknownPlane(wchar16* start, wchar16*& end, const ECharset enc4unk);
+void DecodeUnknownPlane(wchar32* start, wchar32*& end, const ECharset enc4unk);
+
+// Lower-cases the first `n` bytes of `s` in place, one byte at a time,
+// through cp.ToLower(unsigned char).
+inline void ToLower(char* s, size_t n, const CodePage& cp = csYandex) {
+    for (size_t i = 0; i < n; ++i) {
+        s[i] = cp.ToLower(s[i]);
+    }
+}
+
+// Upper-cases the first `n` bytes of `s` in place, one byte at a time,
+// through cp.ToUpper(unsigned char).
+inline void ToUpper(char* s, size_t n, const CodePage& cp = csYandex) {
+    char* cursor = s;
+    char* const stop = s + n;
+    while (cursor != stop) {
+        *cursor = cp.ToUpper(*cursor);
+        ++cursor;
+    }
+}
+
+// Returns a copy of `s` with cp.ToLower applied through TString::Transform
+// over the range described by `pos` and `n`.
+inline TString ToLower(TString s, const CodePage& cp, size_t pos = 0, size_t n = TString::npos) {
+    auto lowerByte = [&cp](size_t /*index*/, char c) {
+        return cp.ToLower(c);
+    };
+    s.Transform(lowerByte, pos, n);
+    return s;
+}
+
+// Returns a copy of `s` with cp.ToUpper applied through TString::Transform
+// over the range described by `pos` and `n`.
+inline TString ToUpper(TString s, const CodePage& cp, size_t pos = 0, size_t n = TString::npos) {
+    auto upperByte = [&cp](size_t /*index*/, char c) {
+        return cp.ToUpper(c);
+    };
+    s.Transform(upperByte, pos, n);
+    return s;
+}
+
+// Returns a copy of `s` transformed through TString::Transform over the
+// range described by `pos` and `n`: the character at index `pos` goes
+// through cp.ToTitle, every other visited character through cp.ToLower.
+inline TString ToTitle(TString s, const CodePage& cp, size_t pos = 0, size_t n = TString::npos) {
+    auto titleThenLower = [pos, &cp](size_t i, char c) {
+        if (i == pos) {
+            return cp.ToTitle(c);
+        }
+        return cp.ToLower(c);
+    };
+    s.Transform(titleThenLower, pos, n);
+    return s;
+}