/** Implementation of string encodings support * * Copyright (c) 2000-2019, Jeroen T. Vermeulen. * * See COPYING for copyright license. If you did not receive a file called * COPYING with this source code, please notify the distributor of this mistake, * or contact the author. */ #include "pqxx/compiler-internal.hxx" #include "pqxx/except.hxx" #include "pqxx/internal/encodings.hxx" #include <cstring> #include <iomanip> #include <map> #include <sstream> using namespace pqxx::internal; extern "C" { #include "libpq-fe.h" } // Internal helper functions namespace { /// Extract byte from buffer, return as unsigned char. unsigned char get_byte(const char buffer[], std::string::size_type offset) { return static_cast<unsigned char>(buffer[offset]); } [[noreturn]] void throw_for_encoding_error( const char* encoding_name, const char buffer[], std::string::size_type start, std::string::size_type count ) { std::stringstream s; s << "Invalid byte sequence for encoding " << encoding_name << " at byte " << start << ": " << std::hex << std::setw(2) << std::setfill('0') ; for (std::string::size_type i{0}; i < count; ++i) { s << "0x" << static_cast<unsigned int>(get_byte(buffer, start + i)); if (i + 1 < count) s << " "; } throw pqxx::argument_error{s.str()}; } /// Does value lie between bottom and top, inclusive? constexpr bool between_inc(unsigned char value, unsigned bottom, unsigned top) { return value >= bottom and value <= top; } /* EUC-JP and EUC-JIS-2004 represent slightly different code points but iterate the same: * https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-JP * http://x0213.org/codetable/index.en.html */ std::string::size_type next_seq_for_euc_jplike( const char buffer[], std::string::size_type buffer_len, std::string::size_type start, const char encoding_name[]) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (start + 2 > buffer_len) throw_for_encoding_error(encoding_name, buffer, start, 1); const auto byte2 = get_byte(buffer, start + 1); if (byte1 == 0x8e) { if (not between_inc(byte2, 0xa1, 0xfe)) throw_for_encoding_error(encoding_name, buffer, start, 2); return start + 2; } if (between_inc(byte1, 0xa1, 0xfe)) { if (not between_inc(byte2, 0xa1, 0xfe)) throw_for_encoding_error(encoding_name, buffer, start, 2); return start + 2; } if (byte1 == 0x8f and start + 3 <= buffer_len) { const auto byte3 = get_byte(buffer, start + 2); if ( not between_inc(byte2, 0xa1, 0xfe) or not between_inc(byte3, 0xa1, 0xfe) ) throw_for_encoding_error(encoding_name, buffer, start, 3); return start + 3; } throw_for_encoding_error(encoding_name, buffer, start, 1); } /* As far as I can tell, for the purposes of iterating the only difference between SJIS and SJIS-2004 is increased range in the first byte of two-byte sequences (0xEF increased to 0xFC). Officially, that is; apparently the version of SJIS used by Postgres has the same range as SJIS-2004. They both have increased range over the documented versions, not having the even/odd restriction for the first byte in 2-byte sequences. */ // https://en.wikipedia.org/wiki/Shift_JIS#Shift_JIS_byte_map // http://x0213.org/codetable/index.en.html std::string::size_type next_seq_for_sjislike( const char buffer[], std::string::size_type buffer_len, std::string::size_type start, const char* encoding_name ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80 or between_inc(byte1, 0xa1, 0xdf)) return start + 1; if ( not between_inc(byte1, 0x81, 0x9f) and not between_inc(byte1, 0xe0, 0xfc) ) throw_for_encoding_error(encoding_name, buffer, start, 1); if (start + 2 > buffer_len) throw_for_encoding_error( encoding_name, buffer, start, buffer_len - start); const auto byte2 = get_byte(buffer, start + 1); if (byte2 == 0x7f) throw_for_encoding_error(encoding_name, buffer, start, 2); if (between_inc(byte2, 0x40, 0x9e) or between_inc(byte2, 0x9f, 0xfc)) return start + 2; throw_for_encoding_error(encoding_name, buffer, start, 2); } } // namespace // Implement template specializations first namespace pqxx { namespace internal { template<encoding_group> struct glyph_scanner { static std::string::size_type call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start); }; template<> std::string::size_type glyph_scanner<encoding_group::MONOBYTE>::call( const char /* buffer */[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; else return start + 1; } // https://en.wikipedia.org/wiki/Big5#Organization template<> std::string::size_type glyph_scanner<encoding_group::BIG5>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (not between_inc(byte1, 0x81, 0xfe) or (start + 2 > buffer_len)) throw_for_encoding_error("BIG5", buffer, start, 1); const auto byte2 = get_byte(buffer, start + 1); if ( not between_inc(byte2, 0x40, 0x7e) and not between_inc(byte2, 0xa1, 0xfe)) throw_for_encoding_error("BIG5", buffer, start, 2); return start + 2; } /* The PostgreSQL documentation claims that the EUC_* encodings are 1-3 bytes each, but other documents explain that the EUC sets can contain 1-(2,3,4) bytes depending on the specific extension: EUC_CN : 1-2 EUC_JP : 1-3 EUC_JIS_2004: 1-2 EUC_KR : 1-2 EUC_TW : 1-4 */ // https://en.wikipedia.org/wiki/GB_2312#EUC-CN template<> std::string::size_type glyph_scanner<encoding_group::EUC_CN>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (not between_inc(byte1, 0xa1, 0xf7) or start + 2 > buffer_len) throw_for_encoding_error("EUC_CN", buffer, start, 1); const auto byte2 = get_byte(buffer, start + 1); if (not between_inc(byte2, 0xa1, 0xfe)) throw_for_encoding_error("EUC_CN", buffer, start, 2); return start + 2; } template<> std::string::size_type glyph_scanner<encoding_group::EUC_JP>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { return next_seq_for_euc_jplike(buffer, buffer_len, start, "EUC_JP"); } template<> std::string::size_type glyph_scanner<encoding_group::EUC_JIS_2004>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { return next_seq_for_euc_jplike(buffer, buffer_len, start, "EUC_JIS_2004"); } // https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-KR template<> std::string::size_type glyph_scanner<encoding_group::EUC_KR>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (not between_inc(byte1, 0xa1, 0xfe) or start + 2 > buffer_len) throw_for_encoding_error("EUC_KR", buffer, start, 1); const auto byte2 = get_byte(buffer, start + 1); if (not between_inc(byte2, 0xa1, 0xfe)) throw_for_encoding_error("EUC_KR", buffer, start, 1); return start + 2; } // https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-TW template<> std::string::size_type glyph_scanner<encoding_group::EUC_TW>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (start + 2 > buffer_len) throw_for_encoding_error("EUC_KR", buffer, start, 1); const auto byte2 = get_byte(buffer, start + 1); if (between_inc(byte1, 0xa1, 0xfe)) { if (not between_inc(byte2, 0xa1, 0xfe)) throw_for_encoding_error("EUC_KR", buffer, start, 2); return start + 2; } if (byte1 != 0x8e or start + 4 > buffer_len) throw_for_encoding_error("EUC_KR", buffer, start, 1); if ( between_inc(byte2, 0xa1, 0xb0) and between_inc(get_byte(buffer, start + 2), 0xa1, 0xfe) and between_inc(get_byte(buffer, start + 3), 0xa1, 0xfe) ) return start + 4; throw_for_encoding_error("EUC_KR", buffer, start, 4); } // https://en.wikipedia.org/wiki/GB_18030#Mapping template<> std::string::size_type glyph_scanner<encoding_group::GB18030>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (between_inc(byte1, 0x80, 0xff)) return start + 1; if (start + 2 > buffer_len) throw_for_encoding_error("GB18030", buffer, start, buffer_len - start); const auto byte2 = get_byte(buffer, start + 1); if (between_inc(byte2, 0x40, 0xfe)) { if (byte2 == 0x7f) throw_for_encoding_error("GB18030", buffer, start, 2); return start + 2; } if (start + 4 > buffer_len) throw_for_encoding_error("GB18030", buffer, start, buffer_len - start); if ( between_inc(byte2, 0x30, 0x39) and between_inc(get_byte(buffer, start + 2), 0x81, 0xfe) and between_inc(get_byte(buffer, start + 3), 0x30, 0x39) ) return start + 4; throw_for_encoding_error("GB18030", buffer, start, 4); } // https://en.wikipedia.org/wiki/GBK_(character_encoding)#Encoding template<> std::string::size_type glyph_scanner<encoding_group::GBK>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (start + 2 > buffer_len) throw_for_encoding_error("GBK", buffer, start, 1); const auto byte2 = get_byte(buffer, start + 1); if ( (between_inc(byte1, 0xa1, 0xa9) and between_inc(byte2, 0xa1, 0xfe)) or (between_inc(byte1, 0xb0, 0xf7) and between_inc(byte2, 0xa1, 0xfe)) or ( between_inc(byte1, 0x81, 0xa0) and between_inc(byte2, 0x40, 0xfe) and byte2 != 0x7f ) or ( between_inc(byte1, 0xaa, 0xfe) and between_inc(byte2, 0x40, 0xa0) and byte2 != 0x7f ) or ( between_inc(byte1, 0xa8, 0xa9) and between_inc(byte2, 0x40, 0xa0) and byte2 != 0x7f ) or (between_inc(byte1, 0xaa, 0xaf) and between_inc(byte2, 0xa1, 0xfe)) or (between_inc(byte1, 0xf8, 0xfe) and between_inc(byte2, 0xa1, 0xfe)) or ( between_inc(byte1, 0xa1, 0xa7) and between_inc(byte2, 0x40, 0xa0) and byte2 != 0x7f ) ) return start + 2; throw_for_encoding_error("GBK", buffer, start, 2); } /* The PostgreSQL documentation claims that the JOHAB encoding is 1-3 bytes, but "CJKV Information Processing" describes it (actually just the Hangul portion) as "three five-bit segments" that reside inside 16 bits (2 bytes). CJKV Information Processing by Ken Lunde, pg. 269: https://bit.ly/2BEOu5V */ template<> std::string::size_type glyph_scanner<encoding_group::JOHAB>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (start + 2 > buffer_len) throw_for_encoding_error("JOHAB", buffer, start, 1); const auto byte2 = get_byte(buffer, start); if ( ( between_inc(byte1, 0x84, 0xd3) and (between_inc(byte2, 0x41, 0x7e) or between_inc(byte2, 0x81, 0xfe)) ) or ( (between_inc(byte1, 0xd8, 0xde) or between_inc(byte1, 0xe0, 0xf9)) and (between_inc(byte2, 0x31, 0x7e) or between_inc(byte2, 0x91, 0xfe)) ) ) return start + 2; throw_for_encoding_error("JOHAB", buffer, start, 2); } /* PostgreSQL's MULE_INTERNAL is the emacs rather than Xemacs implementation; see the server/mb/pg_wchar.h PostgreSQL header file. This is implemented according to the description in said header file, but I was unable to get it to successfully iterate a MULE-encoded test CSV generated using PostgreSQL 9.2.23. Use this at your own risk. */ template<> std::string::size_type glyph_scanner<encoding_group::MULE_INTERNAL>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (start + 2 > buffer_len) throw_for_encoding_error("MULE_INTERNAL", buffer, start, 1); const auto byte2 = get_byte(buffer, start + 1); if (between_inc(byte1, 0x81, 0x8d) and byte2 >= 0xA0) return start + 2; if (start + 3 > buffer_len) throw_for_encoding_error("MULE_INTERNAL", buffer, start, 2); if ( ( (byte1 == 0x9A and between_inc(byte2, 0xa0, 0xdf)) or (byte1 == 0x9B and between_inc(byte2, 0xe0, 0xef)) or (between_inc(byte1, 0x90, 0x99) and byte2 >= 0xa0) ) and ( byte2 >= 0xA0 ) ) return start + 3; if (start + 4 > buffer_len) throw_for_encoding_error("MULE_INTERNAL", buffer, start, 3); if ( ( (byte1 == 0x9C and between_inc(byte2, 0xf0, 0xf4)) or (byte1 == 0x9D and between_inc(byte2, 0xf5, 0xfe)) ) and get_byte(buffer, start + 2) >= 0xa0 and get_byte(buffer, start + 4) >= 0xa0 ) return start + 4; throw_for_encoding_error("MULE_INTERNAL", buffer, start, 4); } template<> std::string::size_type glyph_scanner<encoding_group::SJIS>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { return next_seq_for_sjislike(buffer, buffer_len, start, "SJIS"); } template<> std::string::size_type glyph_scanner<encoding_group::SHIFT_JIS_2004>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { return next_seq_for_sjislike(buffer, buffer_len, start, "SHIFT_JIS_2004"); } // https://en.wikipedia.org/wiki/Unified_Hangul_Code template<> std::string::size_type glyph_scanner<encoding_group::UHC>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (start + 2 > buffer_len) throw_for_encoding_error("UHC", buffer, start, buffer_len - start); const auto byte2 = get_byte(buffer, start + 1); if (between_inc(byte1, 0x80, 0xc6)) { if ( between_inc(byte2, 0x41, 0x5a) or between_inc(byte2, 0x61, 0x7a) or between_inc(byte2, 0x80, 0xfe) ) return start + 2; throw_for_encoding_error("UHC", buffer, start, 2); } if (between_inc(byte1, 0xa1, 0xfe)) { if (not between_inc(byte2, 0xa1, 0xfe)) throw_for_encoding_error("UHC", buffer, start, 2); return start + 2; } throw_for_encoding_error("UHC", buffer, start, 1); } // https://en.wikipedia.org/wiki/UTF-8#Description template<> std::string::size_type glyph_scanner<encoding_group::UTF8>::call( const char buffer[], std::string::size_type buffer_len, std::string::size_type start ) { if (start >= buffer_len) return std::string::npos; const auto byte1 = get_byte(buffer, start); if (byte1 < 0x80) return start + 1; if (start + 2 > buffer_len) throw_for_encoding_error("UTF8", buffer, start, buffer_len - start); const auto byte2 = get_byte(buffer, start + 1); if (between_inc(byte1, 0xc0, 0xdf)) { if (not between_inc(byte2, 0x80, 0xbf)) throw_for_encoding_error("UTF8", buffer, start, 2); return start + 2; } if (start + 3 > buffer_len) throw_for_encoding_error("UTF8", buffer, start, buffer_len - start); const auto byte3 = get_byte(buffer, start + 2); if (between_inc(byte1, 0xe0, 0xef)) { if (between_inc(byte2, 0x80, 0xbf) and between_inc(byte3, 0x80, 0xbf)) return start + 3; throw_for_encoding_error("UTF8", buffer, start, 3); } if (start + 4 > buffer_len) throw_for_encoding_error("UTF8", buffer, start, buffer_len - start); if (between_inc(byte1, 0xf0, 0xf7)) { if ( between_inc(byte2, 0x80, 0xbf) and between_inc(byte3, 0x80, 0xbf) and between_inc(get_byte(buffer, start + 3), 0x80, 0xbf) ) return start + 4; throw_for_encoding_error("UTF8", buffer, start, 4); } throw_for_encoding_error("UTF8", buffer, start, 1); } const char *name_encoding(int encoding_id) { return pg_encoding_to_char(encoding_id); } encoding_group enc_group(int libpq_enc_id) { return enc_group(name_encoding(libpq_enc_id)); } encoding_group enc_group(const std::string& encoding_name) { static const std::map<std::string, encoding_group> encoding_map{ {"BIG5", encoding_group::BIG5}, {"EUC_CN", encoding_group::EUC_CN}, {"EUC_JP", encoding_group::EUC_JP}, {"EUC_JIS_2004", encoding_group::EUC_JIS_2004}, {"EUC_KR", encoding_group::EUC_KR}, {"EUC_TW", encoding_group::EUC_TW}, {"GB18030", encoding_group::GB18030}, {"GBK", encoding_group::GBK}, {"ISO_8859_5", encoding_group::MONOBYTE}, {"ISO_8859_6", encoding_group::MONOBYTE}, {"ISO_8859_7", encoding_group::MONOBYTE}, {"ISO_8859_8", encoding_group::MONOBYTE}, {"JOHAB", encoding_group::JOHAB}, {"KOI8R", encoding_group::MONOBYTE}, {"KOI8U", encoding_group::MONOBYTE}, {"LATIN1", encoding_group::MONOBYTE}, {"LATIN2", encoding_group::MONOBYTE}, {"LATIN3", encoding_group::MONOBYTE}, {"LATIN4", encoding_group::MONOBYTE}, {"LATIN5", encoding_group::MONOBYTE}, {"LATIN6", encoding_group::MONOBYTE}, {"LATIN7", encoding_group::MONOBYTE}, {"LATIN8", encoding_group::MONOBYTE}, {"LATIN9", encoding_group::MONOBYTE}, {"LATIN10", encoding_group::MONOBYTE}, {"MULE_INTERNAL", encoding_group::MULE_INTERNAL}, {"SJIS", encoding_group::SJIS}, {"SHIFT_JIS_2004", encoding_group::SHIFT_JIS_2004}, {"SQL_ASCII", encoding_group::MONOBYTE}, {"UHC", encoding_group::UHC}, {"UTF8", encoding_group::UTF8}, {"WIN866", encoding_group::MONOBYTE}, {"WIN874", encoding_group::MONOBYTE}, {"WIN1250", encoding_group::MONOBYTE}, {"WIN1251", encoding_group::MONOBYTE}, {"WIN1252", encoding_group::MONOBYTE}, {"WIN1253", encoding_group::MONOBYTE}, {"WIN1254", encoding_group::MONOBYTE}, {"WIN1255", encoding_group::MONOBYTE}, {"WIN1256", encoding_group::MONOBYTE}, {"WIN1257", encoding_group::MONOBYTE}, {"WIN1258", encoding_group::MONOBYTE}, }; const auto found_encoding_group = encoding_map.find(encoding_name); if (found_encoding_group == encoding_map.end()) throw std::invalid_argument{ "unrecognized encoding '" + encoding_name + "'" }; return found_encoding_group->second; } /// Look up instantiation @c T<enc>::call at runtime. /** Here, "T" is a struct template with a static member function "call", whose * type is "F". * * The return value is a pointer to the "call" member function for the * instantiation of T for encoding group enc. */ template<template<encoding_group> class T, typename F> inline F *for_encoding(encoding_group enc) { #define CASE_GROUP(ENC) \ case encoding_group::ENC: return T<encoding_group::ENC>::call switch (enc) { CASE_GROUP(MONOBYTE); CASE_GROUP(BIG5); CASE_GROUP(EUC_CN); CASE_GROUP(EUC_JP); CASE_GROUP(EUC_JIS_2004); CASE_GROUP(EUC_KR); CASE_GROUP(EUC_TW); CASE_GROUP(GB18030); CASE_GROUP(GBK); CASE_GROUP(JOHAB); CASE_GROUP(MULE_INTERNAL); CASE_GROUP(SJIS); CASE_GROUP(SHIFT_JIS_2004); CASE_GROUP(UHC); CASE_GROUP(UTF8); } throw pqxx::usage_error{ "Unsupported encoding group code " + to_string(int(enc)) + "."}; #undef CASE_GROUP } glyph_scanner_func *get_glyph_scanner(encoding_group enc) { return for_encoding<glyph_scanner, glyph_scanner_func>(enc); } template<encoding_group E> struct char_finder { static std::string::size_type call( const std::string &haystack, char needle, std::string::size_type start) { const auto buffer = haystack.c_str(); const auto size = haystack.size(); for ( auto here = start; here + 1 <= size; here = glyph_scanner<E>::call(buffer, size, here) ) { if (haystack[here] == needle) return here; } return std::string::npos; } }; template<encoding_group E> struct string_finder { static std::string::size_type call( const std::string &haystack, const std::string &needle, std::string::size_type start) { const auto buffer = haystack.c_str(); const auto size = haystack.size(); const auto needle_size = needle.size(); for ( auto here = start; here + needle_size <= size; here = glyph_scanner<E>::call(buffer, size, here) ) { if (std::memcmp(buffer + here, needle.c_str(), needle_size) == 0) return here; } return std::string::npos; } }; std::string::size_type find_with_encoding( encoding_group enc, const std::string& haystack, char needle, std::string::size_type start ) { using finder_func = std::string::size_type( const std::string &haystack, char needle, std::string::size_type start); const auto finder = for_encoding<char_finder, finder_func>(enc); return finder(haystack, needle, start); } std::string::size_type find_with_encoding( encoding_group enc, const std::string& haystack, const std::string& needle, std::string::size_type start ) { using finder_func = std::string::size_type( const std::string &haystack, const std::string &needle, std::string::size_type start); const auto finder = for_encoding<string_finder, finder_func>(enc); return finder(haystack, needle, start); } #undef DISPATCH_ENCODING_OPERATION } // namespace pqxx::internal } // namespace pqxx