aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/tools/python3/Modules/cjkcodecs/_codecs_iso2022.c
diff options
context:
space:
mode:
authorAlexSm <alex@ydb.tech>2024-03-05 10:40:59 +0100
committerGitHub <noreply@github.com>2024-03-05 12:40:59 +0300
commit1ac13c847b5358faba44dbb638a828e24369467b (patch)
tree07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Modules/cjkcodecs/_codecs_iso2022.c
parentffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff)
downloadydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz
Library import 16 (#2433)
Co-authored-by: robot-piglet <robot-piglet@yandex-team.com> Co-authored-by: deshevoy <deshevoy@yandex-team.com> Co-authored-by: robot-contrib <robot-contrib@yandex-team.com> Co-authored-by: thegeorg <thegeorg@yandex-team.com> Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com> Co-authored-by: svidyuk <svidyuk@yandex-team.com> Co-authored-by: shadchin <shadchin@yandex-team.com> Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com> Co-authored-by: innokentii <innokentii@yandex-team.com> Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com> Co-authored-by: snermolaev <snermolaev@yandex-team.com> Co-authored-by: dimdim11 <dimdim11@yandex-team.com> Co-authored-by: kickbutt <kickbutt@yandex-team.com> Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com> Co-authored-by: korsunandrei <korsunandrei@yandex-team.com> Co-authored-by: petrk <petrk@yandex-team.com> Co-authored-by: miroslav2 <miroslav2@yandex-team.com> Co-authored-by: serjflint <serjflint@yandex-team.com> Co-authored-by: akhropov <akhropov@yandex-team.com> Co-authored-by: prettyboy <prettyboy@yandex-team.com> Co-authored-by: ilikepugs <ilikepugs@yandex-team.com> Co-authored-by: hiddenpath <hiddenpath@yandex-team.com> Co-authored-by: mikhnenko <mikhnenko@yandex-team.com> Co-authored-by: spreis <spreis@yandex-team.com> Co-authored-by: andreyshspb <andreyshspb@yandex-team.com> Co-authored-by: dimaandreev <dimaandreev@yandex-team.com> Co-authored-by: rashid <rashid@yandex-team.com> Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com> Co-authored-by: r-vetrov <r-vetrov@yandex-team.com> Co-authored-by: ypodlesov <ypodlesov@yandex-team.com> Co-authored-by: zaverden <zaverden@yandex-team.com> Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com> Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com> Co-authored-by: v-korovin <v-korovin@yandex-team.com> Co-authored-by: arikon <arikon@yandex-team.com> Co-authored-by: khoden <khoden@yandex-team.com> 
Co-authored-by: psydmm <psydmm@yandex-team.com> Co-authored-by: robot-javacom <robot-javacom@yandex-team.com> Co-authored-by: dtorilov <dtorilov@yandex-team.com> Co-authored-by: sennikovmv <sennikovmv@yandex-team.com> Co-authored-by: hcpp <hcpp@ydb.tech>
Diffstat (limited to 'contrib/tools/python3/Modules/cjkcodecs/_codecs_iso2022.c')
-rw-r--r--contrib/tools/python3/Modules/cjkcodecs/_codecs_iso2022.c1150
1 files changed, 1150 insertions, 0 deletions
diff --git a/contrib/tools/python3/Modules/cjkcodecs/_codecs_iso2022.c b/contrib/tools/python3/Modules/cjkcodecs/_codecs_iso2022.c
new file mode 100644
index 0000000000..e8835ad090
--- /dev/null
+++ b/contrib/tools/python3/Modules/cjkcodecs/_codecs_iso2022.c
@@ -0,0 +1,1150 @@
+/*
+ * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
+ *
+ * Written by Hye-Shik Chang <perky@FreeBSD.org>
+ */
+
+#define USING_IMPORTED_MAPS
+#define USING_BINARY_PAIR_SEARCH
+#define EXTERN_JISX0213_PAIR
+#define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
+#define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
+
+/* Extra per-module state fields: one pointer per imported mapping table.
+ * The *_init() functions in this file fill them through IMPORT_MAP(), and
+ * the TRYMAP_*_ST() lookups refer to them by charset name.  The macro is
+ * defined before cjkcodecs.h is included; presumably that header splices it
+ * into the module-state struct -- confirm there. */
+#define CJK_MOD_SPECIFIC_STATE \
+ /* kr */ \
+ const encode_map *cp949_encmap; \
+ const decode_map *ksx1001_decmap; \
+ \
+ /* jp */ \
+ const encode_map *jisxcommon_encmap; \
+ const decode_map *jisx0208_decmap; \
+ const decode_map *jisx0212_decmap; \
+ const encode_map *jisx0213_bmp_encmap; \
+ const decode_map *jisx0213_1_bmp_decmap; \
+ const decode_map *jisx0213_2_bmp_decmap; \
+ const encode_map *jisx0213_emp_encmap; \
+ const decode_map *jisx0213_1_emp_decmap; \
+ const decode_map *jisx0213_2_emp_decmap; \
+ \
+ /* cn */ \
+ const encode_map *gbcommon_encmap; \
+ const decode_map *gb2312_decmap;
+
+
+#include "cjkcodecs.h"
+#include "alg_jisx0201.h"
+#include "emu_jisx0213_2000.h"
+#include "mappings_jisx0213_pair.h"
+
+/* STATE
+
+   state->c[0-3]
+
+       00000000
+       ||^^^^^|
+       |+-----+----  G0-3 Character Set
+       +-----------  Is G0-3 double byte?
+
+   state->c[4]
+
+       00000000
+             ||
+             |+----  Locked-Shift?
+             +-----  ESC Throughout
+*/
+
+/* Control characters the codec reacts to. */
+#define ESC 0x1B
+#define SO 0x0E
+#define SI 0x0F
+#define LF 0x0A
+
+/* iso2022processesc() gives up looking for the final byte of an escape
+ * sequence once its scan index reaches this value. */
+#define MAX_ESCSEQLEN 16
+
+/* Single-byte character sets, identified by the final byte of their
+ * designating escape sequence. */
+#define CHARSET_ISO8859_1 'A'
+#define CHARSET_ASCII 'B'
+#define CHARSET_ISO8859_7 'F'
+#define CHARSET_JISX0201_K 'I'
+#define CHARSET_JISX0201_R 'J'
+
+/* Double-byte character sets: final byte with the CHARSET_DBCS bit set.
+ * CHARSET_DBCS is defined a few lines below; that is fine because macros
+ * are only expanded where these names are used. */
+#define CHARSET_GB2312 ('A'|CHARSET_DBCS)
+#define CHARSET_JISX0208 ('B'|CHARSET_DBCS)
+#define CHARSET_KSX1001 ('C'|CHARSET_DBCS)
+#define CHARSET_JISX0212 ('D'|CHARSET_DBCS)
+#define CHARSET_GB2312_8565 ('E'|CHARSET_DBCS)
+#define CHARSET_CNS11643_1 ('G'|CHARSET_DBCS)
+#define CHARSET_CNS11643_2 ('H'|CHARSET_DBCS)
+#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
+#define CHARSET_JISX0213_2 ('P'|CHARSET_DBCS)
+#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
+#define CHARSET_JISX0208_O ('@'|CHARSET_DBCS)
+
+/* ESCMARK() strips the double-byte bit again, giving back the final byte
+ * that is written in an escape sequence. */
+#define CHARSET_DBCS 0x80
+#define ESCMARK(mark) ((mark) & 0x7f)
+
+/* IS_ESCEND: byte that terminates an escape sequence ('@' or 'A'..'Z').
+ * IS_ISO2022ESC: second byte of an escape sequence this codec parses. */
+#define IS_ESCEND(c) (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
+#define IS_ISO2022ESC(c2) \
+ ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
+ (c2) == '.' || (c2) == '&')
+ /* this is not a complete list of ISO-2022 escape sequence headers.
+ * but, it's enough to implement CJK instances of iso-2022. */
+
+/* Sentinel results of the per-charset encoder/decoder callbacks. */
+#define MAP_UNMAPPABLE 0xFFFF
+#define MAP_MULTIPLE_AVAIL 0xFFFE /* for JIS X 0213 */
+
+/* Flag bits kept in state->c[4]. */
+#define F_SHIFTED 0x01
+#define F_ESCTHROUGHOUT 0x02
+
+/* Accessors for the designated character set of G0..G3, stored in
+ * state->c[0..3].  They expect a variable named `state` in scope. */
+#define STATE_SETG(dn, v) do { ((state)->c[dn]) = (v); } while (0)
+#define STATE_GETG(dn) ((state)->c[dn])
+
+#define STATE_G0 STATE_GETG(0)
+#define STATE_G1 STATE_GETG(1)
+#define STATE_G2 STATE_GETG(2)
+#define STATE_G3 STATE_GETG(3)
+#define STATE_SETG0(v) STATE_SETG(0, v)
+#define STATE_SETG1(v) STATE_SETG(1, v)
+#define STATE_SETG2(v) STATE_SETG(2, v)
+#define STATE_SETG3(v) STATE_SETG(3, v)
+
+/* Accessors for the F_* flag byte in state->c[4]. */
+#define STATE_SETFLAG(f) do { ((state)->c[4]) |= (f); } while (0)
+#define STATE_GETFLAG(f) ((state)->c[4] & (f))
+#define STATE_CLEARFLAG(f) do { ((state)->c[4]) &= ~(f); } while (0)
+#define STATE_CLEARFLAGS() do { ((state)->c[4]) = 0; } while (0)
+
+/* Per-codec configuration, reached through a variable named `codec` that
+ * must be in scope. */
+#define ISO2022_CONFIG ((const struct iso2022_config *)(codec->config))
+#define CONFIG_ISSET(flag) (ISO2022_CONFIG->flags & (flag))
+#define CONFIG_DESIGNATIONS (ISO2022_CONFIG->designations)
+
+/* iso2022_config.flags */
+/* NO_SHIFT: the decoder passes SI/SO through as ordinary characters.
+ * USE_G2: the decoder accepts "ESC ." designations and "ESC N" (SS2).
+ * USE_JISX0208_EXT: the decoder accepts the "ESC & @" prefixed form. */
+#define NO_SHIFT 0x01
+#define USE_G2 0x02
+#define USE_JISX0208_EXT 0x04
+
+/*-*- internal data structures -*-*/
+
+/* Callback types stored in struct iso2022_designation.
+ *  - init:   returns 0 on success, non-zero on failure (see CODEC_INIT).
+ *  - decode: maps the bytes at `data` to a code point, or MAP_UNMAPPABLE.
+ *  - encode: maps data[0] (and data[1] when *length is 2) to a code, or
+ *            MAP_UNMAPPABLE / MAP_MULTIPLE_AVAIL; may update *length. */
+typedef int (*iso2022_init_func)(const MultibyteCodec *codec);
+typedef Py_UCS4 (*iso2022_decode_func)(const MultibyteCodec *codec,
+ const unsigned char *data);
+typedef DBCHAR (*iso2022_encode_func)(const MultibyteCodec *codec,
+ const Py_UCS4 *data,
+ Py_ssize_t *length);
+
+/* One character set a codec may designate.  Tables of these end with an
+ * entry whose mark is 0. */
+struct iso2022_designation {
+ unsigned char mark; /* CHARSET_* value; 0 terminates a table */
+ unsigned char plane; /* 0 = G0, 1 = G1, 2 = G2 (encoder handles 0 and 1) */
+ unsigned char width; /* bytes per character: 1 or 2 */
+ iso2022_init_func initializer; /* may be NULL */
+ iso2022_decode_func decoder;
+ iso2022_encode_func encoder;
+};
+
+/* Static configuration of one ISO-2022 variant, read through the
+ * ISO2022_CONFIG / CONFIG_ISSET / CONFIG_DESIGNATIONS macros. */
+struct iso2022_config {
+ int flags; /* bitwise OR of NO_SHIFT, USE_G2, USE_JISX0208_EXT */
+ const struct iso2022_designation *designations; /* non-ascii desigs */
+};
+
+/*-*- iso-2022 codec implementation -*-*/
+
+/* Codec-level initializer: run the optional initializer of every
+ * designation configured for this codec.  Returns 0 on success and -1 as
+ * soon as one initializer reports failure. */
+CODEC_INIT(iso2022)
+{
+    const struct iso2022_designation *entry = CONFIG_DESIGNATIONS;
+
+    while (entry->mark) {
+        if (entry->initializer != NULL) {
+            if (entry->initializer(codec) != 0) {
+                return -1;
+            }
+        }
+        entry++;
+    }
+    return 0;
+}
+
+/* Put the encoder in its initial state: G0 and G1 designate ASCII and no
+ * flag is set.  Always returns 0. */
+ENCODER_INIT(iso2022)
+{
+    STATE_SETG1(CHARSET_ASCII);
+    STATE_SETG0(CHARSET_ASCII);
+    STATE_CLEARFLAGS();
+    return 0;
+}
+
+/* Emit whatever is needed to bring the output stream back to plain ASCII:
+ * SI if shifted out, then "ESC ( B" if G0 is not ASCII.  Returns 0. */
+ENCODER_RESET(iso2022)
+{
+    /* Leave the shifted (G1) state first. */
+    if (STATE_GETFLAG(F_SHIFTED) != 0) {
+        WRITEBYTE1(SI);
+        STATE_CLEARFLAG(F_SHIFTED);
+        NEXT_OUT(1);
+    }
+
+    /* Then re-designate ASCII to G0. */
+    if (STATE_G0 != CHARSET_ASCII) {
+        WRITEBYTE3(ESC, '(', 'B');
+        STATE_SETG0(CHARSET_ASCII);
+        NEXT_OUT(3);
+    }
+
+    return 0;
+}
+
+/* Encode the input characters from *inpos up to inlen.
+ *
+ * Characters below 0x80 are written as ASCII.  Any other character is
+ * offered to the configured designations in table order; the first one
+ * whose encoder maps it is used, after emitting the escape sequence and/or
+ * SO/SI needed to select it.
+ *
+ * Returns 0 when all input was consumed, 1 when no designation can encode
+ * the current character, MBERR_TOOFEW when a possible pair start is the
+ * last available character and MBENC_FLUSH is not set, and MBERR_INTERNAL
+ * for a designation on a plane other than G0/G1. */
+ENCODER(iso2022)
+{
+ while (*inpos < inlen) {
+ const struct iso2022_designation *dsg;
+ DBCHAR encoded;
+ Py_UCS4 c = INCHAR1;
+ Py_ssize_t insize;
+
+ /* ASCII: make sure G0 is ASCII and we are shifted in, then copy. */
+ if (c < 0x80) {
+ if (STATE_G0 != CHARSET_ASCII) {
+ WRITEBYTE3(ESC, '(', 'B');
+ STATE_SETG0(CHARSET_ASCII);
+ NEXT_OUT(3);
+ }
+ if (STATE_GETFLAG(F_SHIFTED)) {
+ WRITEBYTE1(SI);
+ STATE_CLEARFLAG(F_SHIFTED);
+ NEXT_OUT(1);
+ }
+ WRITEBYTE1((unsigned char)c);
+ NEXT(1, 1);
+ continue;
+ }
+
+ insize = 1;
+
+ /* Find the first designation able to encode c.  The loop leaves dsg
+ * on the sentinel entry when none can. */
+ encoded = MAP_UNMAPPABLE;
+ for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
+ Py_UCS4 buf[2] = {c, 0};
+ Py_ssize_t length = 1;
+ encoded = dsg->encoder(codec, buf, &length);
+ if (encoded == MAP_MULTIPLE_AVAIL) {
+ /* this implementation won't work for pair
+ * of non-bmp characters. */
+ if (inlen - *inpos < 2) {
+ if (!(flags & MBENC_FLUSH))
+ return MBERR_TOOFEW;
+ /* flushing: ask the encoder for the unpaired form */
+ length = -1;
+ }
+ else {
+ buf[1] = INCHAR2;
+ length = 2;
+ }
+ /* second call; the encoder reports through length how many
+ * input characters the returned code covers */
+ encoded = dsg->encoder(codec, buf, &length);
+ if (encoded != MAP_UNMAPPABLE) {
+ insize = length;
+ break;
+ }
+ }
+ else if (encoded != MAP_UNMAPPABLE)
+ break;
+ }
+
+ if (!dsg->mark)
+ return 1;
+ assert(dsg->width == 1 || dsg->width == 2);
+
+ /* Select the character set on its plane if it is not current. */
+ switch (dsg->plane) {
+ case 0: /* G0 */
+ if (STATE_GETFLAG(F_SHIFTED)) {
+ WRITEBYTE1(SI);
+ STATE_CLEARFLAG(F_SHIFTED);
+ NEXT_OUT(1);
+ }
+ if (STATE_G0 != dsg->mark) {
+ if (dsg->width == 1) {
+ WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark));
+ STATE_SETG0(dsg->mark);
+ NEXT_OUT(3);
+ }
+ /* JIS X 0208 gets the 3-byte form "ESC $ B" */
+ else if (dsg->mark == CHARSET_JISX0208) {
+ WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark));
+ STATE_SETG0(dsg->mark);
+ NEXT_OUT(3);
+ }
+ else {
+ WRITEBYTE4(ESC, '$', '(',
+ ESCMARK(dsg->mark));
+ STATE_SETG0(dsg->mark);
+ NEXT_OUT(4);
+ }
+ }
+ break;
+ case 1: /* G1 */
+ if (STATE_G1 != dsg->mark) {
+ if (dsg->width == 1) {
+ WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark));
+ STATE_SETG1(dsg->mark);
+ NEXT_OUT(3);
+ }
+ else {
+ WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark));
+ STATE_SETG1(dsg->mark);
+ NEXT_OUT(4);
+ }
+ }
+ /* G1 characters are written in the shifted-out state */
+ if (!STATE_GETFLAG(F_SHIFTED)) {
+ WRITEBYTE1(SO);
+ STATE_SETFLAG(F_SHIFTED);
+ NEXT_OUT(1);
+ }
+ break;
+ default: /* G2 and G3 is not supported: no encoding in
+ * CJKCodecs are using them yet */
+ return MBERR_INTERNAL;
+ }
+
+ /* Write the code itself: one byte, or high byte then low byte. */
+ if (dsg->width == 1) {
+ WRITEBYTE1((unsigned char)encoded);
+ NEXT_OUT(1);
+ }
+ else {
+ WRITEBYTE2(encoded >> 8, encoded & 0xff);
+ NEXT_OUT(2);
+ }
+ NEXT_INCHAR(insize);
+ }
+
+ return 0;
+}
+
+/* Put the decoder in its initial state: G0, G1 and G2 designate ASCII and
+ * no flag is set.  Always returns 0. */
+DECODER_INIT(iso2022)
+{
+    STATE_SETG2(CHARSET_ASCII);
+    STATE_SETG1(CHARSET_ASCII);
+    STATE_SETG0(CHARSET_ASCII);
+    STATE_CLEARFLAGS();
+    return 0;
+}
+
+/* Reset the decoder: leave the shifted state and designate ASCII to G0.
+ * The other designations and flags are left untouched.  Returns 0. */
+DECODER_RESET(iso2022)
+{
+    STATE_CLEARFLAG(F_SHIFTED);
+    STATE_SETG0(CHARSET_ASCII);
+    return 0;
+}
+
+/* Parse the designation escape sequence that starts at (*inbuf)[0] (the
+ * ESC byte) and record the designated character set in the state.
+ *
+ * On success the sequence is consumed from *inbuf / *inleft and 0 is
+ * returned.  Otherwise nothing is consumed and the return value is
+ * MBERR_TOOFEW (input ends before the final byte), 1 (no final byte found
+ * within MAX_ESCSEQLEN), or the length of the sequence when it is not
+ * supported or names a character set this codec does not configure. */
+static Py_ssize_t
+iso2022processesc(const MultibyteCodec *codec, MultibyteCodec_State *state,
+                  const unsigned char **inbuf, Py_ssize_t *inleft)
+{
+    unsigned char charset, designation;
+    Py_ssize_t pos = 1, esclen = 0;
+
+    /* Locate the final byte of the sequence. */
+    while (pos < MAX_ESCSEQLEN) {
+        if (pos >= *inleft)
+            return MBERR_TOOFEW;
+        if (IS_ESCEND((*inbuf)[pos])) {
+            esclen = pos + 1;
+            break;
+        }
+        if (CONFIG_ISSET(USE_JISX0208_EXT) && pos + 1 < *inleft &&
+            (*inbuf)[pos] == '&' && (*inbuf)[pos + 1] == '@') {
+            /* "& @" prefix: also step over the ESC that follows it */
+            pos += 2;
+        }
+        pos++;
+    }
+
+    if (esclen == 0) {
+        return 1; /* unterminated escape sequence */
+    }
+    else if (esclen == 3) {
+        switch (INBYTE2) {
+        case '$':
+            charset = INBYTE3 | CHARSET_DBCS;
+            designation = 0;
+            break;
+        case '(':
+            charset = INBYTE3;
+            designation = 0;
+            break;
+        case ')':
+            charset = INBYTE3;
+            designation = 1;
+            break;
+        case '.':
+            if (!CONFIG_ISSET(USE_G2))
+                return 3;
+            charset = INBYTE3;
+            designation = 2;
+            break;
+        default:
+            return 3;
+        }
+    }
+    else if (esclen == 4) {
+        if (INBYTE2 != '$')
+            return 4;
+
+        charset = INBYTE4 | CHARSET_DBCS;
+        switch (INBYTE3) {
+        case '(':
+            designation = 0;
+            break;
+        case ')':
+            designation = 1;
+            break;
+        default:
+            return 4;
+        }
+    }
+    else if (esclen == 6) { /* designation with prefix */
+        if (!(CONFIG_ISSET(USE_JISX0208_EXT) &&
+              (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
+              (*inbuf)[5] == 'B'))
+            return 6;
+        charset = 'B' | CHARSET_DBCS;
+        designation = 0;
+    }
+    else {
+        return esclen;
+    }
+
+    /* raise error when the charset is not designated for this encoding */
+    if (charset != CHARSET_ASCII) {
+        const struct iso2022_designation *known = CONFIG_DESIGNATIONS;
+
+        while (known->mark && known->mark != charset)
+            known++;
+        if (!known->mark)
+            return esclen;
+    }
+
+    STATE_SETG(designation, charset);
+    (*inbuf) += esclen;
+    *inleft -= esclen;
+    return 0;
+}
+
+/* Emit the character for ISO-8859-7 byte value `c` through OUTCHAR().
+ *
+ * The expansion is an if / else-if chain without a final else, so the
+ * caller must follow it with `else ...` to handle bytes that have no
+ * mapping.  Handled cases, in order:
+ *   - c < 0xa0: emitted unchanged;
+ *   - 0xa0..0xbf with its bit set in mask 0x288f3bc9: emitted unchanged;
+ *   - 0xb4..0xfe, either >= 0xd4 or with its bit set in mask 0xbffffd77:
+ *     emitted as 0x02d0 + c;
+ *   - 0xa1, 0xa2, 0xaf: emitted as U+2018, U+2019, U+2015.
+ *
+ * The bit tests use unsigned long operands: the shift count reaches 31,
+ * and `1L << 31` is undefined where long is 32 bits wide.
+ *
+ * `c` is evaluated several times.  The `writer` argument is not referenced
+ * by the expansion itself. */
+#define ISO8859_7_DECODE(c, writer) \
+    if ((c) < 0xa0) { \
+        OUTCHAR(c); \
+    } else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0)))) { \
+        OUTCHAR(c); \
+    } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 || \
+               (0xbffffd77UL & (1UL << ((c)-0xb4))))) { \
+        OUTCHAR(0x02d0 + (c)); \
+    } else if ((c) == 0xa1) { \
+        OUTCHAR(0x2018); \
+    } else if ((c) == 0xa2) { \
+        OUTCHAR(0x2019); \
+    } else if ((c) == 0xaf) { \
+        OUTCHAR(0x2015); \
+    }
+
+/* Decode one single-shifted G2 character: the three input bytes are ESC,
+ * 'N' and the character byte (the caller has checked the first two and
+ * that three bytes are available).
+ *
+ * Returns 0 after emitting the character and consuming the three bytes,
+ * 3 when the byte is not valid for the character set designated to G2,
+ * and MBERR_INTERNAL when G2 holds a character set not handled here. */
+static Py_ssize_t
+iso2022processg2(const MultibyteCodec *codec, MultibyteCodec_State *state,
+                 const unsigned char **inbuf, Py_ssize_t *inleft,
+                 _PyUnicodeWriter *writer)
+{
+    /* not written to use encoder, decoder functions because only few
+     * encodings use G2 designations in CJKCodecs */
+    switch (STATE_G2) {
+    case CHARSET_ISO8859_1:
+        if (INBYTE3 >= 0x80)
+            return 3;
+        OUTCHAR(INBYTE3 + 0x80);
+        break;
+
+    case CHARSET_ISO8859_7:
+        ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
+        else
+            return 3;
+        break;
+
+    case CHARSET_ASCII:
+        if (INBYTE3 & 0x80)
+            return 3;
+        OUTCHAR(INBYTE3);
+        break;
+
+    default:
+        return MBERR_INTERNAL;
+    }
+
+    *inleft -= 3;
+    (*inbuf) += 3;
+    return 0;
+}
+
+/* Decode the input buffer.
+ *
+ * Escape sequences update the designation state, SO/SI toggle the shifted
+ * flag (unless NO_SHIFT is configured), LF clears it, and every other byte
+ * is decoded with the character set currently active: G1 when shifted,
+ * G0 otherwise.
+ *
+ * Returns 0 when all input was consumed.  A positive return value is the
+ * length of the undecodable byte sequence at the current position.  Error
+ * codes from the helpers and macros are passed through.  MBERR_INTERNAL is
+ * returned when the state names a character set that is not in this
+ * codec's designation table. */
+DECODER(iso2022)
+{
+    const struct iso2022_designation *dsgcache = NULL;
+
+    while (inleft > 0) {
+        unsigned char c = INBYTE1;
+        Py_ssize_t err;
+
+        if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
+            /* ESC throughout mode:
+             * for non-iso2022 escape sequences */
+            OUTCHAR(c); /* assume as ISO-8859-1 */
+            NEXT_IN(1);
+            if (IS_ESCEND(c)) {
+                STATE_CLEARFLAG(F_ESCTHROUGHOUT);
+            }
+            continue;
+        }
+
+        switch (c) {
+        case ESC:
+            REQUIRE_INBUF(2);
+            if (IS_ISO2022ESC(INBYTE2)) {
+                err = iso2022processesc(codec, state,
+                                        inbuf, &inleft);
+                if (err != 0)
+                    return err;
+            }
+            else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
+                REQUIRE_INBUF(3);
+                err = iso2022processg2(codec, state,
+                                       inbuf, &inleft, writer);
+                if (err != 0)
+                    return err;
+            }
+            else {
+                /* not an escape sequence we parse: pass it through */
+                OUTCHAR(ESC);
+                STATE_SETFLAG(F_ESCTHROUGHOUT);
+                NEXT_IN(1);
+            }
+            break;
+        case SI:
+            if (CONFIG_ISSET(NO_SHIFT))
+                goto bypass;
+            STATE_CLEARFLAG(F_SHIFTED);
+            NEXT_IN(1);
+            break;
+        case SO:
+            if (CONFIG_ISSET(NO_SHIFT))
+                goto bypass;
+            STATE_SETFLAG(F_SHIFTED);
+            NEXT_IN(1);
+            break;
+        case LF:
+            STATE_CLEARFLAG(F_SHIFTED);
+            OUTCHAR(LF);
+            NEXT_IN(1);
+            break;
+        default:
+            if (c < 0x20) /* C0 */
+                goto bypass;
+            else if (c >= 0x80)
+                return 1;
+            else {
+                const struct iso2022_designation *dsg;
+                unsigned char charset;
+                Py_UCS4 decoded;
+
+                if (STATE_GETFLAG(F_SHIFTED))
+                    charset = STATE_G1;
+                else
+                    charset = STATE_G0;
+
+                if (charset == CHARSET_ASCII) {
+bypass:
+                    OUTCHAR(c);
+                    NEXT_IN(1);
+                    break;
+                }
+
+                if (dsgcache != NULL &&
+                    dsgcache->mark == charset)
+                    dsg = dsgcache;
+                else {
+                    /* Always stop at the sentinel.  The state is not
+                     * guaranteed to name a configured character set
+                     * (it may have been restored from outside), and an
+                     * unbounded scan would run off the table. */
+                    for (dsg = CONFIG_DESIGNATIONS;
+                         dsg->mark != '\0' && dsg->mark != charset;
+                         dsg++)
+                    {
+                        /* noop */
+                    }
+                    if (dsg->mark == '\0')
+                        return MBERR_INTERNAL;
+                    dsgcache = dsg;
+                }
+
+                REQUIRE_INBUF(dsg->width);
+                decoded = dsg->decoder(codec, *inbuf);
+                if (decoded == MAP_UNMAPPABLE)
+                    return dsg->width;
+
+                if (decoded < 0x30000) {
+                    OUTCHAR(decoded);
+                }
+                else { /* JIS X 0213 pairs */
+                    OUTCHAR2(decoded >> 16, decoded & 0xffff);
+                }
+                NEXT_IN(dsg->width);
+            }
+            break;
+        }
+    }
+    return 0;
+}
+
+/*-*- mapping access functions -*-*/
+
+/* Import the mapping tables used for KS X 1001: the cp949 encode map and
+ * the ksx1001 decode map.  Returns 0 on success, -1 on failure. */
+static int
+ksx1001_init(const MultibyteCodec *codec)
+{
+    cjkcodecs_module_state *st = codec->modstate;
+
+    if (IMPORT_MAP(kr, cp949, &st->cp949_encmap, NULL)) {
+        return -1;
+    }
+    if (IMPORT_MAP(kr, ksx1001, NULL, &st->ksx1001_decmap)) {
+        return -1;
+    }
+    return 0;
+}
+
+/* Decode the two bytes at `data` through the ksx1001 table; returns the
+ * code point, or MAP_UNMAPPABLE when the table has no entry. */
+static Py_UCS4
+ksx1001_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+    Py_UCS4 ch;
+
+    if (!(TRYMAP_DEC_ST(ksx1001, ch, data[0], data[1])))
+        return MAP_UNMAPPABLE;
+    return ch;
+}
+
+/* Encode *data through the cp949 table.  Only code points below 0x10000
+ * whose code does not have bit 0x8000 set are accepted; everything else
+ * yields MAP_UNMAPPABLE. */
+static DBCHAR
+ksx1001_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+                Py_ssize_t *length)
+{
+    DBCHAR code;
+
+    assert(*length == 1);
+    if (*data >= 0x10000)
+        return MAP_UNMAPPABLE;
+    if ((TRYMAP_ENC_ST(cp949, code, *data)) && !(code & 0x8000))
+        return code;
+    return MAP_UNMAPPABLE;
+}
+
+/* Import the mapping tables used for JIS X 0208: the jisxcommon encode
+ * map and the jisx0208 decode map.  Returns 0 on success, -1 on failure. */
+static int
+jisx0208_init(const MultibyteCodec *codec)
+{
+    cjkcodecs_module_state *st = codec->modstate;
+
+    if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL)) {
+        return -1;
+    }
+    if (IMPORT_MAP(jp, jisx0208, NULL, &st->jisx0208_decmap)) {
+        return -1;
+    }
+    return 0;
+}
+
+/* Decode the two bytes at `data` as JIS X 0208.  0x21 0x40 is special-cased
+ * to U+FF3C; other pairs go through the jisx0208 table.  Returns
+ * MAP_UNMAPPABLE when there is no mapping. */
+static Py_UCS4
+jisx0208_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+    Py_UCS4 ch;
+
+    if (data[0] == 0x21 && data[1] == 0x40) {
+        return 0xff3c; /* F/W REVERSE SOLIDUS */
+    }
+    if (TRYMAP_DEC_ST(jisx0208, ch, data[0], data[1])) {
+        return ch;
+    }
+    return MAP_UNMAPPABLE;
+}
+
+/* Encode *data as JIS X 0208.  U+FF3C is special-cased to 0x2140; other
+ * code points below 0x10000 go through the jisxcommon table and are
+ * accepted only when bit 0x8000 of the code is clear.  Returns
+ * MAP_UNMAPPABLE otherwise. */
+static DBCHAR
+jisx0208_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+                 Py_ssize_t *length)
+{
+    DBCHAR code;
+
+    assert(*length == 1);
+    if (*data >= 0x10000)
+        return MAP_UNMAPPABLE;
+    if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
+        return 0x2140;
+    if ((TRYMAP_ENC_ST(jisxcommon, code, *data)) && !(code & 0x8000))
+        return code;
+    return MAP_UNMAPPABLE;
+}
+
+/* Import the mapping tables used for JIS X 0212: the jisxcommon encode
+ * map and the jisx0212 decode map.  Returns 0 on success, -1 on failure. */
+static int
+jisx0212_init(const MultibyteCodec *codec)
+{
+    cjkcodecs_module_state *st = codec->modstate;
+
+    if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL)) {
+        return -1;
+    }
+    if (IMPORT_MAP(jp, jisx0212, NULL, &st->jisx0212_decmap)) {
+        return -1;
+    }
+    return 0;
+}
+
+/* Decode the two bytes at `data` through the jisx0212 table; returns the
+ * code point, or MAP_UNMAPPABLE when the table has no entry. */
+static Py_UCS4
+jisx0212_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+    Py_UCS4 ch;
+
+    if (!(TRYMAP_DEC_ST(jisx0212, ch, data[0], data[1])))
+        return MAP_UNMAPPABLE;
+    return ch;
+}
+
+/* Encode *data as JIS X 0212.  The jisxcommon table marks these codes with
+ * bit 0x8000; only such entries are accepted and the flag is stripped from
+ * the result.  Returns MAP_UNMAPPABLE otherwise. */
+static DBCHAR
+jisx0212_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+                 Py_ssize_t *length)
+{
+    DBCHAR code;
+
+    assert(*length == 1);
+    if (*data >= 0x10000)
+        return MAP_UNMAPPABLE;
+    if ((TRYMAP_ENC_ST(jisxcommon, code, *data)) && (code & 0x8000))
+        return code & 0x7fff;
+    return MAP_UNMAPPABLE;
+}
+
+/* Import every table needed for JIS X 0213: the JIS X 0208 tables (through
+ * jisx0208_init), the BMP and EMP encode/decode maps, and the pair maps.
+ * Stops at the first failure.  Returns 0 on success, -1 on failure. */
+static int
+jisx0213_init(const MultibyteCodec *codec)
+{
+    cjkcodecs_module_state *st = codec->modstate;
+
+    if (jisx0208_init(codec))
+        return -1;
+    if (IMPORT_MAP(jp, jisx0213_bmp, &st->jisx0213_bmp_encmap, NULL))
+        return -1;
+    if (IMPORT_MAP(jp, jisx0213_1_bmp, NULL, &st->jisx0213_1_bmp_decmap))
+        return -1;
+    if (IMPORT_MAP(jp, jisx0213_2_bmp, NULL, &st->jisx0213_2_bmp_decmap))
+        return -1;
+    if (IMPORT_MAP(jp, jisx0213_emp, &st->jisx0213_emp_encmap, NULL))
+        return -1;
+    if (IMPORT_MAP(jp, jisx0213_1_emp, NULL, &st->jisx0213_1_emp_decmap))
+        return -1;
+    if (IMPORT_MAP(jp, jisx0213_2_emp, NULL, &st->jisx0213_2_emp_decmap))
+        return -1;
+    if (IMPORT_MAP(jp, jisx0213_pair,
+                   &jisx0213_pair_encmap, &jisx0213_pair_decmap))
+        return -1;
+    return 0;
+}
+
+/* While the two JIS X 0213:2000 decoders are compiled, `config` stands for
+ * the (void *)2000 tag handed to the EMULATE_JISX0213_2000_* macros.  The
+ * jisx0213_2000_* encoders pass the same tag value explicitly. */
+#define config ((void *)2000)
+/* Decode the two bytes at `data` as JIS X 0213:2000 plane 1.  Returns the
+ * decoded value or MAP_UNMAPPABLE. */
+static Py_UCS4
+jisx0213_2000_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+ Py_UCS4 u;
+ /* the macro opens the if-chain that the `else if` below continues */
+ EMULATE_JISX0213_2000_DECODE_PLANE1(config, u, data[0], data[1])
+ else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
+ return 0xff3c;
+ else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
+ ;
+ else if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1]))
+ ;
+ else if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1]))
+ u |= 0x20000; /* the emp table result gets 0x20000 OR-ed in */
+ else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
+ ;
+ else
+ return MAP_UNMAPPABLE;
+ return u;
+}
+
+/* Decode the two bytes at `data` as JIS X 0213:2000 plane 2.  Returns the
+ * decoded value or MAP_UNMAPPABLE. */
+static Py_UCS4
+jisx0213_2000_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+ Py_UCS4 u;
+ /* NOTE(review): this macro is followed by a plain `if`, so it must be a
+ * complete statement or end in a dangling `else` -- check
+ * emu_jisx0213_2000.h before restructuring. */
+ EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(config, u, data[0], data[1])
+ if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1]))
+ ;
+ else if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1]))
+ u |= 0x20000;
+ else
+ return MAP_UNMAPPABLE;
+ return u;
+}
+#undef config
+
+/* Decode the two bytes at `data` as JIS X 0213:2004 plane 1.  Lookup order:
+ * the 0x21 0x40 special case, then the jisx0208, jisx0213_1_bmp,
+ * jisx0213_1_emp (result OR-ed with 0x20000) and jisx0213_pair tables.
+ * Returns MAP_UNMAPPABLE when nothing matches. */
+static Py_UCS4
+jisx0213_2004_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+    Py_UCS4 u;
+
+    if (data[0] == 0x21 && data[1] == 0x40) {
+        return 0xff3c; /* F/W REVERSE SOLIDUS */
+    }
+    if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1])) {
+        return u;
+    }
+    if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1])) {
+        return u;
+    }
+    if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1])) {
+        u |= 0x20000;
+        return u;
+    }
+    if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1])) {
+        return u;
+    }
+    return MAP_UNMAPPABLE;
+}
+
+/* Decode the two bytes at `data` as JIS X 0213:2004 plane 2: first the
+ * jisx0213_2_bmp table, then jisx0213_2_emp (result OR-ed with 0x20000).
+ * Returns MAP_UNMAPPABLE when neither matches. */
+static Py_UCS4
+jisx0213_2004_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+    Py_UCS4 u;
+
+    if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1])) {
+        return u;
+    }
+    if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1])) {
+        u |= 0x20000;
+        return u;
+    }
+    return MAP_UNMAPPABLE;
+}
+
+/* Shared JIS X 0213 encoder behind the jisx0213_*_encoder wrappers.
+ *
+ * *length selects the mode and is also an output:
+ *    1  encode data[0] alone; may return MAP_MULTIPLE_AVAIL to ask the
+ *       caller to retry with the following character;
+ *    2  encode the pair data[0], data[1]; when the pair has no code,
+ *       falls back to data[0] alone and sets *length to 1;
+ *   -1  no following character is available: encode data[0] alone
+ *       through the pair table and set *length to 1.
+ *
+ * `config` is passed to the EMULATE_JISX0213_2000_* macros; the wrappers
+ * pass (void *)2000 for the 2000 edition and NULL for the 2004 edition.
+ *
+ * Returns the code, MAP_MULTIPLE_AVAIL or MAP_UNMAPPABLE.  Codes returned
+ * from mode 1 may carry bit 0x8000, which the wrappers interpret. */
+static DBCHAR
+jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+ Py_ssize_t *length, const void *config)
+{
+ DBCHAR coded;
+
+ switch (*length) {
+ case 1: /* first character */
+ if (*data >= 0x10000) {
+ /* only code points 0x20000..0x2ffff can be in the emp table */
+ if ((*data) >> 16 == 0x20000 >> 16) {
+ EMULATE_JISX0213_2000_ENCODE_EMP(config, coded, *data)
+ else if (TRYMAP_ENC_ST(jisx0213_emp, coded, (*data) & 0xffff))
+ return coded;
+ }
+ return MAP_UNMAPPABLE;
+ }
+
+ /* the macro opens the if-chain that the `else if` below continues */
+ EMULATE_JISX0213_2000_ENCODE_BMP(config, coded, *data)
+ else if (TRYMAP_ENC_ST(jisx0213_bmp, coded, *data)) {
+ if (coded == MULTIC)
+ return MAP_MULTIPLE_AVAIL;
+ }
+ else if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
+ if (coded & 0x8000)
+ return MAP_UNMAPPABLE;
+ }
+ else
+ return MAP_UNMAPPABLE;
+ return coded;
+
+ case 2: /* second character of unicode pair */
+ coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
+ jisx0213_pair_encmap, JISX0213_ENCPAIRS);
+ if (coded != DBCINV)
+ return coded;
+ /* fall through */
+
+ case -1: /* flush unterminated */
+ *length = 1;
+ coded = find_pairencmap((ucs2_t)data[0], 0,
+ jisx0213_pair_encmap, JISX0213_ENCPAIRS);
+ if (coded == DBCINV)
+ return MAP_UNMAPPABLE;
+ else
+ return coded;
+ break;
+
+ default:
+ return MAP_UNMAPPABLE;
+ }
+}
+
+/* JIS X 0213:2000 plane 1 encoder.  Passes the two sentinel results
+ * through, rejects codes flagged with 0x8000 (those are accepted by the
+ * plane 2 encoder) and returns every other code unchanged. */
+static DBCHAR
+jisx0213_2000_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+                        Py_ssize_t *length)
+{
+    const DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
+
+    /* Sentinels are tested first because both have bit 0x8000 set. */
+    if (coded != MAP_UNMAPPABLE && coded != MAP_MULTIPLE_AVAIL &&
+        (coded & 0x8000)) {
+        return MAP_UNMAPPABLE;
+    }
+    return coded;
+}
+
+/* JIS X 0213:2000 plane 1 encoder that only accepts two-character pairs.
+ * Called with *length == 1 it returns MAP_MULTIPLE_AVAIL when the shared
+ * encoder asks for a second character.  Called with *length == 2 it
+ * returns the code only if the shared encoder really used both characters
+ * (left *length at 2).  Every other case yields MAP_UNMAPPABLE. */
+static DBCHAR
+jisx0213_2000_1_encoder_paironly(const MultibyteCodec *codec,
+                                 const Py_UCS4 *data, Py_ssize_t *length)
+{
+    const Py_ssize_t requested = *length;
+    const DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
+
+    if (requested == 1 && coded == MAP_MULTIPLE_AVAIL) {
+        return MAP_MULTIPLE_AVAIL;
+    }
+    if (requested == 2 && *length == 2) {
+        return coded;
+    }
+    return MAP_UNMAPPABLE;
+}
+
+/* JIS X 0213:2000 plane 2 encoder.  Passes the two sentinel results
+ * through, accepts only codes flagged with 0x8000 (returned with the flag
+ * stripped) and maps every other code to MAP_UNMAPPABLE. */
+static DBCHAR
+jisx0213_2000_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+                        Py_ssize_t *length)
+{
+    const DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
+
+    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) {
+        return coded;
+    }
+    return (coded & 0x8000) ? (coded & 0x7fff) : MAP_UNMAPPABLE;
+}
+
+/* JIS X 0213:2004 plane 1 encoder.  Passes the two sentinel results
+ * through, rejects codes flagged with 0x8000 (those are accepted by the
+ * plane 2 encoder) and returns every other code unchanged. */
+static DBCHAR
+jisx0213_2004_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+                        Py_ssize_t *length)
+{
+    const DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
+
+    /* Sentinels are tested first because both have bit 0x8000 set. */
+    if (coded != MAP_UNMAPPABLE && coded != MAP_MULTIPLE_AVAIL &&
+        (coded & 0x8000)) {
+        return MAP_UNMAPPABLE;
+    }
+    return coded;
+}
+
+/* JIS X 0213:2004 plane 1 encoder that only accepts two-character pairs.
+ * Called with *length == 1 it returns MAP_MULTIPLE_AVAIL when the shared
+ * encoder asks for a second character.  Called with *length == 2 it
+ * returns the code only if the shared encoder really used both characters
+ * (left *length at 2).  Every other case yields MAP_UNMAPPABLE. */
+static DBCHAR
+jisx0213_2004_1_encoder_paironly(const MultibyteCodec *codec,
+                                 const Py_UCS4 *data, Py_ssize_t *length)
+{
+    const Py_ssize_t requested = *length;
+    const DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
+
+    if (requested == 1 && coded == MAP_MULTIPLE_AVAIL) {
+        return MAP_MULTIPLE_AVAIL;
+    }
+    if (requested == 2 && *length == 2) {
+        return coded;
+    }
+    return MAP_UNMAPPABLE;
+}
+
+/* JIS X 0213:2004 plane 2 encoder.  Passes the two sentinel results
+ * through, accepts only codes flagged with 0x8000 (returned with the flag
+ * stripped) and maps every other code to MAP_UNMAPPABLE. */
+static DBCHAR
+jisx0213_2004_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+                        Py_ssize_t *length)
+{
+    const DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
+
+    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) {
+        return coded;
+    }
+    return (coded & 0x8000) ? (coded & 0x7fff) : MAP_UNMAPPABLE;
+}
+
+/* Decode the single byte *data as JIS X 0201 Roman.  The macro supplies
+ * the if-chain that assigns `u`; when none of its cases applies the
+ * trailing else returns MAP_UNMAPPABLE. */
+static Py_UCS4
+jisx0201_r_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+ Py_UCS4 u;
+ JISX0201_R_DECODE_CHAR(*data, u)
+ else
+ return MAP_UNMAPPABLE;
+ return u;
+}
+
+/* Encode *data as JIS X 0201 Roman.  The macro supplies the if-chain that
+ * assigns `coded`; when none of its cases applies the trailing else
+ * returns MAP_UNMAPPABLE.  `length` is not used. */
+static DBCHAR
+jisx0201_r_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+ Py_ssize_t *length)
+{
+ DBCHAR coded;
+ JISX0201_R_ENCODE(*data, coded)
+ else
+ return MAP_UNMAPPABLE;
+ return coded;
+}
+
+/* Decode the single byte *data as JIS X 0201 Katakana.  Bit 0x80 of the
+ * byte is flipped before it is handed to the macro; when none of the
+ * macro's cases applies the trailing else returns MAP_UNMAPPABLE. */
+static Py_UCS4
+jisx0201_k_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+ Py_UCS4 u;
+ JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
+ else
+ return MAP_UNMAPPABLE;
+ return u;
+}
+
+/* Encode *data as JIS X 0201 Katakana.  The macro supplies the if-chain
+ * that assigns `coded`; the result is returned with 0x80 subtracted, the
+ * counterpart of the bit flip done in jisx0201_k_decoder.  Returns
+ * MAP_UNMAPPABLE when none of the macro's cases applies.  `length` is not
+ * used. */
+static DBCHAR
+jisx0201_k_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+ Py_ssize_t *length)
+{
+ DBCHAR coded;
+ JISX0201_K_ENCODE(*data, coded)
+ else
+ return MAP_UNMAPPABLE;
+ return coded - 0x80;
+}
+
+/* Import the mapping tables used for GB 2312: the gbcommon encode map and
+ * the gb2312 decode map.  Returns 0 on success, -1 on failure. */
+static int
+gb2312_init(const MultibyteCodec *codec)
+{
+    cjkcodecs_module_state *st = codec->modstate;
+
+    if (IMPORT_MAP(cn, gbcommon, &st->gbcommon_encmap, NULL)) {
+        return -1;
+    }
+    if (IMPORT_MAP(cn, gb2312, NULL, &st->gb2312_decmap)) {
+        return -1;
+    }
+    return 0;
+}
+
+/* Decode the two bytes at `data` through the gb2312 table; returns the
+ * code point, or MAP_UNMAPPABLE when the table has no entry. */
+static Py_UCS4
+gb2312_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+    Py_UCS4 ch;
+
+    if (!(TRYMAP_DEC_ST(gb2312, ch, data[0], data[1])))
+        return MAP_UNMAPPABLE;
+    return ch;
+}
+
+/* Encode *data through the gbcommon table.  Only code points below 0x10000
+ * whose code does not have bit 0x8000 set are accepted; everything else
+ * yields MAP_UNMAPPABLE. */
+static DBCHAR
+gb2312_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+               Py_ssize_t *length)
+{
+    DBCHAR code;
+
+    assert(*length == 1);
+    if (*data >= 0x10000)
+        return MAP_UNMAPPABLE;
+    if ((TRYMAP_ENC_ST(gbcommon, code, *data)) && !(code & 0x8000))
+        return code;
+    return MAP_UNMAPPABLE;
+}
+
+
+/* Placeholder decoder for designations that have no table-driven decoder
+ * (the ISO-8859 registry entries): every input is reported as
+ * MAP_UNMAPPABLE. */
+static Py_UCS4
+dummy_decoder(const MultibyteCodec *codec, const unsigned char *data)
+{
+    (void)codec;
+    (void)data;
+    return MAP_UNMAPPABLE;
+}
+
+/* Placeholder encoder for designations that have no table-driven encoder
+ * (the ISO-8859 registry entries): every character is reported as
+ * MAP_UNMAPPABLE. */
+static DBCHAR
+dummy_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
+              Py_ssize_t *length)
+{
+    (void)codec;
+    (void)data;
+    (void)length;
+    return MAP_UNMAPPABLE;
+}
+
+/*-*- registry tables -*-*/
+
+/* Initializers for struct iso2022_designation, in field order:
+ * { mark, plane, width, initializer, decoder, encoder }. */
+#define REGISTRY_KSX1001_G0 { CHARSET_KSX1001, 0, 2, \
+ ksx1001_init, \
+ ksx1001_decoder, ksx1001_encoder }
+#define REGISTRY_KSX1001_G1 { CHARSET_KSX1001, 1, 2, \
+ ksx1001_init, \
+ ksx1001_decoder, ksx1001_encoder }
+#define REGISTRY_JISX0201_R { CHARSET_JISX0201_R, 0, 1, \
+ NULL, \
+ jisx0201_r_decoder, jisx0201_r_encoder }
+#define REGISTRY_JISX0201_K { CHARSET_JISX0201_K, 0, 1, \
+ NULL, \
+ jisx0201_k_decoder, jisx0201_k_encoder }
+#define REGISTRY_JISX0208 { CHARSET_JISX0208, 0, 2, \
+ jisx0208_init, \
+ jisx0208_decoder, jisx0208_encoder }
+#define REGISTRY_JISX0208_O { CHARSET_JISX0208_O, 0, 2, \
+ jisx0208_init, \
+ jisx0208_decoder, jisx0208_encoder }
+#define REGISTRY_JISX0212 { CHARSET_JISX0212, 0, 2, \
+ jisx0212_init, \
+ jisx0212_decoder, jisx0212_encoder }
+#define REGISTRY_JISX0213_2000_1 { CHARSET_JISX0213_2000_1, 0, 2, \
+ jisx0213_init, \
+ jisx0213_2000_1_decoder, \
+ jisx0213_2000_1_encoder }
+/* The _PAIRONLY variants share mark and decoder with the plain entry but
+ * use an encoder that only accepts two-character pairs. */
+#define REGISTRY_JISX0213_2000_1_PAIRONLY { CHARSET_JISX0213_2000_1, 0, 2, \
+ jisx0213_init, \
+ jisx0213_2000_1_decoder, \
+ jisx0213_2000_1_encoder_paironly }
+#define REGISTRY_JISX0213_2000_2 { CHARSET_JISX0213_2, 0, 2, \
+ jisx0213_init, \
+ jisx0213_2000_2_decoder, \
+ jisx0213_2000_2_encoder }
+#define REGISTRY_JISX0213_2004_1 { CHARSET_JISX0213_2004_1, 0, 2, \
+ jisx0213_init, \
+ jisx0213_2004_1_decoder, \
+ jisx0213_2004_1_encoder }
+#define REGISTRY_JISX0213_2004_1_PAIRONLY { CHARSET_JISX0213_2004_1, 0, 2, \
+ jisx0213_init, \
+ jisx0213_2004_1_decoder, \
+ jisx0213_2004_1_encoder_paironly }
+#define REGISTRY_JISX0213_2004_2 { CHARSET_JISX0213_2, 0, 2, \
+ jisx0213_init, \
+ jisx0213_2004_2_decoder, \
+ jisx0213_2004_2_encoder }
+#define REGISTRY_GB2312 { CHARSET_GB2312, 0, 2, \
+ gb2312_init, \
+ gb2312_decoder, gb2312_encoder }
+/* NOTE(review): the two CNS 11643 entries name cns11643_* functions that
+ * are not defined in the visible part of this file, and no designation
+ * table below uses them; as macros they are harmless until expanded. */
+#define REGISTRY_CNS11643_1 { CHARSET_CNS11643_1, 1, 2, \
+ cns11643_init, \
+ cns11643_1_decoder, cns11643_1_encoder }
+#define REGISTRY_CNS11643_2 { CHARSET_CNS11643_2, 2, 2, \
+ cns11643_init, \
+ cns11643_2_decoder, cns11643_2_encoder }
+/* G2-only sets: decoded by iso2022processg2(), so the callbacks are the
+ * dummy ones. */
+#define REGISTRY_ISO8859_1 { CHARSET_ISO8859_1, 2, 1, \
+ NULL, dummy_decoder, dummy_encoder }
+#define REGISTRY_ISO8859_7 { CHARSET_ISO8859_7, 2, 1, \
+ NULL, dummy_decoder, dummy_encoder }
+/* Table terminator: mark == 0. */
+#define REGISTRY_SENTINEL { 0, }
+/* Defines iso2022_<var>_config from the given flags and the
+ * iso2022_<var>_designations table, which must already be declared. */
+#define CONFIGDEF(var, attrs) \
+ static const struct iso2022_config iso2022_##var##_config = { \
+ attrs, iso2022_##var##_designations \
+ };
+
+/* Designation tables, one per codec variant.  The encoder tries the
+ * entries in order and uses the first whose encoder accepts the character,
+ * so entry order is the encoding preference.  Each table ends with
+ * REGISTRY_SENTINEL. */
+
+/* iso2022_kr: KS X 1001 on G1; shift handling stays enabled (flags 0). */
+static const struct iso2022_designation iso2022_kr_designations[] = {
+ REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
+};
+CONFIGDEF(kr, 0)
+
+/* iso2022_jp */
+static const struct iso2022_designation iso2022_jp_designations[] = {
+ REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
+ REGISTRY_SENTINEL
+};
+CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
+
+/* iso2022_jp_1: iso2022_jp plus JIS X 0212 */
+static const struct iso2022_designation iso2022_jp_1_designations[] = {
+ REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
+ REGISTRY_JISX0208_O, REGISTRY_SENTINEL
+};
+CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
+
+/* iso2022_jp_2: adds KS X 1001 and GB 2312 on G0 and the two ISO-8859
+ * sets on G2 (USE_G2) */
+static const struct iso2022_designation iso2022_jp_2_designations[] = {
+ REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
+ REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
+ REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
+};
+CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
+
+/* iso2022_jp_2004: the pair-only entry comes first, so character pairs
+ * are tried before JIS X 0208 and the plain JIS X 0213 entries */
+static const struct iso2022_designation iso2022_jp_2004_designations[] = {
+ REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
+ REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
+};
+CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
+
+/* iso2022_jp_3: same layout as jp_2004 with the JIS X 0213:2000 entries */
+static const struct iso2022_designation iso2022_jp_3_designations[] = {
+ REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
+ REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
+};
+CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
+
+/* iso2022_jp_ext: iso2022_jp_1 plus JIS X 0201 Katakana */
+static const struct iso2022_designation iso2022_jp_ext_designations[] = {
+ REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
+ REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
+};
+CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
+
+
+/* This module exports no mapping tables of its own (count 0); the tables
+ * it needs are imported by the *_init() functions. */
+BEGIN_MAPPINGS_LIST(0)
+ /* no mapping table here */
+END_MAPPINGS_LIST
+
+/* Registers one codec named "iso2022_<variation>", wired to the matching
+ * iso2022_<variation>_config and the shared iso2022 implementation. */
+#define ISO2022_CODEC(variation) \
+NEXT_CODEC = (MultibyteCodec){ \
+ "iso2022_" #variation, \
+ &iso2022_##variation##_config, \
+ iso2022_codec_init, \
+ _STATEFUL_METHODS(iso2022) \
+};
+
+/* The count passed here must match the number of ISO2022_CODEC entries. */
+BEGIN_CODECS_LIST(7)
+ ISO2022_CODEC(kr)
+ ISO2022_CODEC(jp)
+ ISO2022_CODEC(jp_1)
+ ISO2022_CODEC(jp_2)
+ ISO2022_CODEC(jp_2004)
+ ISO2022_CODEC(jp_3)
+ ISO2022_CODEC(jp_ext)
+END_CODECS_LIST
+
+I_AM_A_MODULE_FOR(iso2022)