This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
bugs in ISO-2022-CN-EXT converter
- To: libc-alpha at sources dot redhat dot com
- Subject: bugs in ISO-2022-CN-EXT converter
- From: Bruno Haible <haible at ilog dot fr>
- Date: Mon, 18 Sep 2000 15:07:05 +0200 (CEST)
The ISO-2022-CN-EXT converter has a few bugs. The worst is that it cannot be loaded
because it requires symbols which are not available from libISOIR165.so:
$ nm /glibc22/lib/gconv/ISO-2022-CN-EXT.so | grep __isoir165
U __isoir165_from_idx
U __isoir165_from_tab
U __isoir165_to_tab
$ nm /glibc22/lib/gconv/libISOIR165.so | grep __isoir165
00000880 R __isoir165_from_idx
000018c0 R __isoir165_tab
00008d60 R __isoir165_to_tab
Moreover,
- The CNS 11643-1992 plane 3 is not treated (it is the major part of
CNS 11643-1986 plane 14).
- Setting GB7590_set = 0 is wrong, because that would mean that in the initial
state, GB7590 doesn't need a shift sequence.
- In the TO direction, SS2 and SS3 shift sequences are never emitted.
- Plus the same bugs as in the ISO-2022-CN converter.
- cns11643.h uses an undeclared variable.
Here is a patch which fixes them.
2000-09-17 Bruno Haible <haible@clisp.cons.org>
* iconvdata/iso-ir-165.c (__isoir165_from_tab): Renamed from
__isoir165_tab.
* iconvdata/cns11643.h (__cns11643l1_to_ucs4_tab): New declaration.
* iconvdata/iso-2022-cn-ext.c: Include "cns11643.h".
(GB7590_set, GB13132_set, CNS11643_3_set, CNS11643_4_set,
CNS11643_5_set, CNS11643_6_set, CNS11643_7_set): Change enum values.
(BODY for FROM_LOOP): Fix buffer overrun. Treat CNS11643 plane 3.
Return __GCONV_INCOMPLETE_INPUT instead of __GCONV_EMPTY_INPUT.
(BODY for TO_LOOP): Fix usage of `set' vs. `used'. Fix typo that
caused GB2312 to be used instead of ISO-IR-165. Treat CNS11643
plane 3. Fix shift sequences. Output announcement for SS2 and SS3
encodings when needed. When outputting an announcement, don't clear
most other announcements.
*** glibc-20000914/iconvdata/iso-ir-165.c.bak Thu Jan 13 07:53:29 2000
--- glibc-20000914/iconvdata/iso-ir-165.c Sun Sep 17 19:58:52 2000
***************
*** 546,552 ****
};
! const char __isoir165_tab[29852] =
"\x2a\x21" "\x2a\x22" "\x2a\x23" "\x21\x67" "\x2a\x25" "\x2a\x26" "\x2a\x27"
"\x2a\x28" "\x2a\x29" "\x2a\x2a" "\x2a\x2b" "\x2a\x2c" "\x2a\x2d" "\x2a\x2e"
"\x2a\x2f" "\x2a\x30" "\x2a\x31" "\x2a\x32" "\x2a\x33" "\x2a\x34" "\x2a\x35"
--- 546,552 ----
};
! const char __isoir165_from_tab[29852] =
"\x2a\x21" "\x2a\x22" "\x2a\x23" "\x21\x67" "\x2a\x25" "\x2a\x26" "\x2a\x27"
"\x2a\x28" "\x2a\x29" "\x2a\x2a" "\x2a\x2b" "\x2a\x2c" "\x2a\x2d" "\x2a\x2e"
"\x2a\x2f" "\x2a\x30" "\x2a\x31" "\x2a\x32" "\x2a\x33" "\x2a\x34" "\x2a\x35"
*** glibc-20000914/iconvdata/cns11643.h.bak Tue Sep 5 15:24:48 2000
--- glibc-20000914/iconvdata/cns11643.h Sun Sep 17 23:33:24 2000
***************
*** 20,27 ****
--- 20,30 ----
#include <stdint.h>
+ /* Table for CNS 11643, plane 1 to UCS4 conversion. */
+ extern const uint16_t __cns11643l1_to_ucs4_tab[];
/* Table for CNS 11643, plane 2 to UCS4 conversion. */
extern const uint16_t __cns11643l2_to_ucs4_tab[];
+ /* Table for CNS 11643, plane 14 to UCS4 conversion. */
extern const uint16_t __cns11643l14_to_ucs4_tab[];
*** glibc-20000914/iconvdata/iso-2022-cn-ext.c.bak Wed Sep 13 11:09:09 2000
--- glibc-20000914/iconvdata/iso-2022-cn-ext.c Mon Sep 18 10:54:27 2000
***************
*** 24,29 ****
--- 24,30 ----
#include <string.h>
#include "gb2312.h"
#include "iso-ir-165.h"
+ #include "cns11643.h"
#include "cns11643l1.h"
#include "cns11643l2.h"
***************
*** 80,120 ****
ISO_IR_165_set,
SO_mask = 7,
! GB7589_set = 8,
! GB13131_set = 16,
! CNS11643_2_set = 24,
! SS2_mask = 24,
!
! GB7590_set = 0,
! GB13132_set = 32,
! CNS11643_3_set = 64,
! CNS11643_4_set = 96,
! CNS11643_5_set = 128,
! CNS11643_6_set = 160,
! CNS11643_7_set = 192,
! SS3_mask = 224,
#define CURRENT_MASK (SO_mask | SS2_mask | SS3_mask)
! GB2312_ann = 256,
! GB12345_ann = 512,
! CNS11643_1_ann = 768,
! ISO_IR_165_ann = 1024,
! SO_ann = 1792,
!
! GB7589_ann = 2048,
! GB13131_ann = 4096,
! CNS11643_2_ann = 6144,
! SS2_ann = 6144,
!
! GB7590_ann = 8192,
! GB13132_ann = 16384,
! CNS11643_3_ann = 24576,
! CNS11643_4_ann = 32768,
! CNS11643_5_ann = 40960,
! CNS11643_6_ann = 49152,
! CNS11643_7_ann = 57344,
! SS3_ann = 57344
};
--- 81,121 ----
ISO_IR_165_set,
SO_mask = 7,
! GB7589_set = 1 << 3,
! GB13131_set = 2 << 3,
! CNS11643_2_set = 3 << 3,
! SS2_mask = 3 << 3,
!
! GB7590_set = 1 << 5,
! GB13132_set = 2 << 5,
! CNS11643_3_set = 3 << 5,
! CNS11643_4_set = 4 << 5,
! CNS11643_5_set = 5 << 5,
! CNS11643_6_set = 6 << 5,
! CNS11643_7_set = 7 << 5,
! SS3_mask = 7 << 5,
#define CURRENT_MASK (SO_mask | SS2_mask | SS3_mask)
! GB2312_ann = 1 << 8,
! GB12345_ann = 2 << 8,
! CNS11643_1_ann = 3 << 8,
! ISO_IR_165_ann = 4 << 8,
! SO_ann = 7 << 8,
!
! GB7589_ann = 1 << 11,
! GB13131_ann = 2 << 11,
! CNS11643_2_ann = 3 << 11,
! SS2_ann = 3 << 11,
!
! GB7590_ann = 1 << 13,
! GB13132_ann = 2 << 13,
! CNS11643_3_ann = 3 << 13,
! CNS11643_4_ann = 4 << 13,
! CNS11643_5_ann = 5 << 13,
! CNS11643_6_ann = 6 << 13,
! CNS11643_7_ann = 7 << 13,
! SS3_ann = 7 << 13
};
***************
*** 190,205 ****
- the initial byte of the SS2 sequence. \
- the initial byte of the SS3 sequence. \
*/ \
! if (inptr + 1 > inend \
|| (inptr[1] == '$' \
! && (inptr + 2 > inend \
! || (inptr[2] == ')' && inptr + 3 > inend) \
! || (inptr[2] == '*' && inptr + 3 > inend) \
! || (inptr[2] == '+' && inptr + 3 > inend))) \
! || (inptr[1] == SS2_1 && inptr + 3 > inend) \
! || (inptr[1] == SS3_1 && inptr + 3 > inend)) \
{ \
! result = __GCONV_EMPTY_INPUT; \
break; \
} \
if (inptr[1] == '$' \
--- 191,206 ----
- the initial byte of the SS2 sequence. \
- the initial byte of the SS3 sequence. \
*/ \
! if (inptr + 2 > inend \
|| (inptr[1] == '$' \
! && (inptr + 3 > inend \
! || (inptr[2] == ')' && inptr + 4 > inend) \
! || (inptr[2] == '*' && inptr + 4 > inend) \
! || (inptr[2] == '+' && inptr + 4 > inend))) \
! || (inptr[1] == SS2_1 && inptr + 4 > inend) \
! || (inptr[1] == SS3_1 && inptr + 4 > inend)) \
{ \
! result = __GCONV_INCOMPLETE_INPUT; \
break; \
} \
if (inptr[1] == '$' \
***************
*** 285,301 ****
continue; \
} \
\
! if (ch == ESC && (inend - inptr == 1 || inptr[1] == SS2_1)) \
{ \
/* This is a character from CNS 11643 plane 2. \
XXX We could test here whether the use of this character \
set was announced. \
XXX Current GB7589 and GB13131 are not supported. */ \
- if (inend - inptr < 4) \
- { \
- result = __GCONV_INCOMPLETE_INPUT; \
- break; \
- } \
inptr += 2; \
ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
if (ch == __UNKNOWN_10646_CHAR) \
--- 286,297 ----
continue; \
} \
\
! if (ch == ESC && inptr[1] == SS2_1) \
{ \
/* This is a character from CNS 11643 plane 2. \
XXX We could test here whether the use of this character \
set was announced. \
XXX Current GB7589 and GB13131 are not supported. */ \
inptr += 2; \
ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
if (ch == __UNKNOWN_10646_CHAR) \
***************
*** 306,340 ****
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
++*irreversible; \
continue; \
} \
} \
! /* Note that we can assume here that at least bytes are available if \
the first byte is ESC since otherwise the first if would have been \
true. */ \
else if (ch == ESC && inptr[1] == SS3_1) \
{ \
/* This is a character from CNS 11643 plane 3 or higher. \
! XXX Current GB7590 and GB13132 are not supported. */ \
! if (inend - inptr < 4) \
! { \
! result = __GCONV_INCOMPLETE_INPUT; \
break; \
} \
- inptr += 2; \
- ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
if (ch == __UNKNOWN_10646_CHAR) \
{ \
if (! ignore_errors_p ()) \
{ \
- inptr -= 2; \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
++*irreversible; \
continue; \
} \
} \
else if (set == ASCII_set) \
{ \
--- 302,354 ----
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
+ inptr += 2; \
++*irreversible; \
continue; \
} \
} \
! /* Note that we can assume here that at least 4 bytes are available if \
the first byte is ESC since otherwise the first if would have been \
true. */ \
else if (ch == ESC && inptr[1] == SS3_1) \
{ \
/* This is a character from CNS 11643 plane 3 or higher. \
! XXX Currently GB7590 and GB13132 are not supported. */ \
! char buf[3]; \
! const char *tmp = buf; \
! \
! buf[1] = inptr[2]; \
! buf[2] = inptr[3]; \
! switch (ann & SS3_ann) \
! { \
! case CNS11643_3_ann: \
! /* CNS 11643 plane 3 is part of the old CNS 11643 plane 14. */ \
! if (buf[1] < 0x62 || (buf[1] == 0x62 && buf[2] <= 0x45)) \
! { \
! buf[0] = 0x2e; \
! ch = cns11643_to_ucs4 (&tmp, 3, 0); \
! } \
! else \
! ch = __UNKNOWN_10646_CHAR; \
! break; \
! default: \
! /* XXX Currently planes 4 to 7 are not supported. */ \
! ch = __UNKNOWN_10646_CHAR; \
break; \
} \
if (ch == __UNKNOWN_10646_CHAR) \
{ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
+ inptr += 4; \
++*irreversible; \
continue; \
} \
+ assert (tmp == buf + 3); \
+ inptr += 4; \
} \
else if (set == ASCII_set) \
{ \
***************
*** 361,367 ****
\
if (ch == 0) \
{ \
! result = __GCONV_EMPTY_INPUT; \
break; \
} \
else if (ch == __UNKNOWN_10646_CHAR) \
--- 375,381 ----
\
if (ch == 0) \
{ \
! result = __GCONV_INCOMPLETE_INPUT; \
break; \
} \
else if (ch == __UNKNOWN_10646_CHAR) \
***************
*** 427,442 ****
char buf[2]; \
int used; \
\
! if (set == GB2312_set || ((ann & CNS11643_1_ann) == 0 \
! && (ann & ISO_IR_165_ann) == 0)) \
{ \
written = ucs4_to_gb2312 (ch, buf, 2); \
used = GB2312_set; \
} \
! else if (set == ISO_IR_165_set || (ann & ISO_IR_165_set) != 0) \
{ \
! written = ucs4_to_gb2312 (ch, buf, 2); \
! used = GB2312_set; \
} \
else \
{ \
--- 441,456 ----
char buf[2]; \
int used; \
\
! if (set == GB2312_set || ((ann & SO_ann) != CNS11643_1_ann \
! && (ann & SO_ann) != ISO_IR_165_ann)) \
{ \
written = ucs4_to_gb2312 (ch, buf, 2); \
used = GB2312_set; \
} \
! else if (set == ISO_IR_165_set || (ann & SO_ann) == ISO_IR_165_set) \
{ \
! written = ucs4_to_isoir165 (ch, buf, 2); \
! used = ISO_IR_165_set; \
} \
else \
{ \
***************
*** 454,482 ****
used = CNS11643_2_set; \
else \
{ \
! /* Well, see whether we have to change the SO set. */ \
! if (set != GB2312_set) \
! { \
! written = ucs4_to_gb2312 (ch, buf, 2); \
! if (written != __UNKNOWN_10646_CHAR) \
! used = GB2312_set; \
! } \
! if (written == __UNKNOWN_10646_CHAR && set != ISO_IR_165_set) \
! { \
! written = ucs4_to_isoir165 (ch, buf, 2); \
! if (written != __UNKNOWN_10646_CHAR) \
! used = ISO_IR_165_set; \
! } \
! if (written == __UNKNOWN_10646_CHAR && set != CNS11643_1_set) \
! { \
! written = ucs4_to_cns11643l1 (ch, buf, 2); \
! if (written != __UNKNOWN_10646_CHAR) \
! used = CNS11643_1_set; \
! } \
\
! if (written == __UNKNOWN_10646_CHAR) \
{ \
/* Even this does not work. Error. */ \
STANDARD_ERR_HANDLER (4); \
} \
} \
--- 468,533 ----
used = CNS11643_2_set; \
else \
{ \
! char tmpbuf[3]; \
\
! switch (0) \
{ \
+ default: \
+ /* Well, see whether we have to change the SO set. */ \
+ \
+ if (used != GB2312_set) \
+ { \
+ written = ucs4_to_gb2312 (ch, buf, 2); \
+ if (written != __UNKNOWN_10646_CHAR) \
+ { \
+ used = GB2312_set; \
+ break; \
+ } \
+ } \
+ \
+ if (used != ISO_IR_165_set) \
+ { \
+ written = ucs4_to_isoir165 (ch, buf, 2); \
+ if (written != __UNKNOWN_10646_CHAR) \
+ { \
+ used = ISO_IR_165_set; \
+ break; \
+ } \
+ } \
+ \
+ if (used != CNS11643_1_set) \
+ { \
+ written = ucs4_to_cns11643l1 (ch, buf, 2); \
+ if (written != __UNKNOWN_10646_CHAR) \
+ { \
+ used = CNS11643_1_set; \
+ break; \
+ } \
+ } \
+ \
+ written = ucs4_to_cns11643 (ch, tmpbuf, 3); \
+ if (written == 3 && tmpbuf[0] != 1 && tmpbuf[0] != 2) \
+ { \
+ buf[0] = tmpbuf[1]; \
+ buf[1] = tmpbuf[2]; \
+ written = 2; \
+ /* CNS 11643 plane 3 is part of the old CNS 11643 \
+ plane 14. \
+ XXX Currently planes 4 to 7 are not supported. */ \
+ if (tmpbuf[0] == 14 \
+ && (tmpbuf[1] < 0x62 \
+ || (tmpbuf[1] == 0x62 && tmpbuf[2] <= 0x45))) \
+ { \
+ used = CNS11643_3_set; \
+ break; \
+ } \
+ } \
+ \
/* Even this does not work. Error. */ \
+ used = ASCII_set; \
+ } \
+ if (used == ASCII_set) \
+ { \
STANDARD_ERR_HANDLER (4); \
} \
} \
***************
*** 488,494 ****
{ \
/* First see whether we announced that we use this \
character set. */ \
! if ((ann & (2 << used)) == 0) \
{ \
const char *escseq; \
\
--- 539,545 ----
{ \
/* First see whether we announced that we use this \
character set. */ \
! if ((used & SO_mask) != 0 && (ann & SO_ann) != (used << 8)) \
{ \
const char *escseq; \
\
***************
*** 499,516 ****
} \
\
assert (used >= 1 && used <= 4); \
! escseq = "\e$)A\e$)G\e$*H\e$)E" + (used - 1) * 4; \
*outptr++ = *escseq++; \
*outptr++ = *escseq++; \
*outptr++ = *escseq++; \
*outptr++ = *escseq++; \
\
! if (used == GB2312_set) \
! ann = (ann & CNS11643_2_ann) | GB2312_ann; \
! else if (used == CNS11643_1_set) \
! ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \
! else \
! ann |= CNS11643_2_ann; \
} \
\
if (used == CNS11643_2_set) \
--- 550,588 ----
} \
\
assert (used >= 1 && used <= 4); \
! escseq = ")A\0\0)G)E" + (used - 1) * 2; \
! *outptr++ = ESC; \
! *outptr++ = '$'; \
! *outptr++ = *escseq++; \
! *outptr++ = *escseq++; \
! \
! ann = (ann & ~SO_ann) | (used << 8); \
! } \
! else if ((used & SS2_mask) != 0 && (ann & SS2_ann) != (used << 8))\
! { \
! const char *escseq; \
! \
! assert (used == CNS11643_2_set); /* XXX */ \
! escseq = "*H"; \
! *outptr++ = ESC; \
! *outptr++ = '$'; \
*outptr++ = *escseq++; \
*outptr++ = *escseq++; \
+ \
+ ann = (ann & ~SS2_ann) | (used << 8); \
+ } \
+ else if ((used & SS3_mask) != 0 && (ann & SS3_ann) != (used << 8))\
+ { \
+ const char *escseq; \
+ \
+ assert ((used >> 5) >= 3 && (used >> 5) <= 7); \
+ escseq = "+I+J+K+L+M" + ((used >> 5) - 3) * 2; \
+ *outptr++ = ESC; \
+ *outptr++ = '$'; \
*outptr++ = *escseq++; \
*outptr++ = *escseq++; \
\
! ann = (ann & ~SS3_ann) | (used << 8); \
} \
\
if (used == CNS11643_2_set) \
***************
*** 523,528 ****
--- 595,610 ----
*outptr++ = SS2_0; \
*outptr++ = SS2_1; \
} \
+ else if (used >= CNS11643_3_set && used <= CNS11643_7_set) \
+ { \
+ if (outptr + 2 > outend) \
+ { \
+ result = __GCONV_FULL_OUTPUT; \
+ break; \
+ } \
+ *outptr++ = SS3_0; \
+ *outptr++ = SS3_1; \
+ } \
else \
{ \
/* We only have to emit something if currently ASCII is \
***************
*** 555,560 ****
--- 637,643 ----
\
*outptr++ = buf[0]; \
*outptr++ = buf[1]; \
+ set = used; \
} \
\
/* Now that we wrote the output increment the input pointer. */ \