This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH/cygwin]: Support GEORGIAN-PS and PT154 charsets


Hi,

I just applied the below patch.  It adds support for the GEORGIAN-PS and
PT154 charsets if _MB_EXTENDED_CHARSETS_WINDOWS is set.

The charset *names* are recognized by all targets, and that now includes
TIS-620 as well.  That was the blessed behaviour for the ISO-8859, CPxxx, and
KOI8 charsets already, so I implemented it for the new charsets the same way.

All new charsets being singlebyte charsets, they are by default mapped
to ASCII.  Only if _MB_EXTENDED_CHARSETS_WINDOWS is set, they are mapped
to the __cp_mbtowc and __cp_wctomb functions and the new entries in the
__ctype_cp and __cp_conv arrays are used.  I revamped the respective
part of the documentation to reflect the actual implementation now more
closely.

Since the extended functionality of the new charsets is only available for
Cygwin for now anyway, I checked this in.


Corinna


	* libc/ctype/ctype_cp.h (_CTYPE_GEORGIAN_PS_128_254): Define.
	(_CTYPE_GEORGIAN_PS_255): Define.
	(_CTYPE_PT154_128_254): Define.
	(_CTYPE_PT154_255): Define.
	(__ctype_cp): Add array members for above ctype definitions.
	* libc/locale/locale.c (loadlocale): Make TIS-620 charset name
	available for all targets.  Add guards for setting the conversion
	function pointers.  Add support for GEORGIAN-PS and PT154 charsets.
	Change documentation to reflect current behaviour more closely.
	* libc/locale/nl_langinfo.c (nl_langinfo): On Cygwin, translate
	"CP101" to "GEORGIAN-PS" and "CP102" to "PT154".
	* libc/stdlib/sb_charsets.c (__cp_conv): Add conversion arrays
	for GEORGIAN-PS and PT154.
	(__cp_index): Map invalid Windows codepage number 101 to
	GEORGIAN-PS conversion array, 102 to PT154 conversion array.


Index: libc/ctype/ctype_cp.h
===================================================================
RCS file: /cvs/src/src/newlib/libc/ctype/ctype_cp.h,v
retrieving revision 1.4
diff -u -p -r1.4 ctype_cp.h
--- libc/ctype/ctype_cp.h	24 Aug 2009 22:11:10 -0000	1.4
+++ libc/ctype/ctype_cp.h	6 Feb 2010 18:15:46 -0000
@@ -433,6 +433,42 @@
 	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
 	_U,	_U,	_U,	_U,	_U,	_U,	_U
 #define _CTYPE_CP21866_255 _U
+#define _CTYPE_GEORGIAN_PS_128_254 \
+   	_P,	0,	_P,	_L,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_U,	_P,	_U,	_U,	0,	0,  \
+	0,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_L,	_P,	_L,	0,	_L,	_U, \
+	_S|_B,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_P, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L
+#define _CTYPE_GEORGIAN_PS_255 _L
+#define _CTYPE_PT154_128_254 \
+   	_U,	_U,	_U,	_L,	_P,	_P,	_U,	_U, \
+	_U,	_L,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_L,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_S|_B,	_U,	_L,	_U,	_U,	_U,	_U,	_P, \
+	_U,	_P,	_U,	_P,	_P,	_L,	_P,	_U, \
+	_P,	_L,	_U,	_L,	_L,	_L,	_P,	_P, \
+	_L,	_P,	_L,	_P,	_L,	_U,	_L,	_L, \
+	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L
+#define _CTYPE_PT154_255 _L
 
 
 extern int __cp_index (const char *charset_ext);
@@ -442,7 +478,7 @@ extern int __cp_index (const char *chars
 #ifndef __CYGWIN__
 static _CONST
 #endif
-char __ctype_cp[24][128 + 256] = {
+char __ctype_cp[26][128 + 256] = {
   { _CTYPE_CP437_128_254,
     0,
     _CTYPE_DATA_0_127,
@@ -587,11 +623,23 @@ char __ctype_cp[24][128 + 256] = {
     _CTYPE_CP21866_128_254,
     _CTYPE_CP21866_255
   },
+  { _CTYPE_GEORGIAN_PS_128_254,
+    0,
+    _CTYPE_DATA_0_127,
+    _CTYPE_GEORGIAN_PS_128_254,
+    _CTYPE_GEORGIAN_PS_255
+  },
+  { _CTYPE_PT154_128_254,
+    0,
+    _CTYPE_DATA_0_127,
+    _CTYPE_PT154_128_254,
+    _CTYPE_PT154_255
+  },
 };
 
 #else /* !defined(ALLOW_NEGATIVE_CTYPE_INDEX) */
 
-static _CONST char __ctype_cp[22][1 + 256] = {
+static _CONST char __ctype_cp[26][1 + 256] = {
   { 0,
     _CTYPE_DATA_0_127,
     _CTYPE_CP437_128_254,
@@ -712,6 +760,16 @@ static _CONST char __ctype_cp[22][1 + 25
     _CTYPE_CP21866_128_254,
     _CTYPE_CP21866_255
   },
+  { 0,
+    _CTYPE_DATA_0_127,
+    _CTYPE_GEORGIAN_PS_128_254,
+    _CTYPE_GEORGIAN_PS_255
+  },
+  { 0,
+    _CTYPE_DATA_0_127,
+    _CTYPE_PT154_128_254,
+    _CTYPE_PT154_255
+  },
 };
 
 #endif /* ALLOW_NEGATIVE_CTYPE_INDEX */
Index: libc/locale/locale.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/locale/locale.c,v
retrieving revision 1.36
diff -u -p -r1.36 locale.c
--- libc/locale/locale.c	5 Feb 2010 21:24:42 -0000	1.36
+++ libc/locale/locale.c	6 Feb 2010 18:15:46 -0000
@@ -56,34 +56,36 @@ for a given language, a three character 
 <<"TERRITORY">> is a country code per ISO 3166.  For <<"charset">> and
 <<"modifier">> see below.
 
-Additionally to the POSIX specifier, seven extensions are supported for
-backward compatibility with older implementations using newlib:
-<<"C-UTF-8">>, <<"C-JIS">>, <<"C-eucJP">>, <<"C-SJIS">>, <<C-KOI8-R>>,
-<<C-KOI8-U>>, <<"C-ISO-8859-x">> with 1 <= x <= 15, or <<"C-CPxxx">> with
-xxx in [437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 932,
-1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258].
-
-Instead of <<"C-">>, you can specify also <<"C.">>.  Both variations allow
+Additionally to the POSIX specifier, the following extension is supported
+for backward compatibility with older implementations using newlib:
+<<"C-charset">>.
+Instead of <<"C-">>, you can also specify <<"C.">>.  Both variations allow
 to specify language neutral locales while using other charsets than ASCII,
 for instance <<"C.UTF-8">>, which keeps all settings as in the C locale,
 but uses the UTF-8 charset.
 
-Even when using POSIX locale strings, the only charsets allowed are
+The following charsets are recognized:
 <<"UTF-8">>, <<"JIS">>, <<"EUCJP">>, <<"SJIS">>, <<"KOI8-R">>, <<"KOI8-U">>,
-<<"ISO-8859-x">> with 1 <= x <= 15, or <<"CPxxx">> with xxx in
-[437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 932, 1125, 1250,
-1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258].
+<<"GEORGIAN-PS">>, <<"PT154">>, <<"TIS-620">>, <<"ISO-8859-x">> with
+1 <= x <= 16, or <<"CPxxx">> with xxx in [437, 720, 737, 775, 850, 852, 855,
+857, 858, 862, 866, 874, 932, 1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256,
+1257, 1258].
+
 Charsets are case insensitive.  For instance, <<"EUCJP">> and <<"eucJP">>
 are equivalent.  Charset names with dashes can also be written without
 dashes, as in <<"UTF8">>, <<"iso88591">> or <<"koi8r">>.  <<"EUCJP">> and
 <<"EUCKR"> are also recognized with dash, <<"EUC-JP">> and <<"EUC-KR">>.
 
+Full support for all of the above charsets requires that newlib has been
+built with multibyte support and support for all ISO and Windows Codepages.
+Otherwise all singlebyte charsets are simply mapped to ASCII.  Right now,
+only newlib for Cygwin is built with full charset support by default.
+Under Cygwin, this implementation additionally supports the charsets
+<<"GBK">>, <<"eucKR">>, and <<"Big5">>.  Cygwin does not support <<"JIS">>.
+
 (<<"">> is also accepted; if given, the settings are read from the
 corresponding LC_* environment variables and $LANG according to POSIX rules.
 
-Under Cygwin, this implementation additionally supports the charsets
-<<"GBK">>, <<"eucKR">>, <<"Big5">>, and <<"TIS-620">>.
-
 This implementation also supports a single modifier, <<"cjknarrow">>.
 Any other modifier is ignored.  <<"cjknarrow">>, in conjunction with one
 of the language specifiers <<"ja">>, <<"ko">>, and <<"zh">> specifies
@@ -720,38 +722,91 @@ loadlocale(struct _reent *p, int categor
       l_mbtowc = __ascii_mbtowc;
 #endif
       break;
-#ifdef __CYGWIN__
     case 'G':
     case 'g':
-      if (strcasecmp (charset, "GBK"))
-      	return NULL;
-      strcpy (charset, "GBK");
-      mbc_max = 2;
+#ifdef __CYGWIN__
+      if (!strcasecmp (charset, "GBK"))
+      	{
+	  strcpy (charset, "GBK");
+	  mbc_max = 2;
+#ifdef _MB_CAPABLE
+	  l_wctomb = __gbk_wctomb;
+	  l_mbtowc = __gbk_mbtowc;
+#endif
+	}
+      else
+#endif /* __CYGWIN__ */
+      /* GEORGIAN-PS and the alias without dash */
+      if (!strncasecmp (charset, "GEORGIAN", 8))
+	{
+	  c = charset + 8;
+	  if (*c == '-')
+	    ++c;
+	  if (strcasecmp (c, "PS"))
+	    return NULL;
+	  strcpy (charset, "CP101");
+	  mbc_max = 1;
 #ifdef _MB_CAPABLE
-      l_wctomb = __gbk_wctomb;
-      l_mbtowc = __gbk_mbtowc;
+#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
+	  l_wctomb = __cp_wctomb;
+	  l_mbtowc = __cp_mbtowc;
+#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */
+	  l_wctomb = __ascii_wctomb;
+	  l_mbtowc = __ascii_mbtowc;
+#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
 #endif
+	}
+      else
+	return NULL;
       break;
-    case 'B':
-    case 'b':
-      if (strcasecmp (charset, "BIG5"))
-      	return NULL;
-      strcpy (charset, "BIG5");
-      mbc_max = 2;
+    case 'P':
+    case 'p':
+      /* PT154 */
+      if (strcasecmp (charset, "PT154"))
+	return NULL;
+      strcpy (charset, "CP102");
+      mbc_max = 1;
 #ifdef _MB_CAPABLE
-      l_wctomb = __big5_wctomb;
-      l_mbtowc = __big5_mbtowc;
+#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
+      l_wctomb = __cp_wctomb;
+      l_mbtowc = __cp_mbtowc;
+#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */
+      l_wctomb = __ascii_wctomb;
+      l_mbtowc = __ascii_mbtowc;
+#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
 #endif
       break;
     case 'T':
     case 't':
-      if (strcasecmp (charset, "TIS620") && strcasecmp (charset, "TIS-620"))
+      if (strncasecmp (charset, "TIS", 3))
+      	return NULL;
+      c = charset + 3;
+      if (*c == '-')
+	++c;
+      if (strcasecmp (c, "620"))
       	return NULL;
       strcpy (charset, "CP874");
       mbc_max = 1;
 #ifdef _MB_CAPABLE
+#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
       l_wctomb = __cp_wctomb;
       l_mbtowc = __cp_mbtowc;
+#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */
+      l_wctomb = __ascii_wctomb;
+      l_mbtowc = __ascii_mbtowc;
+#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
+#endif
+      break;
+#ifdef __CYGWIN__
+    case 'B':
+    case 'b':
+      if (strcasecmp (charset, "BIG5"))
+      	return NULL;
+      strcpy (charset, "BIG5");
+      mbc_max = 2;
+#ifdef _MB_CAPABLE
+      l_wctomb = __big5_wctomb;
+      l_mbtowc = __big5_mbtowc;
 #endif
       break;
 #endif /* __CYGWIN__ */
Index: libc/locale/nl_langinfo.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/locale/nl_langinfo.c,v
retrieving revision 1.7
diff -u -p -r1.7 nl_langinfo.c
--- libc/locale/nl_langinfo.c	24 Jan 2010 12:18:20 -0000	1.7
+++ libc/locale/nl_langinfo.c	6 Feb 2010 18:15:46 -0000
@@ -78,6 +78,10 @@ _DEFUN(nl_langinfo, (item), 
 		      ret = "KOI8-R";
 		    else if (strcmp (ret + 2, "21866") == 0)
 		      ret = "KOI8-U";
+		    else if (strcmp (ret + 2, "101") == 0)
+		      ret = "GEORGIAN-PS";
+		    else if (strcmp (ret + 2, "102") == 0)
+		      ret = "PT154";
 		  }
 		else if (ret[0] == 'S'/*JIS*/)
 		  {
Index: libc/stdlib/sb_charsets.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/stdlib/sb_charsets.c,v
retrieving revision 1.4
diff -u -p -r1.4 sb_charsets.c
--- libc/stdlib/sb_charsets.c	29 Sep 2009 19:11:01 -0000	1.4
+++ libc/stdlib/sb_charsets.c	6 Feb 2010 18:15:46 -0000
@@ -203,7 +203,7 @@ wchar_t __iso_8859_conv[14][0x60] = {
    value (function __cp_index), the second index is the value of the
    incoming character - 0x80.
    Values < 0x80 don't have to be converted anyway. */
-wchar_t __cp_conv[24][0x80] = {
+wchar_t __cp_conv[26][0x80] = {
   /* CP437 */
   { 0xc7, 0xfc, 0xe9, 0xe2, 0xe4, 0xe0, 0xe5, 0xe7,
     0xea, 0xeb, 0xe8, 0xef, 0xee, 0xec, 0xc4, 0xc5,
@@ -611,7 +611,47 @@ wchar_t __cp_conv[24][0x80] = {
     0x42e, 0x410, 0x411, 0x426, 0x414, 0x415, 0x424, 0x413,
     0x425, 0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, 0x41e,
     0x41f, 0x42f, 0x420, 0x421, 0x422, 0x423, 0x416, 0x412,
-    0x42c, 0x42b, 0x417, 0x428, 0x42d, 0x429, 0x427, 0x42a }
+    0x42c, 0x42b, 0x417, 0x428, 0x42d, 0x429, 0x427, 0x42a },
+  /* The following are not valid Windows codepages, but they fit nicely here.
+     The CP numbers are only used internally and are guaranteed not to clash
+     with valid Windows codepage identifiers. */
+  /* CP101 (GEORGIAN-PS)  Georgian charset, used as the default charset in
+     the ka_GE locale (Georgian, Georgia).  Apparently derived from Windows
+     CP1252. */
+  { 0x80, 0x81, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021,
+    0x2c6, 0x2030, 0x160, 0x2039, 0x152, 0x8d, 0x8e, 0x8f,
+    0x90, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
+    0x2dc, 0x2122, 0x161, 0x203a, 0x153, 0x9d, 0x9e, 0x178, 
+    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 
+    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 
+    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 
+    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 
+    0x10d0, 0x10d1, 0x10d2, 0x10d3, 0x10d4, 0x10d5, 0x10d6, 0x10f1,
+    0x10d7, 0x10d8, 0x10d9, 0x10da, 0x10db, 0x10dc, 0x10f2, 0x10dd,
+    0x10de, 0x10df, 0x10e0, 0x10e1, 0x10e2, 0x10f3, 0x10e3, 0x10e4,
+    0x10e5, 0x10e6, 0x10e7, 0x10e8, 0x10e9, 0x10ea, 0x10eb, 0x10ec,
+    0x10ed, 0x10ee, 0x10f4, 0x10ef, 0x10f0, 0x10f5, 0xe6, 0xe7,
+    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 
+    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 
+    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff },
+  /* CP102 (PT154) Cyrillic-Asian charset, used as the default charset in
+     the kk_KZ locale (Kazakh, Kazakhstan). */
+  { 0x496, 0x492, 0x4ee, 0x493, 0x201e, 0x2026, 0x4b6, 0x4ae,
+    0x4b2, 0x4af, 0x4a0, 0x4e2, 0x4a2, 0x49a, 0x4ba, 0x4b8, 
+    0x497, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
+    0x4b3, 0x4b7, 0x4a1, 0x4e3, 0x4a3, 0x49b, 0x4bb, 0x4b9, 
+    0xa0, 0x40e, 0x45e, 0x408, 0x4e8, 0x498, 0x4b0, 0xa7,
+    0x401, 0xa9, 0x4d8, 0xab, 0xac, 0x4ef, 0xae, 0x49c,
+    0xb0, 0x4b1, 0x406, 0x456, 0x499, 0x4e9, 0xb6, 0xb7,
+    0x451, 0x2116, 0x4d9, 0xbb, 0x458, 0x4aa, 0x4ab, 0x49d, 
+    0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 
+    0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, 0x41e, 0x41f, 
+    0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 
+    0x428, 0x429, 0x42a, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f, 
+    0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 
+    0x438, 0x439, 0x43a, 0x43b, 0x43c, 0x43d, 0x43e, 0x43f, 
+    0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 
+    0x448, 0x449, 0x44a, 0x44b, 0x44c, 0x44d, 0x44e, 0x44f }
 };
 #endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
 
@@ -727,6 +767,12 @@ __cp_index (const char *charset_ext)
     case 21866:
       cp_idx = 23;
       break;
+    case 101:
+      cp_idx = 24;
+      break;
+    case 102:
+      cp_idx = 25;
+      break;
     default:
       cp_idx = -1;
       break;


-- 
Corinna Vinschen
Cygwin Project Co-Leader
Red Hat


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]