This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Remove erroneous Unicode conversions from _wctomb_r and _mbtowc_r


Ping?

On Feb 17 18:49, Corinna Vinschen wrote:
> Hi,
> 
> the conversion function _mbtowc_r converts 5 and 6 byte UTF-8 sequences
> into their wchar_t counterparts.  Vice versa, _wctomb_r converts wchar_t
> values above 0x10ffff into 4, 5 and 6 byte UTF-8 sequences.  However, per
> the Unicode standard (http://www.unicode.org/standard/standard.html),
> these values are invalid.  Unicode is restricted to the value range
> 0x000000 to 0x10ffff.  Any character outside this range has to be
> treated as invalid.
> 
> The patch below fixes the two functions to handle only valid Unicode
> characters.
> 
> 
> Corinna
> 
> 
> 	* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
> 	sequences since they are invalid in the Unicode standard.
> 	* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t
> 	values beyond 0x10ffff into UTF-8 chars.
> 
> 
> Index: libc/stdlib/mbtowc_r.c
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/stdlib/mbtowc_r.c,v
> retrieving revision 1.7
> diff -u -p -r1.7 mbtowc_r.c
> --- libc/stdlib/mbtowc_r.c	23 Apr 2004 21:44:22 -0000	1.7
> +++ libc/stdlib/mbtowc_r.c	17 Feb 2009 17:48:07 -0000
> @@ -193,120 +193,6 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state)
>  	  state->__count = 0;
>  	  return i;
>  	}
> -      else if (ch >= 0xf8 && ch <= 0xfb)
> -	{
> -	  /* five-byte sequence */
> -	  if (sizeof(wchar_t) < 4)
> -	    return -1; /* we can't store such a value */
> -	  state->__value.__wchb[0] = ch;
> -	  if (state->__count == 0)
> -	    state->__count = 1;
> -	  else
> -	    ++n;
> -	  if (n < 2)
> -	    return -2;
> -	  ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
> -	  if (state->__value.__wchb[0] == 0xf8 && ch < 0x88)
> -	    /* overlong UTF-8 sequence */
> -	    return -1;
> -	  if (ch < 0x80 || ch > 0xbf)
> -	    return -1;
> -	  state->__value.__wchb[1] = ch;
> -	  if (state->__count == 1)
> -	    state->__count = 2;
> -	  else
> -	    ++n;
> -	  if (n < 3)
> -	    return -2;
> -	  ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
> -	  if (ch < 0x80 || ch > 0xbf)
> -	    return -1;
> -	  state->__value.__wchb[2] = ch;
> -	  if (state->__count == 2)
> -	    state->__count = 3;
> -	  else
> -	    ++n;
> -	  if (n < 4)
> -	    return -2;
> -	  ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
> -	  if (ch < 0x80 || ch > 0xbf)
> -	    return -1;
> -	  state->__value.__wchb[3] = ch;
> -	  state->__count = 4;
> -	  if (n < 5)
> -	    return -2;
> -	  ch = t[i++];
> -	  *pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24)
> -	    |    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18)
> -	    |    (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12)
> -	    |    (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6)
> -	    |    (wchar_t)(ch & 0x3f);
> -	
> -	  state->__count = 0;
> -	  return i;
> -	}
> -      else if (ch >= 0xfc && ch <= 0xfd)
> -        {
> -          /* six-byte sequence */
> -	  int ch2;
> -	  if (sizeof(wchar_t) < 4)
> -	    return -1; /* we can't store such a value */
> -	  state->__value.__wchb[0] = ch;
> -	  if (state->__count == 0)
> -	    state->__count = 1;
> -	  else
> -	    ++n;
> -	  if (n < 2)
> -	    return -2;
> -	  ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
> -	  if (state->__value.__wchb[0] == 0xfc && ch < 0x84)
> -	    /* overlong UTF-8 sequence */
> -	    return -1;
> -	  if (ch < 0x80 || ch > 0xbf)
> -	    return -1;
> -	  state->__value.__wchb[1] = ch;
> -	  if (state->__count == 1)
> -	    state->__count = 2;
> -	  else
> -	    ++n;
> -	  if (n < 3)
> -	    return -2;
> -	  ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
> -	  if (ch < 0x80 || ch > 0xbf)
> -	    return -1;
> -	  state->__value.__wchb[2] = ch;
> -	  if (state->__count == 2)
> -	    state->__count = 3;
> -	  else
> -	    ++n;
> -	  if (n < 4)
> -	    return -2;
> -	  ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
> -	  if (ch < 0x80 || ch > 0xbf)
> -	    return -1;
> -	  state->__value.__wchb[3] = ch;
> -	  if (state->__count == 3)
> -	    state->__count = 4;
> -	  else
> -	    ++n;
> -	  if (n < 5)
> -	    return -2;
> -	  if (n == 5)
> -	    return -1; /* at this point we can't save enough to restart */
> -	  ch = t[i++];
> -	  if (ch < 0x80 || ch > 0xbf)
> -	    return -1;
> -	  ch2 = t[i++];
> -	  *pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30)
> -	    |    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24)
> -	    |    (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18)
> -	    |    (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12)
> -	    |    (wchar_t)((ch & 0x3f) << 6)
> -	    |    (wchar_t)(ch2 & 0x3f);
> -	
> -	  state->__count = 0;
> -	  return i;
> -	}
>        else
>  	return -1;
>      }      
> Index: libc/stdlib/wctomb_r.c
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/stdlib/wctomb_r.c,v
> retrieving revision 1.7
> diff -u -p -r1.7 wctomb_r.c
> --- libc/stdlib/wctomb_r.c	16 May 2007 19:31:06 -0000	1.7
> +++ libc/stdlib/wctomb_r.c	17 Feb 2009 17:48:07 -0000
> @@ -50,7 +50,7 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
>            *s   = 0x80 |  (wchar &   0x3f);
>            return 3;
>          }
> -      else if (wchar >= 0x10000 && wchar <= 0x1fffff)
> +      else if (wchar >= 0x10000 && wchar <= 0x10ffff)
>          {
>            *s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
>            *s++ = 0x80 | ((wchar &  0x3f000) >> 12);
> @@ -58,25 +58,6 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
>            *s   = 0x80 |  (wchar &     0x3f);
>            return 4;
>          }
> -      else if (wchar >= 0x200000 && wchar <= 0x3ffffff)
> -        {
> -          *s++ = 0xf8 | ((wchar & 0x3000000) >> 24);
> -          *s++ = 0x80 | ((wchar &  0xfc0000) >> 18);
> -          *s++ = 0x80 | ((wchar &   0x3f000) >> 12);
> -          *s++ = 0x80 | ((wchar &     0xfc0) >> 6);
> -          *s   = 0x80 |  (wchar &      0x3f);
> -          return 5;
> -        }
> -      else if (wchar >= 0x4000000 && wchar <= 0x7fffffff)
> -        {
> -          *s++ = 0xfc | ((wchar & 0x40000000) >> 30);
> -          *s++ = 0x80 | ((wchar & 0x3f000000) >> 24);
> -          *s++ = 0x80 | ((wchar &   0xfc0000) >> 18);
> -          *s++ = 0x80 | ((wchar &    0x3f000) >> 12);
> -          *s++ = 0x80 | ((wchar &      0xfc0) >> 6);
> -          *s   = 0x80 |  (wchar &       0x3f);
> -          return 6;
> -        }
>        else
>          return -1;
>      }
> 
> 
> -- 
> Corinna Vinschen
> Cygwin Project Co-Leader
> Red Hat

-- 
Corinna Vinschen
Cygwin Project Co-Leader
Red Hat


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]