This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Fix UTF-16 surrogate handling in __utf8_mbtowc


Corinna Vinschen wrote:
On Jul 28 14:56, Jeff Johnston wrote:
Please go ahead.

Thanks, will do. But here's a question:


+ tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
+ | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
+ | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6);
+ tmp = (tmp - 0x10000) >> 10;
+ /* Check if the sequence can fit into a surrogate pair at all.
+ If tmp is > 0x3ff at this point, the full Unicode value will
+ be > 0x10ffff. This is an invalid Unicode value and outside
+ of the defintion of UTF-16 surrogate pairs. */
+ if (tmp > 0x3ff)
+ {
+ r->_errno = EILSEQ;
+ return -1;
+ }

This code tests if the wide char value is within the boundaries of the Unicode specification, because it's impossible to represent bigger values (> 0x10ffff) as UTF-16 surrogate pair.

However, the surrounding code still allows values > 0x10ffff for UTF-32
systems.  So on these systems the code will happily allow and generate
non-Unicode values.

The question is, shouldn't the code be changed to disallow values beyond
0x10ffff on all systems, rather than just checking it in the UTF-16
case?

If the code allows those invalid sequences to generate and doesn't catch them at an earlier stage,
then it should be fixed, so go ahead, assuming you have tested the patch.


-- Jeff J.

New patch attached, which checks for values beyond the Unicode range
generically.  Whichever you prefer.


Corinna



* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Rework UTF-16 surrogate pair handling to be more bullet-proof even with incomplete UTF-8 sequences. Add check for 4 byte sequences resulting in values outside the valid Unicode range.


Index: libc/stdlib/mbtowc_r.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/stdlib/mbtowc_r.c,v
retrieving revision 1.14
diff -u -p -r1.14 mbtowc_r.c
--- libc/stdlib/mbtowc_r.c 28 Jul 2009 16:49:19 -0000 1.14
+++ libc/stdlib/mbtowc_r.c 28 Jul 2009 19:56:07 -0000
@@ -205,18 +205,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
if (n == 0)
return -2;
- if (state->__count == 4)
- {
- /* Create the second half of the surrogate pair. For a description
- see the comment below. */
- wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
- | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
- | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
- | (wchar_t)(state->__value.__wchb[3] & 0x3f);
- state->__count = 0;
- *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff);
- return 2;
- }
if (state->__count == 0)
ch = t[i++];
else
@@ -312,7 +300,7 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
*pwc = tmp;
return i;
}
- if (ch >= 0xf0 && ch <= 0xf7)
+ if (ch >= 0xf0 && ch <= 0xf4)
{
/* four-byte sequence */
wint_t tmp;
@@ -324,9 +312,10 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
if (n < 2)
return -2;
ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
- if (state->__value.__wchb[0] == 0xf0 && ch < 0x90)
+ if ((state->__value.__wchb[0] == 0xf0 && ch < 0x90)
+ || (state->__value.__wchb[0] == 0xf4 && ch >= 0x90))
{
- /* overlong UTF-8 sequence */
+ /* overlong UTF-8 sequence or result is > 0x10ffff */
r->_errno = EILSEQ;
return -1;
}
@@ -353,6 +342,26 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
state->__count = 3;
else if (n < (size_t)-1)
++n;
+ if (state->__count == 3 && sizeof(wchar_t) == 2)
+ {
+ /* On systems which have wchar_t being UTF-16 values, the value
+ doesn't fit into a single wchar_t in this case. So what we
+ do here is to store the state with a special value of __count
+ and return the first half of a surrogate pair. The first
+ three bytes of a UTF-8 sequence are enough to generate the
+ first half of a UTF-16 surrogate pair. As return value we
+ choose to return the number of bytes actually read up to
+ here.
+ The second half of the surrogate pair is returned in case we
+ recognize the special __count value of four, and the next
+ byte is actually a valid value. See below. */
+ tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
+ | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
+ | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6);
+ state->__count = 4;
+ *pwc = 0xd800 | ((tmp - 0x10000) >> 10);
+ return i;
+ }
if (n < 4)
return -2;
ch = t[i++];
@@ -365,21 +374,12 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
| (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
| (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
| (wint_t)(ch & 0x3f);
- if (tmp > 0xffff && sizeof(wchar_t) == 2)
- {
- /* On systems which have wchar_t being UTF-16 values, the value
- doesn't fit into a single wchar_t in this case. So what we
- do here is to store the state with a special value of __count
- and return the first half of a surrogate pair. As return
- value we choose to return the half of the actual UTF-8 char.
- The second half is returned in case we recognize the special
- __count value above. */
- state->__value.__wchb[3] = ch;
- state->__count = 4;
- *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff);
- return 2;
- }
- *pwc = tmp;
+ if (state->__count == 4 && sizeof(wchar_t) == 2)
+ /* Create the second half of the surrogate pair for systems with
+ wchar_t == UTF-16 . */
+ *pwc = 0xdc00 | (tmp & 0x3ff);
+ else
+ *pwc = tmp;
state->__count = 0;
return i;
}





Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]