This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

surrogate handling in UTF-16 and UCS-2 converters



Markus Kuhn pointed out that an UTF-8 to UTF-16 conversion should reject
code points which belong to single surrogates (0xD800..0xDFFF), otherwise
we have a security problem: An attacker could provide an UTF-8 encoded
input, which passes some kind of application dependent validation, and
then gets converted to UTF-16 where neighbour surrogate codes suddenly get
a different semantics. We don't know yet which kinds of characters will
appear in Unicode planes 1 to 16 and which properties this will have.

This patch blocks this attack by rejecting single surrogates in the
INTERNAL to UTF-16 converter. Same for the INTERNAL to UCS-2 converter,
because UCS-2 strings are sometimes interpreted as UTF-16 (some applications
are currently silently switching from UCS-2 as internal representation
to UTF-16).

The patch also rejects single surrogates in the UCS-2 to INTERNAL direction,
for consistency. (If surrogates occur in UCS-2 input, it means the input is
actually UTF-16, and applications should be told about it.)


2000-09-17  Bruno Haible  <haible@clisp.cons.org>

	* iconvdata/utf-16.c (BODY for TO_LOOP): Reject UCS-4 input in the
	range 0xD800..0xDFFF.
	* iconvdata/unicode.c (BODY for TO_LOOP): Likewise.
	(BODY for FROM_LOOP): Likewise.
	* iconv/gconv_simple.c (ucs2_internal_loop): Likewise.
	(internal_ucs2_loop): Likewise.
	(ucs2reverse_internal_loop): Likewise.
	(internal_ucs2reverse_loop): Likewise.

*** glibc-20000914/iconvdata/utf-16.c.bak	Mon Jul  3 16:39:27 2000
--- glibc-20000914/iconvdata/utf-16.c	Thu Sep 14 11:26:49 2000
***************
*** 195,200 ****
--- 195,216 ----
  #define BODY \
    {									      \
      uint32_t c = get32 (inptr);						      \
+ 									      \
+     if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0))		      \
+       {									      \
+ 	/* Surrogate characters in UCS-4 input are not valid.		      \
+ 	   We must catch this.  If we let surrogates pass through,	      \
+ 	   attackers could make a security hole exploit by		      \
+ 	   synthesizing any desired plane 1-16 character.  */		      \
+ 	if (! ignore_errors_p ())					      \
+ 	  {								      \
+ 	    result = __GCONV_ILLEGAL_INPUT;				      \
+ 	    break;							      \
+ 	  }								      \
+ 	inptr += 4;							      \
+ 	++*irreversible;						      \
+ 	continue;							      \
+       }									      \
  									      \
      if (swap)								      \
        {									      \
*** glibc-20000914/iconvdata/unicode.c.bak	Fri Aug 25 23:52:57 2000
--- glibc-20000914/iconvdata/unicode.c	Thu Sep 14 11:52:45 2000
***************
*** 154,159 ****
--- 154,176 ----
        {									      \
  	STANDARD_ERR_HANDLER (4);					      \
        }									      \
+     else if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0))		      \
+       {									      \
+ 	/* Surrogate characters in UCS-4 input are not valid.		      \
+ 	   We must catch this, because the UCS-2 output might be	      \
+ 	   interpreted as UTF-16 by other programs.  If we let		      \
+ 	   surrogates pass through, attackers could make a security	      \
+ 	   hole exploit by synthesizing any desired plane 1-16		      \
+ 	   character.  */						      \
+ 	if (! ignore_errors_p ())					      \
+ 	  {								      \
+ 	    result = __GCONV_ILLEGAL_INPUT;				      \
+ 	    break;							      \
+ 	  }								      \
+ 	inptr += 4;							      \
+ 	++*irreversible;						      \
+ 	continue;							      \
+       }									      \
      else								      \
        {									      \
  	put16 (outptr, c);						      \
***************
*** 179,189 ****
--- 196,221 ----
      if (swap)								      \
        u1 = bswap_16 (u1);						      \
  									      \
+     if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0))		      \
+       {									      \
+ 	/* Surrogate characters in UCS-2 input are not valid.  Reject	      \
+ 	   them.  (Catching this here is not security relevant.)  */	      \
+ 	if (! ignore_errors_p ())					      \
+ 	  {								      \
+ 	    result = __GCONV_ILLEGAL_INPUT;				      \
+ 	    break;							      \
+ 	  }								      \
+ 	inptr += 2;							      \
+ 	++*irreversible;						      \
+ 	continue;							      \
+       }									      \
+ 									      \
      put32 (outptr, u1);							      \
  									      \
      inptr += 2;								      \
      outptr += 4;							      \
    }
+ #define LOOP_NEED_FLAGS
  #define EXTRA_LOOP_DECLS \
  	, int swap
  #include <iconv/loop.c>
*** glibc-20000914/iconv/gconv_simple.c.bak	Tue Sep  5 15:24:48 2000
--- glibc-20000914/iconv/gconv_simple.c	Thu Sep 14 11:57:22 2000
***************
*** 773,779 ****
        }									      \
      else								      \
        /* It's an one byte sequence.  */					      \
-       /* XXX unaligned.  */						      \
        *((uint32_t *) outptr)++ = *inptr++;				      \
    }
  #define LOOP_NEED_FLAGS
--- 773,778 ----
***************
*** 797,803 ****
  #define LOOPFCT			FROM_LOOP
  #define BODY \
    {									      \
-     /* XXX unaligned.  */						      \
      if (__builtin_expect (*((uint32_t *) inptr), 0) > 0x7f)		      \
        {									      \
  	STANDARD_ERR_HANDLER (4);					      \
--- 796,801 ----
***************
*** 1147,1153 ****
  #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
  #define LOOPFCT			FROM_LOOP
  #define BODY \
!   *((uint32_t *) outptr)++ = *((uint16_t *) inptr)++;
  #include <iconv/loop.c>
  #include <iconv/skeleton.c>
  
--- 1145,1171 ----
  #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
  #define LOOPFCT			FROM_LOOP
  #define BODY \
!   {									      \
!     uint16_t u1 = *((uint16_t *) inptr);				      \
! 									      \
!     if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0))		      \
!       {									      \
! 	/* Surrogate characters in UCS-2 input are not valid.  Reject	      \
! 	   them.  (Catching this here is not security relevant.)  */	      \
! 	if (! ignore_errors_p ())					      \
! 	  {								      \
! 	    result = __GCONV_ILLEGAL_INPUT;				      \
! 	    break;							      \
! 	  }								      \
! 	inptr += 2;							      \
! 	++*irreversible;						      \
! 	continue;							      \
!       }									      \
! 									      \
!     *((uint32_t *) outptr)++ = u1;					      \
!     inptr += 2;								      \
!   }
! #define LOOP_NEED_FLAGS
  #include <iconv/loop.c>
  #include <iconv/skeleton.c>
  
***************
*** 1168,1179 ****
  #define LOOPFCT			FROM_LOOP
  #define BODY \
    {									      \
!     if (__builtin_expect (*((uint32_t *) inptr), 0) >= 0x10000)		      \
        {									      \
  	STANDARD_ERR_HANDLER (4);					      \
        }									      \
      else 								      \
!       *((uint16_t *) outptr)++ = *((uint32_t *) inptr)++;		      \
    }
  #define LOOP_NEED_FLAGS
  #include <iconv/loop.c>
--- 1186,1219 ----
  #define LOOPFCT			FROM_LOOP
  #define BODY \
    {									      \
!     uint32_t val = *((uint32_t *) inptr);				      \
! 									      \
!     if (__builtin_expect (val, 0) >= 0x10000)				      \
        {									      \
  	STANDARD_ERR_HANDLER (4);					      \
        }									      \
+     else if (__builtin_expect (val >= 0xd800 && val < 0xe000, 0))	      \
+       {									      \
+ 	/* Surrogate characters in UCS-4 input are not valid.		      \
+ 	   We must catch this, because the UCS-2 output might be	      \
+ 	   interpreted as UTF-16 by other programs.  If we let		      \
+ 	   surrogates pass through, attackers could make a security	      \
+ 	   hole exploit by synthesizing any desired plane 1-16		      \
+ 	   character.  */						      \
+ 	if (! ignore_errors_p ())					      \
+ 	  {								      \
+ 	    result = __GCONV_ILLEGAL_INPUT;				      \
+ 	    break;							      \
+ 	  }								      \
+ 	inptr += 4;							      \
+ 	++*irreversible;						      \
+ 	continue;							      \
+       }									      \
      else 								      \
!       {									      \
! 	*((uint16_t *) outptr)++ = val;					      \
! 	inptr += 4;							      \
!       }									      \
    }
  #define LOOP_NEED_FLAGS
  #include <iconv/loop.c>
***************
*** 1195,1202 ****
  #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
  #define LOOPFCT			FROM_LOOP
  #define BODY \
!   *((uint32_t *) outptr)++ = bswap_16 (*(uint16_t *) inptr);		      \
!   inptr += 2;
  #include <iconv/loop.c>
  #include <iconv/skeleton.c>
  
--- 1235,1261 ----
  #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
  #define LOOPFCT			FROM_LOOP
  #define BODY \
!   {									      \
!     uint16_t u1 = bswap_16 (*((uint16_t *) inptr));			      \
! 									      \
!     if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0))		      \
!       {									      \
! 	/* Surrogate characters in UCS-2 input are not valid.  Reject	      \
! 	   them.  (Catching this here is not security relevant.)  */	      \
! 	if (! ignore_errors_p ())					      \
! 	  {								      \
! 	    result = __GCONV_ILLEGAL_INPUT;				      \
! 	    break;							      \
! 	  }								      \
! 	inptr += 2;							      \
! 	++*irreversible;						      \
! 	continue;							      \
!       }									      \
! 									      \
!     *((uint32_t *) outptr)++ = u1;					      \
!     inptr += 2;								      \
!   }
! #define LOOP_NEED_FLAGS
  #include <iconv/loop.c>
  #include <iconv/skeleton.c>
  
***************
*** 1222,1229 ****
        {									      \
  	STANDARD_ERR_HANDLER (4);					      \
        }									      \
!     *((uint16_t *) outptr)++ = bswap_16 (val);				      \
!     inptr += 4;								      \
    }
  #define LOOP_NEED_FLAGS
  #include <iconv/loop.c>
--- 1281,1308 ----
        {									      \
  	STANDARD_ERR_HANDLER (4);					      \
        }									      \
!     else if (__builtin_expect (val >= 0xd800 && val < 0xe000, 0))	      \
!       {									      \
! 	/* Surrogate characters in UCS-4 input are not valid.		      \
! 	   We must catch this, because the UCS-2 output might be	      \
! 	   interpreted as UTF-16 by other programs.  If we let		      \
! 	   surrogates pass through, attackers could make a security	      \
! 	   hole exploit by synthesizing any desired plane 1-16		      \
! 	   character.  */						      \
! 	if (! ignore_errors_p ())					      \
! 	  {								      \
! 	    result = __GCONV_ILLEGAL_INPUT;				      \
! 	    break;							      \
! 	  }								      \
! 	inptr += 4;							      \
! 	++*irreversible;						      \
! 	continue;							      \
!       }									      \
!     else 								      \
!       {									      \
! 	*((uint16_t *) outptr)++ = bswap_16 (val);			      \
! 	inptr += 4;							      \
!       }									      \
    }
  #define LOOP_NEED_FLAGS
  #include <iconv/loop.c>

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]