This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Fix regex ->translate handling


Hi!

RE_TRANSLATE_TYPE is cleverly char * (used to be in old regex too), while
all uses of it need to be (unsigned char) preg->translate[ch].
Without this patch (which makes internal vars use unsigned char *)
bug-regex22 segfaults.
Also, it seems old regex differentiated between "\W" and "\w" patterns when
re.translate['W'] is 'w', so this patch makes that work as well.
Note, however, that old regex apparently didn't consider de_DE.ISO-8859-1
'\xc4' or '\xd6' (translated to '\xe4' or '\xf6') as word constituent
characters, though that was clearly an old regex bug
(
/* Jim Meyering writes:

   "... Some ctype macros are valid only for character codes that
   isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
   using /bin/cc or gcc but without giving an ansi option).  So, all
   ctype uses should be through macros like ISPRINT...  If
   STDC_HEADERS is defined, then autoconf has verified that the ctype
   macros don't need to be guarded with references to isascii. ...
   Defining isascii to 1 should let any compiler worth its salt
   eliminate the && through constant folding."
   Solaris defines some of these symbols so we must undefine them first.  */

# undef ISASCII
# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
#  define ISASCII(c) 1
# else
#  define ISASCII(c) isascii(c)
# endif
...
# define ISALNUM(c) (ISASCII (c) && isalnum (c))
).

2004-01-05  Jakub Jelinek  <jakub@redhat.com>

	* posix/regcomp.c (build_charclass, build_charclass_op): Change first
	argument to unsigned RE_TRANSLATE_TYPE.
	* posix/regex_internal.h (re_string_t): Change trans type to
	unsigned RE_TRANSLATE_TYPE.
	* posix/regex_internal.c (re_string_construct_common): Cast
	trans to unsigned RE_TRANSLATE_TYPE.
	(re_string_peek_byte_case, re_string_fetch_byte_case): Avoid fast
	path if pstr->trans.  Never translate the character through
	pstr->trans.
	* posix/Makefile (tests): Add bug-regex22.
	(bug-regex22-ENV): Set.
	* posix/bug-regex22.c: New test.

--- libc/posix/regcomp.c.jj	2004-01-02 23:48:33.000000000 +0100
+++ libc/posix/regcomp.c	2004-01-05 12:53:47.000000000 +0100
@@ -109,7 +109,7 @@ static reg_errcode_t build_equiv_class (
 					re_charset_t *mbcset,
 					int *equiv_class_alloc,
 					const unsigned char *name);
-static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
+static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 				      re_bitset_ptr_t sbcset,
 				      re_charset_t *mbcset,
 				      int *char_class_alloc,
@@ -118,12 +118,13 @@ static reg_errcode_t build_charclass (RE
 #else  /* not RE_ENABLE_I18N */
 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 					const unsigned char *name);
-static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
+static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 				      re_bitset_ptr_t sbcset,
 				      const unsigned char *class_name,
 				      reg_syntax_t syntax);
 #endif /* not RE_ENABLE_I18N */
-static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
+static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
+				       unsigned RE_TRANSLATE_TYPE trans,
 				       const unsigned char *class_name,
 				       const unsigned char *extra, int not,
 				       reg_errcode_t *err);
@@ -3401,7 +3402,7 @@ build_charclass (trans, sbcset, mbcset, 
 #else /* not RE_ENABLE_I18N */
 build_charclass (trans, sbcset, class_name, syntax)
 #endif /* not RE_ENABLE_I18N */
-     RE_TRANSLATE_TYPE trans;
+     unsigned RE_TRANSLATE_TYPE trans;
      re_bitset_ptr_t sbcset;
      const unsigned char *class_name;
      reg_syntax_t syntax;
@@ -3476,7 +3477,7 @@ build_charclass (trans, sbcset, class_na
 static bin_tree_t *
 build_charclass_op (dfa, trans, class_name, extra, not, err)
      re_dfa_t *dfa;
-     RE_TRANSLATE_TYPE trans;
+     unsigned RE_TRANSLATE_TYPE trans;
      const unsigned char *class_name;
      const unsigned char *extra;
      int not;
--- libc/posix/regex_internal.h.jj	2004-01-03 14:13:05.000000000 +0100
+++ libc/posix/regex_internal.h	2004-01-05 12:54:14.000000000 +0100
@@ -337,7 +337,7 @@ struct re_string_t
      the beginning of the input string.  */
   unsigned int tip_context;
   /* The translation passed as a part of an argument of re_compile_pattern.  */
-  RE_TRANSLATE_TYPE trans;
+  unsigned RE_TRANSLATE_TYPE trans;
   /* Copy of re_dfa_t's word_char.  */
   re_const_bitset_ptr_t word_char;
   /* 1 if REG_ICASE.  */
--- libc/posix/regex_internal.c.jj	2004-01-03 13:42:55.000000000 +0100
+++ libc/posix/regex_internal.c	2004-01-05 14:48:00.000000000 +0100
@@ -189,7 +189,7 @@ re_string_construct_common (str, len, ps
   pstr->raw_mbs = (const unsigned char *) str;
   pstr->len = len;
   pstr->raw_len = len;
-  pstr->trans = trans;
+  pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
   pstr->icase = icase ? 1 : 0;
   pstr->mbs_allocated = (trans != NULL || icase);
   pstr->mb_cur_max = dfa->mb_cur_max;
@@ -758,7 +758,7 @@ re_string_peek_byte_case (pstr, idx)
   int ch, off;
 
   /* Handle the common (easiest) cases first.  */
-  if (BE (!pstr->icase, 1))
+  if (BE (!pstr->mbs_allocated, 1))
     return re_string_peek_byte (pstr, idx);
 
 #ifdef RE_ENABLE_I18N
@@ -774,8 +774,6 @@ re_string_peek_byte_case (pstr, idx)
 #endif
 
   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
-  if (pstr->trans)
-    ch = pstr->trans[ch];
 
 #ifdef RE_ENABLE_I18N
   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
@@ -793,15 +791,13 @@ static unsigned char
 re_string_fetch_byte_case (pstr)
      re_string_t *pstr;
 {
-  int ch;
-
-  if (BE (!pstr->icase, 1))
+  if (BE (!pstr->mbs_allocated, 1))
     return re_string_fetch_byte (pstr);
 
 #ifdef RE_ENABLE_I18N
   if (pstr->offsets_needed)
     {
-      int off;
+      int off, ch;
 
       /* For tr_TR.UTF-8 [[:islower:]] there is
 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
@@ -815,8 +811,6 @@ re_string_fetch_byte_case (pstr)
 
       off = pstr->offsets[pstr->cur_idx];
       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
-      if (pstr->trans)
-	ch = pstr->trans[ch];
 
       if (! isascii (ch))
 	return re_string_fetch_byte (pstr);
@@ -827,10 +821,7 @@ re_string_fetch_byte_case (pstr)
     }
 #endif
 
-  ch = pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
-  if (pstr->trans)
-    ch = pstr->trans[ch];
-  return ch;
+  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 }
 
 static void
--- libc/posix/Makefile.jj	2003-12-04 14:47:27.000000000 +0100
+++ libc/posix/Makefile	2004-01-05 13:21:25.000000000 +0100
@@ -1,4 +1,4 @@
-# Copyright (C) 1991-1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+# Copyright (C) 1991-1999, 2000-2003, 2004 Free Software Foundation, Inc.
 # This file is part of the GNU C Library.
 
 # The GNU C Library is free software; you can redistribute it and/or
@@ -79,8 +79,8 @@ tests		:= tstgetopt testfnm runtests run
 		   bug-regex8 bug-regex9 bug-regex10 bug-regex11 bug-regex12 \
 		   bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
 		   bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
-		   bug-regex21 tst-nice tst-nanosleep transbug tst-rxspencer \
-		   tst-pcre tst-boost
+		   bug-regex21 bug-regex22 tst-nice tst-nanosleep \
+		   transbug tst-rxspencer tst-pcre tst-boost
 ifeq (yes,$(build-shared))
 test-srcs	:= globtest
 tests           += wordexp-test tst-exec tst-spawn
@@ -162,6 +162,7 @@ bug-regex17-ENV = LOCPATH=$(common-objpf
 bug-regex18-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex19-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata
+bug-regex22-ENV = LOCPATH=$(common-objpfx)localedata
 tst-rxspencer-ARGS = --utf8 rxspencer/tests
 tst-rxspencer-ENV = LOCPATH=$(common-objpfx)localedata
 tst-pcre-ARGS = PCRE.tests
--- libc/posix/bug-regex22.c.jj	2004-01-05 12:57:31.000000000 +0100
+++ libc/posix/bug-regex22.c	2004-01-05 14:47:48.000000000 +0100
@@ -0,0 +1,109 @@
+/* Test re.translate != NULL.
+   Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Jakub Jelinek <jakub@redhat.com>, 2004.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <ctype.h>
+#include <locale.h>
+#include <regex.h>
+#include <stdio.h>
+#include <string.h>
+
+int
+main (void)
+{
+  struct re_pattern_buffer re;
+  char trans[256];
+  int i, result = 0;
+  const char *s;
+
+  setlocale (LC_ALL, "de_DE.ISO-8859-1");
+
+  for (i = 0; i < 256; ++i)
+    trans[i] = tolower (i);
+
+  re_set_syntax (RE_SYNTAX_POSIX_EGREP);
+
+  memset (&re, 0, sizeof (re));
+  re.translate = trans;
+  s = re_compile_pattern ("\\W", 2, &re);
+
+  if (s != NULL)
+    {
+      printf ("failed to compile pattern \"\\W\": %s\n", s);
+      result = 1;
+    }
+  else
+    {
+      int ret = re_search (&re, "abc.de", 6, 0, 6, NULL);
+      if (ret != 3)
+	{
+	  printf ("1st re_search returned %d\n", ret);
+	  result = 1;
+	}
+
+      ret = re_search (&re, "\xc4\xd6\xae\xf7", 4, 0, 4, NULL);
+      if (ret != 2)
+	{
+	  printf ("2nd re_search returned %d\n", ret);
+	  result = 1;
+	}
+      re.translate = NULL;
+      regfree (&re);
+    }
+
+  memset (&re, 0, sizeof (re));
+  re.translate = trans;
+  s = re_compile_pattern ("\\w", 2, &re);
+
+  if (s != NULL)
+    {
+      printf ("failed to compile pattern \"\\w\": %s\n", s);
+      result = 1;
+    }
+  else
+    {
+      int ret = re_search (&re, ".,!abc", 6, 0, 6, NULL);
+      if (ret != 3)
+	{
+	  printf ("3rd re_search returned %d\n", ret);
+	  result = 1;
+	}
+
+      ret = re_search (&re, "\xae\xf7\xc4\xd6", 4, 0, 4, NULL);
+      if (ret != 2)
+	{
+	  printf ("4th re_search returned %d\n", ret);
+	  result = 1;
+	}
+      re.translate = NULL;
+      regfree (&re);
+    }
+
+  memset (&re, 0, sizeof (re));
+  re.translate = trans;
+  s = re_compile_pattern ("[[:DIGIT:]]", 2, &re);
+  if (s == NULL)
+    {
+      printf ("compilation of \"[[:DIGIT:]]\" pattern unexpectedly succeeded: %s\n",
+	      s);
+      result = 1;
+    }
+
+  return result;
+}

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]