This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
suggested patches for regex routines from gawk

From: Aharon Robbins <arnold at skeeve dot com>
To: libc-alpha at sourceware dot org
Date: Fri, 28 Dec 2012 12:09:51 +0200
Subject: suggested patches for regex routines from gawk
Hello All.

I use the GLIBC regex routines as my "upstream" for the regex routines
in gawk.  I have a set of patches that I have been carrying forward
for many years now, which I would like to see included into GLIBC.
(FWIW, the git people use my routines as their upstream, so getting
these into GLIBC would benefit more than just my project. :-)

The changes below fall into two main categories. The first is Rational
Range Interpretation, whereby [a-z] means ONLY the lowercase letters,
no matter what the locale is.  Gawk and grep have already gone this way,
bash will soon have an option to support it, and I think sed also
supports it via gnulib but I haven't looked there to check.  (The 2008
POSIX spec blesses this behavior, as well.)

The other category is simple portability fixes. This falls into a few
subcategories - removing a few gratutious GCC-isms, adding missing #ifdefs
(or rearranging code to use them properly), and adding code for non-GCC
compilers in one or two instances, including variable declarations at
the top of blocks for C 90 compilers. A few unused variables are also
just deleted.

I have a few other patches that may be more controversial, so I am
not submitting them right now.  If the changes below get in, that'd be
really great, and then I can submit the others.

These are relative to what's in master.

I am not subscribed to this list, so please cc me on any questions or
commentary, I will be happy to explain things if need be, but I think
the changes are pretty self-explanatory.

Please let me know if I need to sign paperwork for GLIBC or whatever.

Much thanks,

Arnold Robbins
-------------------------------
diff --git a/posix/regcomp.c b/posix/regcomp.c
index e85b235..482c41b 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -301,7 +301,7 @@ static void
 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 			 char *fastmap)
 {
-  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
+  volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
   int node_cnt;
   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
@@ -315,7 +315,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 #ifdef RE_ENABLE_I18N
 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 	    {
-	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
+	      unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 	      wchar_t wc;
 	      mbstate_t state;
 
@@ -331,6 +331,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
 		      != (size_t) -1))
 		re_set_fastmap (fastmap, 0, buf[0]);
+	      re_free (buf);
 	    }
 #endif
 	}
@@ -574,12 +575,15 @@ weak_alias (__regerror, regerror)
    UTF-8 is used.  Otherwise we would allocate memory just to initialize
    it the same all the time.  UTF-8 is the preferred encoding so this is
    a worthwhile optimization.  */
-static const bitset_t utf8_sb_map =
-{
+#if __GNUC__ >= 3
+static const bitset_t utf8_sb_map = {
   /* Set the first 128 bits.  */
   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 };
-#endif
+#else /* ! (__GNUC__ >= 3) */
+static bitset_t utf8_sb_map;
+#endif /* __GNUC__ >= 3 */
+#endif /* RE_ENABLE_I18N */
 
 
 static void
@@ -887,7 +891,21 @@ init_dfa (re_dfa_t *dfa, size_t pat_len)
   if (dfa->mb_cur_max > 1)
     {
       if (dfa->is_utf8)
-	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
+        {
+#if !defined(__GNUC__) || __GNUC__ < 3
+	  static short utf8_sb_map_inited = 0;
+
+	  if (! utf8_sb_map_inited)
+	    {
+		int i;
+
+	  	utf8_sb_map_inited = 0;
+		for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
+		  utf8_sb_map[i] = BITSET_WORD_MAX;
+	    }
+#endif
+	  dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
+	}
       else
 	{
 	  int i, j, ch;
@@ -925,9 +943,8 @@ static void
 internal_function
 init_word_char (re_dfa_t *dfa)
 {
+  int i, j, ch;
   dfa->word_ops_used = 1;
-  int i = 0;
-  int ch = 0;
   if (BE (dfa->map_notascii == 0, 1))
     {
       if (sizeof (dfa->word_char[0]) == 8)
@@ -959,8 +976,8 @@ init_word_char (re_dfa_t *dfa)
 	}
     }
 
-  for (; i < BITSET_WORDS; ++i)
-    for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
+  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
+    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
       if (isalnum (ch) || ch == '_')
 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
 }
@@ -2621,11 +2638,12 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
 static reg_errcode_t
 internal_function
 # ifdef RE_ENABLE_I18N
-build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
-		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
+build_range_exp (reg_syntax_t syntax, bitset_t sbcset, re_charset_t *mbcset,
+		int *range_alloc, bracket_elem_t *start_elem,
+		bracket_elem_t *end_elem)
 # else /* not RE_ENABLE_I18N */
-build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
-		 bracket_elem_t *end_elem)
+build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
+		bracket_elem_t *start_elem, bracket_elem_t *end_elem)
 # endif /* not RE_ENABLE_I18N */
 {
   unsigned int start_ch, end_ch;
@@ -2648,7 +2666,6 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
     wchar_t wc;
     wint_t start_wc;
     wint_t end_wc;
-    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
 
     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
@@ -2662,9 +2679,7 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
 	      ? __btowc (end_ch) : end_elem->opr.wch);
     if (start_wc == WEOF || end_wc == WEOF)
       return REG_ECOLLATE;
-    cmp_buf[0] = start_wc;
-    cmp_buf[4] = end_wc;
-    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
+    else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
       return REG_ERANGE;
 
     /* Got valid collation sequence values, add them as a new entry.
@@ -2705,10 +2720,8 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
     /* Build the table for single byte characters.  */
     for (wc = 0; wc < SBC_MAX; ++wc)
       {
-	cmp_buf[2] = wc;
-	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
-	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
-	  bitset_set (sbcset, wc);
+         if (start_wc <= wc && wc <= end_wc)
+           bitset_set (sbcset, wc);
       }
   }
 # else /* not RE_ENABLE_I18N */
@@ -3167,15 +3180,15 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	  token_len = peek_token_bracket (token, regexp, syntax);
 
 #ifdef _LIBC
-	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
+	  *err = build_range_exp (syntax, sbcset, mbcset, &range_alloc,
 				  &start_elem, &end_elem);
 #else
 # ifdef RE_ENABLE_I18N
-	  *err = build_range_exp (sbcset,
+	  *err = build_range_exp (syntax, sbcset,
 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
 				  &range_alloc, &start_elem, &end_elem);
 # else
-	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
+	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
 # endif
 #endif /* RE_ENABLE_I18N */
 	  if (BE (*err != REG_NOERROR, 0))
@@ -3446,6 +3459,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name)
 	return REG_ECOLLATE;
 
       /* Build single byte matcing table for this equivalence class.  */
+      char_buf[1] = (unsigned char) '\0';
       len = weights[idx1 & 0xffffff];
       for (ch = 0; ch < SBC_MAX; ++ch)
 	{
diff --git a/posix/regex_internal.c b/posix/regex_internal.c
index 9be8a53..c3960b0 100644
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@@ -518,7 +518,7 @@ re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
       /* Then proceed the next character.  */
       rawbuf_idx += mbclen;
     }
-  *last_wc = wc;
+  *last_wc = (wint_t) wc;
   return rawbuf_idx;
 }
 #endif /* RE_ENABLE_I18N  */
@@ -686,10 +686,10 @@ re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 	}
       else
 	{
+#ifdef RE_ENABLE_I18N
 	  /* No, skip all characters until IDX.  */
 	  int prev_valid_len = pstr->valid_len;
 
-#ifdef RE_ENABLE_I18N
 	  if (BE (pstr->offsets_needed, 0))
 	    {
 	      pstr->len = pstr->raw_len - idx + offset;
@@ -966,6 +966,16 @@ static reg_errcode_t
 internal_function __attribute_warn_unused_result__
 re_node_set_alloc (re_node_set *set, int size)
 {
+  /*
+   * valgrind says size can be 0, which then doesn't
+   * free the block of size 0.  Harumph. This seems
+   * to work ok, though.
+   */
+  if (size == 0)
+    {
+       memset(set, 0, sizeof(*set));
+       return REG_NOERROR;
+    }
   set->alloc = size;
   set->nelem = 0;
   set->elems = re_malloc (int, size);
@@ -1408,7 +1418,6 @@ static int
 internal_function
 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
 {
-  int type = token.type;
   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
     {
       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
@@ -1444,7 +1453,7 @@ re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
   dfa->nodes[dfa->nodes_len].constraint = 0;
 #ifdef RE_ENABLE_I18N
   dfa->nodes[dfa->nodes_len].accept_mb =
-    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
+    (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
 #endif
   dfa->nexts[dfa->nodes_len] = -1;
   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
diff --git a/posix/regex_internal.h b/posix/regex_internal.h
index 6dfdef6..54bbd1e 100644
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@@ -98,6 +98,9 @@
 # define BE(expr, val) __builtin_expect (expr, val)
 #else
 # define BE(expr, val) (expr)
+# ifdef inline
+# undef inline
+# endif
 # define inline
 #endif
 
@@ -112,7 +115,13 @@
 
 /* Rename to standard API for using out of glibc.  */
 #ifndef _LIBC
+# ifdef __wctype
+# undef __wctype
+# endif
 # define __wctype wctype
+# ifdef __iswctype
+# undef __iswctype
+# endif
 # define __iswctype iswctype
 # define __btowc btowc
 # define __mbrtowc mbrtowc
@@ -417,10 +426,9 @@ static unsigned int re_string_context_at (const re_string_t *input, int idx,
 #define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
 #define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
 
-#include <alloca.h>
-
 #ifndef _LIBC
 # if HAVE_ALLOCA
+#  include <alloca.h>
 /* The OS usually guarantees only one guard page at the bottom of the stack,
    and a page size can be as small as 4096 bytes.  So we cannot safely
    allocate anything larger than 4096 bytes.  Also care for the possibility
@@ -655,7 +663,9 @@ struct re_dfa_t
 #ifdef DEBUG
   char* re_str;
 #endif
+#if defined _LIBC
   __libc_lock_define (, lock)
+#endif
 };
 
 #define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
diff --git a/posix/regexec.c b/posix/regexec.c
index ec4ae13..8ededc5 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -226,7 +226,6 @@ regexec (preg, string, nmatch, pmatch, eflags)
 {
   reg_errcode_t err;
   int start, length;
-  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 
   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
     return REG_BADPAT;
@@ -414,7 +413,6 @@ re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
   regmatch_t *pmatch;
   int nregs, rval;
   int eflags = 0;
-  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 
   /* Check for out-of-range.  */
   if (BE (start < 0 || start > length, 0))
@@ -1440,9 +1438,11 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
   cur_node = dfa->init_node;
   re_node_set_init_empty (&eps_via_nodes);
 
+#ifdef HAVE_ALLOCA
   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
   else
+#endif
     {
       prev_idx_match = re_malloc (regmatch_t, nmatch);
       if (prev_idx_match == NULL)
@@ -2926,7 +2926,7 @@ check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
     }
 
-  str_idx = path->next_idx ?: top_str;
+  str_idx = path->next_idx ? path->next_idx : top_str;
 
   /* Temporary modify MCTX.  */
   backup_state_log = mctx->state_log;
@@ -3064,7 +3064,9 @@ check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
   const re_dfa_t *const dfa = mctx->dfa;
   int result;
   int cur_idx;
+#ifdef RE_ENABLE_I18N
   reg_errcode_t err = REG_NOERROR;
+#endif
   re_node_set union_set;
   re_node_set_init_empty (&union_set);
   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
@@ -3347,9 +3349,11 @@ build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
      from `state'.  `dests_node[i]' represents the nodes which i-th
      destination state contains, and `dests_ch[i]' represents the
      characters which i-th destination state accepts.  */
+#ifdef HAVE_ALLOCA
   if (__libc_use_alloca (sizeof (struct dests_alloc)))
     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
   else
+#endif
     {
       dests_alloc = re_malloc (struct dests_alloc, 1);
       if (BE (dests_alloc == NULL, 0))
@@ -3392,11 +3396,13 @@ build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
 	  0))
     goto out_free;
 
+#ifdef HAVE_ALLOCA
   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
 			 + ndests * 3 * sizeof (re_dfastate_t *)))
     dest_states = (re_dfastate_t **)
       alloca (ndests * 3 * sizeof (re_dfastate_t *));
   else
+#endif
     {
       dest_states = (re_dfastate_t **)
 	malloc (ndests * 3 * sizeof (re_dfastate_t *));
@@ -3755,6 +3761,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
   const re_token_t *node = dfa->nodes + node_idx;
   int char_len, elem_len;
   int i;
+  wint_t wc;
 
   if (BE (node->type == OP_UTF8_PERIOD, 0))
     {
@@ -3824,7 +3831,8 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
     }
 
   elem_len = re_string_elem_size_at (input, str_idx);
-  if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
+  wc = __btowc(*(input->mbs+str_idx));
+  if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
     return 0;
 
   if (node->type == COMPLEX_BRACKET)
@@ -3915,6 +3923,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 	  if (cset->nequiv_classes)
 	    {
 	      const unsigned char *cp = pin;
+          int32_t idx;
 	      table = (const int32_t *)
 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 	      weights = (const unsigned char *)
@@ -3923,7 +3932,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
 	      indirect = (const int32_t *)
 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
-	      int32_t idx = findidx (&cp, elem_len);
+	      idx = findidx (&cp, elem_len);
 	      if (idx > 0)
 		for (i = 0; i < cset->nequiv_classes; ++i)
 		  {
@@ -3954,18 +3963,10 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 # endif /* _LIBC */
 	{
 	  /* match with range expression?  */
-#if __GNUC__ >= 2
-	  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
-#else
-	  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
-	  cmp_buf[2] = wc;
-#endif
 	  for (i = 0; i < cset->nranges; ++i)
 	    {
-	      cmp_buf[0] = cset->range_starts[i];
-	      cmp_buf[4] = cset->range_ends[i];
-	      if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
-		  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+              if (cset->range_starts[i] <= wc
+                  && wc <= cset->range_ends[i])
 		{
 		  match_len = char_len;
 		  goto check_node_accept_bytes_match;
Follow-Ups:
- Re: suggested patches for regex routines from gawk
  - From: Andreas Schwab
- Re: suggested patches for regex routines from gawk
  - From: Roland McGrath
- Re: suggested patches for regex routines from gawk
  - From: OndÅej BÃlka
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]