This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
suggested patches for regex routines from gawk
- From: Aharon Robbins <arnold at skeeve dot com>
- To: libc-alpha at sourceware dot org
- Date: Fri, 28 Dec 2012 12:09:51 +0200
- Subject: suggested patches for regex routines from gawk
Hello All.
I use the GLIBC regex routines as my "upstream" for the regex routines
in gawk. I have a set of patches that I have been carrying forward
for many years now, which I would like to see included into GLIBC.
(FWIW, the git people use my routines as their upstream, so getting
these into GLIBC would benefit more than just my project. :-)
The changes below fall into two main categories. The first is Rational
Range Interpretation, whereby [a-z] means ONLY the lowercase letters,
no matter what the locale is. Gawk and grep have already gone this way,
bash will soon have an option to support it, and I think sed also
supports it via gnulib but I haven't looked there to check. (The 2008
POSIX spec blesses this behavior, as well.)
The other category is simple portability fixes. This falls into a few
subcategories - removing a few gratuitous GCC-isms, adding missing #ifdefs
(or rearranging code to use them properly), and adding code for non-GCC
compilers in one or two instances, including variable declarations at
the top of blocks for C90 compilers. A few unused variables are also
just deleted.
I have a few other patches that may be more controversial, so I am
not submitting them right now. If the changes below get in, that'd be
really great, and then I can submit the others.
These are relative to what's in master.
I am not subscribed to this list, so please cc me on any questions or
commentary. I will be happy to explain things if need be, but I think
the changes are pretty self-explanatory.
Please let me know if I need to sign paperwork for GLIBC or whatever.
Much thanks,
Arnold Robbins
-------------------------------
diff --git a/posix/regcomp.c b/posix/regcomp.c
index e85b235..482c41b 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -301,7 +301,7 @@ static void
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
char *fastmap)
{
- re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
+ volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
int node_cnt;
int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
@@ -315,7 +315,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
#ifdef RE_ENABLE_I18N
if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
{
- unsigned char *buf = alloca (dfa->mb_cur_max), *p;
+ unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
wchar_t wc;
mbstate_t state;
@@ -331,6 +331,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
&& (__wcrtomb ((char *) buf, towlower (wc), &state)
!= (size_t) -1))
re_set_fastmap (fastmap, 0, buf[0]);
+ re_free (buf);
}
#endif
}
@@ -574,12 +575,15 @@ weak_alias (__regerror, regerror)
UTF-8 is used. Otherwise we would allocate memory just to initialize
it the same all the time. UTF-8 is the preferred encoding so this is
a worthwhile optimization. */
-static const bitset_t utf8_sb_map =
-{
+#if __GNUC__ >= 3
+static const bitset_t utf8_sb_map = {
/* Set the first 128 bits. */
[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
};
-#endif
+#else /* ! (__GNUC__ >= 3) */
+static bitset_t utf8_sb_map;
+#endif /* __GNUC__ >= 3 */
+#endif /* RE_ENABLE_I18N */
static void
@@ -887,7 +891,21 @@ init_dfa (re_dfa_t *dfa, size_t pat_len)
if (dfa->mb_cur_max > 1)
{
if (dfa->is_utf8)
- dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
+ {
+#if !defined(__GNUC__) || __GNUC__ < 3
+ static short utf8_sb_map_inited = 0;
+
+ if (! utf8_sb_map_inited)
+ {
+ int i;
+
+ utf8_sb_map_inited = 1; /* Mark as done so the table is filled only once. */
+ for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
+ utf8_sb_map[i] = BITSET_WORD_MAX;
+ }
+#endif
+ dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
+ }
else
{
int i, j, ch;
@@ -925,9 +943,8 @@ static void
internal_function
init_word_char (re_dfa_t *dfa)
{
+ int i, j, ch;
dfa->word_ops_used = 1;
- int i = 0;
- int ch = 0;
if (BE (dfa->map_notascii == 0, 1))
{
if (sizeof (dfa->word_char[0]) == 8)
@@ -959,8 +976,8 @@ init_word_char (re_dfa_t *dfa)
}
}
- for (; i < BITSET_WORDS; ++i)
- for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
+ for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
+ for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
if (isalnum (ch) || ch == '_')
dfa->word_char[i] |= (bitset_word_t) 1 << j;
}
@@ -2621,11 +2638,12 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
static reg_errcode_t
internal_function
# ifdef RE_ENABLE_I18N
-build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
- bracket_elem_t *start_elem, bracket_elem_t *end_elem)
+build_range_exp (reg_syntax_t syntax, bitset_t sbcset, re_charset_t *mbcset,
+ int *range_alloc, bracket_elem_t *start_elem,
+ bracket_elem_t *end_elem)
# else /* not RE_ENABLE_I18N */
-build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
- bracket_elem_t *end_elem)
+build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
+ bracket_elem_t *start_elem, bracket_elem_t *end_elem)
# endif /* not RE_ENABLE_I18N */
{
unsigned int start_ch, end_ch;
@@ -2648,7 +2666,6 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
wchar_t wc;
wint_t start_wc;
wint_t end_wc;
- wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
@@ -2662,9 +2679,7 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
? __btowc (end_ch) : end_elem->opr.wch);
if (start_wc == WEOF || end_wc == WEOF)
return REG_ECOLLATE;
- cmp_buf[0] = start_wc;
- cmp_buf[4] = end_wc;
- if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
+ else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
return REG_ERANGE;
/* Got valid collation sequence values, add them as a new entry.
@@ -2705,10 +2720,8 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
/* Build the table for single byte characters. */
for (wc = 0; wc < SBC_MAX; ++wc)
{
- cmp_buf[2] = wc;
- if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
- && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
- bitset_set (sbcset, wc);
+ if (start_wc <= wc && wc <= end_wc)
+ bitset_set (sbcset, wc);
}
}
# else /* not RE_ENABLE_I18N */
@@ -3167,15 +3180,15 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
token_len = peek_token_bracket (token, regexp, syntax);
#ifdef _LIBC
- *err = build_range_exp (sbcset, mbcset, &range_alloc,
+ *err = build_range_exp (syntax, sbcset, mbcset, &range_alloc,
&start_elem, &end_elem);
#else
# ifdef RE_ENABLE_I18N
- *err = build_range_exp (sbcset,
+ *err = build_range_exp (syntax, sbcset,
dfa->mb_cur_max > 1 ? mbcset : NULL,
&range_alloc, &start_elem, &end_elem);
# else
- *err = build_range_exp (sbcset, &start_elem, &end_elem);
+ *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
# endif
#endif /* RE_ENABLE_I18N */
if (BE (*err != REG_NOERROR, 0))
@@ -3446,6 +3459,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name)
return REG_ECOLLATE;
/* Build single byte matcing table for this equivalence class. */
+ char_buf[1] = (unsigned char) '\0';
len = weights[idx1 & 0xffffff];
for (ch = 0; ch < SBC_MAX; ++ch)
{
diff --git a/posix/regex_internal.c b/posix/regex_internal.c
index 9be8a53..c3960b0 100644
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@@ -518,7 +518,7 @@ re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
/* Then proceed the next character. */
rawbuf_idx += mbclen;
}
- *last_wc = wc;
+ *last_wc = (wint_t) wc;
return rawbuf_idx;
}
#endif /* RE_ENABLE_I18N */
@@ -686,10 +686,10 @@ re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
}
else
{
+#ifdef RE_ENABLE_I18N
/* No, skip all characters until IDX. */
int prev_valid_len = pstr->valid_len;
-#ifdef RE_ENABLE_I18N
if (BE (pstr->offsets_needed, 0))
{
pstr->len = pstr->raw_len - idx + offset;
@@ -966,6 +966,16 @@ static reg_errcode_t
internal_function __attribute_warn_unused_result__
re_node_set_alloc (re_node_set *set, int size)
{
+ /*
+ * valgrind says size can be 0, which then doesn't
+ * free the block of size 0. Harumph. This seems
+ * to work ok, though.
+ */
+ if (size == 0)
+ {
+ memset(set, 0, sizeof(*set));
+ return REG_NOERROR;
+ }
set->alloc = size;
set->nelem = 0;
set->elems = re_malloc (int, size);
@@ -1408,7 +1418,6 @@ static int
internal_function
re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
{
- int type = token.type;
if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
{
size_t new_nodes_alloc = dfa->nodes_alloc * 2;
@@ -1444,7 +1453,7 @@ re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
dfa->nodes[dfa->nodes_len].constraint = 0;
#ifdef RE_ENABLE_I18N
dfa->nodes[dfa->nodes_len].accept_mb =
- (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
+ (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
#endif
dfa->nexts[dfa->nodes_len] = -1;
re_node_set_init_empty (dfa->edests + dfa->nodes_len);
diff --git a/posix/regex_internal.h b/posix/regex_internal.h
index 6dfdef6..54bbd1e 100644
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@@ -98,6 +98,9 @@
# define BE(expr, val) __builtin_expect (expr, val)
#else
# define BE(expr, val) (expr)
+# ifdef inline
+# undef inline
+# endif
# define inline
#endif
@@ -112,7 +115,13 @@
/* Rename to standard API for using out of glibc. */
#ifndef _LIBC
+# ifdef __wctype
+# undef __wctype
+# endif
# define __wctype wctype
+# ifdef __iswctype
+# undef __iswctype
+# endif
# define __iswctype iswctype
# define __btowc btowc
# define __mbrtowc mbrtowc
@@ -417,10 +426,9 @@ static unsigned int re_string_context_at (const re_string_t *input, int idx,
#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
-#include <alloca.h>
-
#ifndef _LIBC
# if HAVE_ALLOCA
+# include <alloca.h>
/* The OS usually guarantees only one guard page at the bottom of the stack,
and a page size can be as small as 4096 bytes. So we cannot safely
allocate anything larger than 4096 bytes. Also care for the possibility
@@ -655,7 +663,9 @@ struct re_dfa_t
#ifdef DEBUG
char* re_str;
#endif
+#if defined _LIBC
__libc_lock_define (, lock)
+#endif
};
#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
diff --git a/posix/regexec.c b/posix/regexec.c
index ec4ae13..8ededc5 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -226,7 +226,6 @@ regexec (preg, string, nmatch, pmatch, eflags)
{
reg_errcode_t err;
int start, length;
- re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
return REG_BADPAT;
@@ -414,7 +413,6 @@ re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
regmatch_t *pmatch;
int nregs, rval;
int eflags = 0;
- re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
/* Check for out-of-range. */
if (BE (start < 0 || start > length, 0))
@@ -1440,9 +1438,11 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
cur_node = dfa->init_node;
re_node_set_init_empty (&eps_via_nodes);
+#ifdef HAVE_ALLOCA
if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
else
+#endif
{
prev_idx_match = re_malloc (regmatch_t, nmatch);
if (prev_idx_match == NULL)
@@ -2926,7 +2926,7 @@ check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
}
- str_idx = path->next_idx ?: top_str;
+ str_idx = path->next_idx ? path->next_idx : top_str;
/* Temporary modify MCTX. */
backup_state_log = mctx->state_log;
@@ -3064,7 +3064,9 @@ check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
const re_dfa_t *const dfa = mctx->dfa;
int result;
int cur_idx;
+#ifdef RE_ENABLE_I18N
reg_errcode_t err = REG_NOERROR;
+#endif
re_node_set union_set;
re_node_set_init_empty (&union_set);
for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
@@ -3347,9 +3349,11 @@ build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
from `state'. `dests_node[i]' represents the nodes which i-th
destination state contains, and `dests_ch[i]' represents the
characters which i-th destination state accepts. */
+#ifdef HAVE_ALLOCA
if (__libc_use_alloca (sizeof (struct dests_alloc)))
dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
else
+#endif
{
dests_alloc = re_malloc (struct dests_alloc, 1);
if (BE (dests_alloc == NULL, 0))
@@ -3392,11 +3396,13 @@ build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
0))
goto out_free;
+#ifdef HAVE_ALLOCA
if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
+ ndests * 3 * sizeof (re_dfastate_t *)))
dest_states = (re_dfastate_t **)
alloca (ndests * 3 * sizeof (re_dfastate_t *));
else
+#endif
{
dest_states = (re_dfastate_t **)
malloc (ndests * 3 * sizeof (re_dfastate_t *));
@@ -3755,6 +3761,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
const re_token_t *node = dfa->nodes + node_idx;
int char_len, elem_len;
int i;
+ wint_t wc;
if (BE (node->type == OP_UTF8_PERIOD, 0))
{
@@ -3824,7 +3831,8 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
}
elem_len = re_string_elem_size_at (input, str_idx);
- if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
+ wc = __btowc(*(input->mbs+str_idx));
+ if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
return 0;
if (node->type == COMPLEX_BRACKET)
@@ -3915,6 +3923,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
if (cset->nequiv_classes)
{
const unsigned char *cp = pin;
+ int32_t idx;
table = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
weights = (const unsigned char *)
@@ -3923,7 +3932,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
indirect = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
- int32_t idx = findidx (&cp, elem_len);
+ idx = findidx (&cp, elem_len);
if (idx > 0)
for (i = 0; i < cset->nequiv_classes; ++i)
{
@@ -3954,18 +3963,10 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
# endif /* _LIBC */
{
/* match with range expression? */
-#if __GNUC__ >= 2
- wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
-#else
- wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
- cmp_buf[2] = wc;
-#endif
for (i = 0; i < cset->nranges; ++i)
{
- cmp_buf[0] = cset->range_starts[i];
- cmp_buf[4] = cset->range_ends[i];
- if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
- && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+ if (cset->range_starts[i] <= wc
+ && wc <= cset->range_ends[i])
{
match_len = char_len;
goto check_node_accept_bytes_match;