This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[RFC/RFA] Patches to speed up . (period)


The top time-eater part of regular expression matching is currently (and by a large amount) the handling of subexpressions. When they are found, the matcher goes through the string twice: once operating as a DFA, with a cost that is O(string length) and once operating as a "backwards NFA" with a worst-case cost of O(string length * NFA states) and with a big constant in front of it.

This constant can be lowered a bit by avoiding to go through the expensive multi-byte paths. Single-byte searches do this by mistake, because OP_PERIOD is always passed through check_node_accept_bytes even in SBCS, and we can easily fix this: this is the purpose of patch 1 of the three attached files, which I had already submitted a prototype of.

Other searches need to do this, but an important exception is that of UTF-8 searches that were optimized. To do so, we can lower the UTF-8 period to [00-7f]|[c2-fd][80-bf]+, or to a more complicated regex that only passes valid UTF-8 encodings. Patch 2 implements the more precise approach, which is what OP_UTF8_PERIOD token type in current CVS; patch 3 uses the simpler regex (anyway both current CVS and patch 3 accept surrogate low/high characters which are invalid UTF-8).

The DFA performance of the two patches is comparable, but patch 3's NFA behavior is much better because the automaton has fewer states.

To time the patches, I've made a 48MB file from "find /" on my machine, and passed it through 5 matchers:

1) current CVS
2) patch 1
3) patch 1+2 (precise UTF-8 optimization)
4) patch 1+3 (faster UTF-8 optimization)
5) PCRE, based on a very fast syntax-directed matcher.

For each matcher, I made 3 tests in the C locale and three tests in the it_IT.UTF-8 locale. I must say that PCRE knows very little about MBCS (many optimization it does are still applicable to UTF-8, though, and sed does induce some overhead in the UTF-8 cases too.

1) s,\(.*\)/\(.*\),\1 \2,
2) s,.*/,/\n, ; s,/\n, ,
3) s,.*/,/\n, ; s,.\n, ,

The result is the same, but scripts 2/3 do not employ subexpressions. Script 2 allows the fastmap to do more work. This is faster in glibc's DFA matcher but slower in PCRE's syntax-directed matcher. The timings are mixed, and that's why I presented the patches separately:

C locale C locale C locale UTF-8 UTF-8 UTF-8
script 1 script 2 script 3 script 1 script 2 script 3
CVS 86.21 7.88 16.02 101.30 24.93 40.66
patch 1 79.42 7.91 15.96 102.53 25.33 41.42 patch 1+2 79.30 7.91 16.41 158.43 17.81 25.84
patch 1+3 79.64 7.68 15.83 108.59 17.86 25.72
PCRE 4.60 5.08 10.59 11.26 15.07 20.05


Patches 2 and 3 have the same improvement on DFA-only queries, but the timing of patch 3 in the UTF-8 NFA query is awful. Patch 1 does improve C locale searches, but causes a 1% decrease of performance in UTF-8 searches. This is expected since I did not mean patch 1 to be applied independently.

However, given this numbers my preference would be to apply patch 1 only, since it does yield some wins, and wait if I can speed up grouping subexpressions until patch 2 is not a loss anymore (or at least the loss is balanced by the gains as is the case for patch 3 now). Patch 1+3 is also good for me, since the differences are localized in replace_utf8_period_node; you can make a choice.

All patches tested against the glibc and sed testsuites on powerpc-linux.

Paolo

2004-12-06  Paolo Bonzini  <bonzini@gnu.org>

	* regcomp.c (optimize_utf8_tree, replace_with_utf8_period,
	utf8_maps): New.
	(optimize_utf8): Use optimize_utf8_tree to optimize the parse tree.
	Do not handle SIMPLE_BRACKET, it is already done in
	parse_bracket_exp.
	(re_compile_fastmap_iter, calc_first): Do not handle
	OP_UTF8_PERIOD.
	* regex_internal.h (re_token_type_t): Do not define
	OP_UTF8_PERIOD.
	* regexec.c (group_nodes_into_DFAstates, check_node_accept,
	check_node_accept_bytes): Do not handle OP_UTF8_PERIOD.
	
--- orig/lib/regcomp.c
+++ mod/lib/regcomp.c
@@ -32,6 +32,7 @@ static void free_workarea_compile (regex
 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
 #ifdef RE_ENABLE_I18N
 static void optimize_utf8 (re_dfa_t *dfa);
+static bin_tree_t *optimize_utf8_tree (re_dfa_t *dfa, bin_tree_t *node);
 #endif
 struct subexp_optimize
 {
@@ -123,6 +124,7 @@ static reg_errcode_t build_charclass (un
 				      int *char_class_alloc,
 				      const unsigned char *class_name,
 				      reg_syntax_t syntax);
+static bin_tree_t *replace_with_utf8_period (re_dfa_t *dfa, bin_tree_t *node);
 #else  /* not RE_ENABLE_I18N */
 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 					const unsigned char *name);
@@ -418,9 +420,6 @@ re_compile_fastmap_iter (bufp, init_stat
 	}
 #endif /* RE_ENABLE_I18N */
       else if (type == OP_PERIOD
-#ifdef RE_ENABLE_I18N
-	       || type == OP_UTF8_PERIOD
-#endif /* RE_ENABLE_I18N */
 	       || type == END_OF_RE)
 	{
 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
@@ -579,15 +578,28 @@ weak_alias (__regerror, regerror)
    UTF-8 is used.  Otherwise we would allocate memory just to initialize
    it the same all the time.  UTF-8 is the preferred encoding so this is
    a worthwhile optimization.  */
+# if UINT_MAX == 0xffffffff
 static const bitset utf8_sb_map =
 {
   /* Set the first 128 bits.  */
-# if UINT_MAX == 0xffffffff
   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+static const bitset utf8_maps[9] =
+{
+  { 0, 0, 0, 0, 0, 0, 0xfffffffc, 0 },			  /* 0xc2-0xdf */
+  { 0, 0, 0, 0, 0, 0xffffffff, 0, 0 },			  /* 0xa0-0xbf */
+  { 0, 0, 0, 0, 0, 0, 0, 0xfffe },                        /* 0xe1-0xef */
+  { 0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0 },		  /* 0x90-0xbf */
+  { 0, 0, 0, 0, 0, 0, 0, 0xfe0000 },			  /* 0xf1-0xf7 */
+  { 0, 0, 0, 0, 0xffffff00, 0xffffffff, 0, 0 },		  /* 0x88-0xbf */
+  { 0, 0, 0, 0, 0, 0, 0, 0xe000000 },			  /* 0xf9-0xfb */
+  { 0, 0, 0, 0, 0xfffffff0, 0xffffffff, 0, 0 },		  /* 0x84-0xbf */
+  { 0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0 },		  /* 0x80-0xbf */
+};
 # else
 #  error "Add case for new unsigned int size"
 # endif
-};
 #endif
 
 
@@ -1067,23 +1079,38 @@ create_initial_state (dfa)
 }
 
 #ifdef RE_ENABLE_I18N
-/* If it is possible to do searching in single byte encoding instead of UTF-8
-   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
-   DFA nodes where needed.  */
+/* If it is possible to do searching in single byte encoding instead of UTF-8 to
+   speed things up, set dfa->mb_cur_max to 1, and change DFA nodes where needed.  */
+
+static bin_tree_t *
+optimize_utf8_tree (dfa, node)
+     re_dfa_t *dfa;
+     bin_tree_t *node;
+{
+  if (node->type == NON_TYPE
+      && dfa->nodes[node->node_idx].type == OP_PERIOD)
+    return replace_with_utf8_period (dfa, node);
+
+  else
+    {
+      if (node->left)
+	node->left = optimize_utf8_tree (dfa, node->left);
+      if (node->right)
+	node->right = optimize_utf8_tree (dfa, node->right);
+    }
+
+  return node;
+}
 
 static void
 optimize_utf8 (dfa)
      re_dfa_t *dfa;
 {
-  int node, i, mb_chars = 0, has_period = 0;
+  int node, has_period = 0;
 
   for (node = 0; node < dfa->nodes_len; ++node)
     switch (dfa->nodes[node].type)
       {
-      case CHARACTER:
-	if (dfa->nodes[node].opr.c >= 0x80)
-	  mb_chars = 1;
-	break;
       case ANCHOR:
 	switch (dfa->nodes[node].opr.idx)
 	  {
@@ -1100,6 +1127,7 @@ optimize_utf8 (dfa)
       case OP_PERIOD:
         has_period = 1;
         break;
+      case CHARACTER:
       case OP_BACK_REF:
       case OP_ALT:
       case END_OF_RE:
@@ -1107,31 +1135,148 @@ optimize_utf8 (dfa)
       case OP_DUP_QUESTION:
       case OP_OPEN_SUBEXP:
       case OP_CLOSE_SUBEXP:
-	break;
       case SIMPLE_BRACKET:
-	/* Just double check.  */
-        for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
-	  if (dfa->nodes[node].opr.sbcset[i])
-	    return;
 	break;
       default:
 	return;
       }
 
-  if (mb_chars || has_period)
-    for (node = 0; node < dfa->nodes_len; ++node)
-      {
-	if (dfa->nodes[node].type == CHARACTER
-	    && dfa->nodes[node].opr.c >= 0x80)
-	  dfa->nodes[node].mb_partial = 0;
-	else if (dfa->nodes[node].type == OP_PERIOD)
-	  dfa->nodes[node].type = OP_UTF8_PERIOD;
-      }
+  if (has_period)
+    dfa->str_tree = optimize_utf8_tree (dfa, dfa->str_tree);
 
   /* The search can be in single byte locale.  */
   dfa->mb_cur_max = 1;
   dfa->is_utf8 = 0;
-  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
+  dfa->has_mb_node = dfa->nbackref > 0;
+}
+
+/* Build a tree for
+   [00-7f]
+   |[c2-df][80-bf]
+   |    e0 [a0-bf][80-bf]
+   |[e1-ef][80-bf][80-bf]
+   |    f0 [90-bf][80-bf][80-bf]
+   |[f1-f7][80-bf][80-bf][80-bf]
+   |    f8 [88-bf][80-bf][80-bf][80-bf]
+   |[f9-fb][80-bf][80-bf][80-bf][80-bf]
+   |    fc [84-bf][80-bf][80-bf][80-bf][80-bf]
+   |    fd [80-bf][80-bf][80-bf][80-bf][80-bf]
+
+   This regular expression is factored for efficiency into this form:
+
+   ((((fd [80-bf] | fc [84-bf] | [f9-fb]) [80-bf] |
+       f8 [88-bf] | [f1-f7]) [80-bf] |
+       f0 [90-bf] | [e1-ef]) [80-bf] |
+       e0 [a0-bf] | [c2-df]) [80-bf] |
+   [00-7f]
+*/
+static bin_tree_t *
+replace_with_utf8_period (dfa, node)
+     re_dfa_t *dfa;
+     bin_tree_t *node;
+{
+  bin_tree_t *brackets[9], *sb_tree, *alt_tree;
+  bin_tree_t *begin[9], *char_tree;
+  re_token_t token;
+  int i, node_idx;
+  char c;
+
+  node_idx = node->node_idx;
+  assert (dfa->nodes[node_idx].type == OP_PERIOD);
+
+  /* Build trees for simple brackets.  */
+  token.type = SIMPLE_BRACKET;
+  token.opr.sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
+  if (BE (token.opr.sbcset == NULL, 0))
+    return NULL;
+  memcpy (token.opr.sbcset, utf8_sb_map, sizeof (utf8_sb_map));
+  if (!(dfa->syntax & RE_DOT_NEWLINE))
+    bitset_clear (token.opr.sbcset, '\n');
+  if (dfa->syntax & RE_DOT_NOT_NULL)
+    bitset_clear (token.opr.sbcset, '\0');
+  sb_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &token);
+  if (BE (sb_tree == NULL, 0))
+    return NULL;
+
+  for (i = 0; i < 9; i++)
+    {
+      token.opr.sbcset = (re_bitset_ptr_t) &utf8_maps[i];
+      brackets[i] = re_dfa_add_tree_node (dfa, NULL, NULL, &token);
+      dfa->nodes[brackets[i]->node_idx].duplicated = 1;
+      if (BE (brackets[i] == NULL, 0))
+        return NULL;
+    }
+
+  token.type = CHARACTER;
+  token.mb_partial = 0;
+
+  for (c = 0xe0, i = 0; i < 8; i += 2, c = (c >> 1) | 0x80)
+    {
+      token.opr.c = c;
+      char_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &token);
+      if (BE (char_tree == NULL, 0))
+        return NULL;
+
+      begin[i] = brackets[i];
+      begin[i+1] = create_tree (dfa, char_tree, brackets[i+1], CONCAT, 0);
+      if (BE (begin[i+1] == NULL, 0))
+        return NULL;
+    }
+
+  /* begin[8] is a bit different, because it is not a range.  */
+  token.opr.c = 0xfd;
+  begin[8] = re_dfa_add_tree_node (dfa, NULL, NULL, &token);
+  if (BE (begin[8] == NULL, 0))
+    return NULL;
+
+  begin[8] = create_tree (dfa, begin[8], brackets[8], CONCAT, 0);
+  if (BE (begin[8] == NULL, 0))
+    return NULL;
+
+  /* Before this loop, the content of BRACKETS and BEGIN is as follows:
+     brackets[0] = [c2-df], begin[0] = [c2-df]      1 tail byte needed
+     brackets[1] = [a0-bf], begin[1] = e0 [a0-bf]   1 tail byte needed
+     brackets[2] = [e1-ef], begin[2] = [e1-ef]      2 tail bytes needed
+     brackets[3] = [90-bf], begin[3] = f0 [90-bf]   2 tail bytes needed
+     brackets[4] = [f1-f7], begin[4] = [f1-f7]      3 tail bytes needed
+     brackets[5] = [88-bf], begin[5] = f8 [88-bf]   3 tail bytes needed
+     brackets[6] = [f9-fb], begin[6] = [f9-fb]      4 tail bytes needed
+     brackets[7] = [84-bf], begin[7] = fc [84-bf]   4 tail bytes needed
+     brackets[8] = [80-bf], begin[8] = fd [80-bf]   4 tail bytes needed
+
+     Create the full regular expression.  */
+  for (i = 6; i >= 0; i -= 2)
+    {
+      token.type = OP_ALT;
+      begin[i] = re_dfa_add_tree_node (dfa, begin[i], begin[i+1], &token);
+      if (BE (begin[i] == NULL, 0))
+        return NULL;
+
+      begin[i] = re_dfa_add_tree_node (dfa, begin[i], begin[i+2], &token);
+      if (BE (begin[i] == NULL, 0))
+        return NULL;
+
+      /* We need to use a different node for brackets[8] each time! */
+      token.type = SIMPLE_BRACKET;
+      token.opr.sbcset = (re_bitset_ptr_t) &utf8_maps[8];
+      brackets[8] = re_dfa_add_tree_node (dfa, NULL, NULL, &token);
+      dfa->nodes[brackets[8]->node_idx].duplicated = 1;
+      if (BE (brackets[8] == NULL, 0))
+        return NULL;
+
+      begin[i] = create_tree (dfa, begin[i], brackets[8], CONCAT, 0);
+      if (BE (begin[i] == NULL, 0))
+        return NULL;
+    }
+
+  /* TODO: this is hideous.  Parsing and creating DFA nodes should be
+     separated.  */
+  dfa->nodes[node_idx].type = OP_ALT;
+  dfa->nodes[node_idx].accept_mb = 0;
+  dfa->has_plural_match = 1;
+  alt_tree = create_tree (dfa, sb_tree, begin[0], 0, node_idx);
+  alt_tree->parent = node->parent;
+  return alt_tree;
 }
 #endif
 
@@ -1316,7 +1461,6 @@ calc_first (dfa, node)
     case OP_DUP_ASTERISK:
     case OP_DUP_QUESTION:
 #ifdef RE_ENABLE_I18N
-    case OP_UTF8_PERIOD:
     case COMPLEX_BRACKET:
 #endif /* RE_ENABLE_I18N */
     case SIMPLE_BRACKET:


--- orig/lib/regex_internal.h
+++ mod/lib/regex_internal.h
@@ -176,7 +176,6 @@ typedef enum
   OP_PERIOD = 5,
 #ifdef RE_ENABLE_I18N
   COMPLEX_BRACKET = 6,
-  OP_UTF8_PERIOD = 7,
 #endif /* RE_ENABLE_I18N */
 
   /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used


--- orig/lib/regexec.c
+++ mod/lib/regexec.c
@@ -3533,16 +3533,6 @@ group_nodes_into_DFAstates (dfa, state, 
 	  if (dfa->syntax & RE_DOT_NOT_NULL)
 	    bitset_clear (accepts, '\0');
 	}
-#ifdef RE_ENABLE_I18N
-      else if (type == OP_UTF8_PERIOD)
-        {
-	  memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
-	  if (!(dfa->syntax & RE_DOT_NEWLINE))
-	    bitset_clear (accepts, '\n');
-	  if (dfa->syntax & RE_DOT_NOT_NULL)
-	    bitset_clear (accepts, '\0');
-        }
-#endif
       else
 	continue;
 
@@ -3692,57 +3682,6 @@ check_node_accept_bytes (dfa, node_idx, 
   int char_len, elem_len;
   int i;
 
-  if (BE (node->type == OP_UTF8_PERIOD, 0))
-    {
-      unsigned char c = re_string_byte_at (input, str_idx), d;
-      if (BE (c < 0xc2, 1))
-	return 0;
-
-      if (str_idx + 2 > input->len)
-	return 0;
-
-      d = re_string_byte_at (input, str_idx + 1);
-      if (c < 0xe0)
-	return (d < 0x80 || d > 0xbf) ? 0 : 2;
-      else if (c < 0xf0)
-	{
-	  char_len = 3;
-	  if (c == 0xe0 && d < 0xa0)
-	    return 0;
-	}
-      else if (c < 0xf8)
-	{
-	  char_len = 4;
-	  if (c == 0xf0 && d < 0x90)
-	    return 0;
-	}
-      else if (c < 0xfc)
-	{
-	  char_len = 5;
-	  if (c == 0xf8 && d < 0x88)
-	    return 0;
-	}
-      else if (c < 0xfe)
-	{
-	  char_len = 6;
-	  if (c == 0xfc && d < 0x84)
-	    return 0;
-	}
-      else
-	return 0;
-
-      if (str_idx + char_len > input->len)
-	return 0;
-
-      for (i = 1; i < char_len; ++i)
-	{
-	  d = re_string_byte_at (input, str_idx + i);
-	  if (d < 0x80 || d > 0xbf)
-	    return 0;
-	}
-      return char_len;
-    }
-
   char_len = re_string_char_size_at (input, str_idx);
   if (node->type == OP_PERIOD)
     {
@@ -4007,12 +3946,6 @@ check_node_accept (mctx, node, idx)
       return node->opr.c == ch;
     case SIMPLE_BRACKET:
       return bitset_contain (node->opr.sbcset, ch);
-#ifdef RE_ENABLE_I18N
-    case OP_UTF8_PERIOD:
-      if (ch >= 0x80)
-        return 0;
-      /* FALLTHROUGH */
-#endif
     case OP_PERIOD:
       return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
 	       || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));




2004-12-06  Paolo Bonzini  <bonzini@gnu.org>

	* lib/regcomp.c (parse_bracket_exp): Do not modify DFA nodes
	that were already created.
	* lib/regex_internal.c (re_dfa_add_node): Set accept_mb field
	in the token if needed.
	(create_ci_newstate, create_cd_newstate): Set accept_mb field
	from the tokens' field.
	* lib/regex_internal.h (re_token_t): Add accept_mb field.
	(ACCEPT_MB_NODE): Removed.
	* lib/regexec.c (proceed_next_node, transit_states_mb,
	build_sifted_states, check_arrival_add_next_nodes): Use
	accept_mb instead of ACCEPT_MB_NODE.
	
--- orig/lib/regcomp.c
+++ mod/lib/regcomp.c
@@ -3282,57 +3282,61 @@
   /* Ensure only single byte characters are set.  */
   if (dfa->mb_cur_max > 1)
     bitset_mask (sbcset, dfa->sb_char);
-#endif /* RE_ENABLE_I18N */
 
-  /* Build a tree for simple bracket.  */
-  br_token.type = SIMPLE_BRACKET;
-  br_token.opr.sbcset = sbcset;
-  work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
-  if (BE (work_tree == NULL, 0))
-    goto parse_bracket_exp_espace;
-
-#ifdef RE_ENABLE_I18N
   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
 						     || mbcset->non_match)))
     {
-      re_token_t alt_token;
       bin_tree_t *mbc_tree;
       int sbc_idx;
       /* Build a tree for complex bracket.  */
       dfa->has_mb_node = 1;
+      br_token.type = COMPLEX_BRACKET;
+      br_token.opr.mbcset = mbcset;
+      mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
+      if (BE (mbc_tree == NULL, 0))
+	goto parse_bracket_exp_espace;
       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
 	if (sbcset[sbc_idx])
 	  break;
       /* If there are no bits set in sbcset, there is no point
 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
-      if (sbc_idx == BITSET_UINTS)
+      if (sbc_idx < BITSET_UINTS)
+	{
+          re_token_t alt_token;
+          /* Build a tree for simple bracket.  */
+          br_token.type = SIMPLE_BRACKET;
+          br_token.opr.sbcset = sbcset;
+          work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
+          if (BE (work_tree == NULL, 0))
+            goto parse_bracket_exp_espace;
+
+          /* Then join them by ALT node.  */
+          alt_token.type = OP_ALT;
+          dfa->has_plural_match = 1;
+          work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
+          if (BE (work_tree == NULL, 0))
+            goto parse_bracket_exp_espace;
+	}
+      else
 	{
 	  re_free (sbcset);
-	  dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
-	  dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
-	  return work_tree;
+	  work_tree = mbc_tree;
 	}
-      br_token.type = COMPLEX_BRACKET;
-      br_token.opr.mbcset = mbcset;
-      mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
-      if (BE (mbc_tree == NULL, 0))
-	goto parse_bracket_exp_espace;
-      /* Then join them by ALT node.  */
-      alt_token.type = OP_ALT;
-      dfa->has_plural_match = 1;
-      work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
-      if (BE (mbc_tree != NULL, 1))
-	return work_tree;
     }
   else
     {
+      /* Build a tree for simple bracket.  */
+      br_token.type = SIMPLE_BRACKET;
+      br_token.opr.sbcset = sbcset;
+      work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
+      if (BE (work_tree == NULL, 0))
+        goto parse_bracket_exp_espace;
+
       free_charset (mbcset);
-      return work_tree;
     }
-#else /* not RE_ENABLE_I18N */
+#endif /* not RE_ENABLE_I18N */
   return work_tree;
-#endif /* not RE_ENABLE_I18N */
 
  parse_bracket_exp_espace:
   *err = REG_ESPACE;


--- orig/lib/regex_internal.c
+++ mod/lib/regex_internal.c
@@ -1340,6 +1338,7 @@
      re_token_t token;
      int mode;
 {
+  int type = token.type;
   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
     {
       int new_nodes_alloc = dfa->nodes_alloc * 2;
@@ -1376,6 +1375,10 @@
   dfa->nodes[dfa->nodes_len].opt_subexp = 0;
   dfa->nodes[dfa->nodes_len].duplicated = 0;
   dfa->nodes[dfa->nodes_len].constraint = 0;
+#ifdef RE_ENABLE_I18N
+  dfa->nodes[dfa->nodes_len].accept_mb =
+    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
+#endif
   return dfa->nodes_len++;
 }
 
@@ -1560,16 +1563,13 @@
       re_token_type_t type = node->type;
       if (type == CHARACTER && !node->constraint)
 	continue;
+#ifdef RE_ENABLE_I18N
+      newstate->accept_mb |= node->accept_mb;
+#endif /* RE_ENABLE_I18N */
 
       /* If the state has the halt node, the state is a halt state.  */
-      else if (type == END_OF_RE)
+      if (type == END_OF_RE)
 	newstate->halt = 1;
-#ifdef RE_ENABLE_I18N
-      else if (type == COMPLEX_BRACKET
-	       || type == OP_UTF8_PERIOD
-	       || (type == OP_PERIOD && dfa->mb_cur_max > 1))
-	newstate->accept_mb = 1;
-#endif /* RE_ENABLE_I18N */
       else if (type == OP_BACK_REF)
 	newstate->has_backref = 1;
       else if (type == ANCHOR || node->constraint)
@@ -1613,15 +1613,13 @@
 
       if (type == CHARACTER && !constraint)
 	continue;
+#ifdef RE_ENABLE_I18N
+      newstate->accept_mb |= node->accept_mb;
+#endif /* RE_ENABLE_I18N */
+
       /* If the state has the halt node, the state is a halt state.  */
-      else if (type == END_OF_RE)
+      if (type == END_OF_RE)
 	newstate->halt = 1;
-#ifdef RE_ENABLE_I18N
-      else if (type == COMPLEX_BRACKET
-	       || type == OP_UTF8_PERIOD
-	       || (type == OP_PERIOD && dfa->mb_cur_max > 1))
-	newstate->accept_mb = 1;
-#endif /* RE_ENABLE_I18N */
       else if (type == OP_BACK_REF)
 	newstate->has_backref = 1;
       else if (type == ANCHOR)


--- orig/lib/regex_internal.h
+++ mod/lib/regex_internal.h
@@ -284,6 +284,7 @@
   unsigned int duplicated : 1;
   unsigned int opt_subexp : 1;
 #ifdef RE_ENABLE_I18N
+  unsigned int accept_mb : 1;
   /* These 2 bits can be moved into the union if needed (e.g. if running out
      of bits; move opr.c to opr.c.c and move the flags to opr.c.flags).  */
   unsigned int mb_partial : 1;
@@ -292,8 +293,6 @@
 } re_token_t;
 
 #define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
-#define ACCEPT_MB_NODE(type) \
-  ((type) >= OP_PERIOD && (type) <= OP_UTF8_PERIOD)
 
 struct re_string_t
 {


--- orig/lib/regexec.c
+++ mod/lib/regexec.c
@@ -1254,7 +1254,7 @@
       re_token_type_t type = dfa->nodes[node].type;
 
 #ifdef RE_ENABLE_I18N
-      if (ACCEPT_MB_NODE (type))
+      if (dfa->nodes[node].accept_mb)
 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
       else
 #endif /* RE_ENABLE_I18N */
@@ -1620,7 +1620,7 @@
 	continue;
 #ifdef RE_ENABLE_I18N
       /* If the node may accept `multi byte'.  */
-      if (ACCEPT_MB_NODE (type))
+      if (dfa->nodes[prev_node].accept_mb)
 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
 					 str_idx, sctx->last_str_idx);
 #endif /* RE_ENABLE_I18N */
@@ -2476,7 +2476,7 @@
 	}
 
       /* How many bytes the node can accept?  */
-      if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
+      if (dfa->nodes[cur_node_idx].accept_mb)
 	naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
 					     re_string_cur_idx (&mctx->input));
       if (naccepted == 0)
@@ -3015,7 +3015,7 @@
 	continue;
 #ifdef RE_ENABLE_I18N
       /* If the node may accept `multi byte'.  */
-      if (ACCEPT_MB_NODE (type))
+      if (dfa->nodes[cur_node].accept_mb)
 	{
 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
 					       str_idx);



2004-12-06  Paolo Bonzini  <bonzini@gnu.org>

	* regcomp.c (optimize_utf8_tree, replace_with_utf8_period,
	utf8_maps): New.
	(optimize_utf8): Use optimize_utf8_tree to optimize the parse tree.
	Do not handle SIMPLE_BRACKET, it is already done in
	parse_bracket_exp.
	(re_compile_fastmap_iter, calc_first): Do not handle
	OP_UTF8_PERIOD.
	* regex_internal.h (re_token_type_t): Do not define
	OP_UTF8_PERIOD.
	* regexec.c (group_nodes_into_DFAstates, check_node_accept,
	check_node_accept_bytes): Do not handle OP_UTF8_PERIOD.
	
--- orig/lib/regcomp.c
+++ mod/lib/regcomp.c
@@ -32,6 +32,7 @@ static void free_workarea_compile (regex
 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
 #ifdef RE_ENABLE_I18N
 static void optimize_utf8 (re_dfa_t *dfa);
+static bin_tree_t *optimize_utf8_tree (re_dfa_t *dfa, bin_tree_t *node);
 #endif
 struct subexp_optimize
 {
@@ -123,6 +124,7 @@ static reg_errcode_t build_charclass (un
 				      int *char_class_alloc,
 				      const unsigned char *class_name,
 				      reg_syntax_t syntax);
+static bin_tree_t *replace_with_utf8_period (re_dfa_t *dfa, bin_tree_t *node);
 #else  /* not RE_ENABLE_I18N */
 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 					const unsigned char *name);
@@ -418,9 +420,6 @@ re_compile_fastmap_iter (bufp, init_stat
 	}
 #endif /* RE_ENABLE_I18N */
       else if (type == OP_PERIOD
-#ifdef RE_ENABLE_I18N
-	       || type == OP_UTF8_PERIOD
-#endif /* RE_ENABLE_I18N */
 	       || type == END_OF_RE)
 	{
 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
@@ -579,15 +578,21 @@ weak_alias (__regerror, regerror)
    UTF-8 is used.  Otherwise we would allocate memory just to initialize
    it the same all the time.  UTF-8 is the preferred encoding so this is
    a worthwhile optimization.  */
+# if UINT_MAX == 0xffffffff
 static const bitset utf8_sb_map =
 {
   /* Set the first 128 bits.  */
-# if UINT_MAX == 0xffffffff
   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+static const bitset utf8_maps[2] =
+{
+  { 0, 0, 0, 0, 0, 0, 0xfffffffc, 0x3fffffff },  /* 0xc2-0xfd */
+  { 0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0 },		  /* 0x80-0xbf */
+};
 # else
 #  error "Add case for new unsigned int size"
 # endif
-};
 #endif
 
 
@@ -1067,23 +1072,38 @@ create_initial_state (dfa)
 }
 
 #ifdef RE_ENABLE_I18N
-/* If it is possible to do searching in single byte encoding instead of UTF-8
-   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
-   DFA nodes where needed.  */
+/* If it is possible to do searching in single byte encoding instead of UTF-8 to
+   speed things up, set dfa->mb_cur_max to 1, and change DFA nodes where needed.  */
+
+static bin_tree_t *
+optimize_utf8_tree (dfa, node)
+     re_dfa_t *dfa;
+     bin_tree_t *node;
+{
+  if (node->type == NON_TYPE
+      && dfa->nodes[node->node_idx].type == OP_PERIOD)
+    return replace_with_utf8_period (dfa, node);
+
+  else
+    {
+      if (node->left)
+	node->left = optimize_utf8_tree (dfa, node->left);
+      if (node->right)
+	node->right = optimize_utf8_tree (dfa, node->right);
+    }
+
+  return node;
+}
 
 static void
 optimize_utf8 (dfa)
      re_dfa_t *dfa;
 {
-  int node, i, mb_chars = 0, has_period = 0;
+  int node, has_period = 0;
 
   for (node = 0; node < dfa->nodes_len; ++node)
     switch (dfa->nodes[node].type)
       {
-      case CHARACTER:
-	if (dfa->nodes[node].opr.c >= 0x80)
-	  mb_chars = 1;
-	break;
       case ANCHOR:
 	switch (dfa->nodes[node].opr.idx)
 	  {
@@ -1100,6 +1120,7 @@ optimize_utf8 (dfa)
       case OP_PERIOD:
         has_period = 1;
         break;
+      case CHARACTER:
       case OP_BACK_REF:
       case OP_ALT:
       case END_OF_RE:
@@ -1107,31 +1128,85 @@ optimize_utf8 (dfa)
       case OP_DUP_QUESTION:
       case OP_OPEN_SUBEXP:
       case OP_CLOSE_SUBEXP:
-	break;
       case SIMPLE_BRACKET:
-	/* Just double check.  */
-        for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
-	  if (dfa->nodes[node].opr.sbcset[i])
-	    return;
 	break;
       default:
 	return;
       }
 
-  if (mb_chars || has_period)
-    for (node = 0; node < dfa->nodes_len; ++node)
-      {
-	if (dfa->nodes[node].type == CHARACTER
-	    && dfa->nodes[node].opr.c >= 0x80)
-	  dfa->nodes[node].mb_partial = 0;
-	else if (dfa->nodes[node].type == OP_PERIOD)
-	  dfa->nodes[node].type = OP_UTF8_PERIOD;
-      }
+  if (has_period)
+    dfa->str_tree = optimize_utf8_tree (dfa, dfa->str_tree);
 
   /* The search can be in single byte locale.  */
   dfa->mb_cur_max = 1;
   dfa->is_utf8 = 0;
-  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
+  dfa->has_mb_node = dfa->nbackref > 0;
+}
+
+/* Build a tree for [00-7f]|[c2-fd][80-bf]+ to be used instead of NODE.  */
+static bin_tree_t *
+replace_with_utf8_period (dfa, node)
+     re_dfa_t *dfa;
+     bin_tree_t *node;
+{
+  bin_tree_t *sb_tree, *mb_tree, *first, *rest, *rest_ast, *alt_tree;
+  int node_idx;
+  re_token_t token;
+
+  node_idx = node->node_idx;
+  assert (dfa->nodes[node_idx].type == OP_PERIOD);
+
+  /* Build trees for simple brackets.  */
+  token.type = SIMPLE_BRACKET;
+  token.opr.sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
+  if (BE (token.opr.sbcset == NULL, 0))
+    return NULL;
+  memcpy (token.opr.sbcset, utf8_sb_map, sizeof (utf8_sb_map));
+  if (!(dfa->syntax & RE_DOT_NEWLINE))
+    bitset_clear (token.opr.sbcset, '\n');
+  if (dfa->syntax & RE_DOT_NOT_NULL)
+    bitset_clear (token.opr.sbcset, '\0');
+  sb_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &token);
+  if (BE (sb_tree == NULL, 0))
+    return NULL;
+
+  token.opr.sbcset = (re_bitset_ptr_t) &utf8_maps[0];
+  first = re_dfa_add_tree_node (dfa, NULL, NULL, &token);
+  if (BE (first == NULL, 0))
+    return NULL;
+  dfa->nodes[first->node_idx].duplicated = 1;
+
+  token.opr.sbcset = (re_bitset_ptr_t) &utf8_maps[1];
+  rest = re_dfa_add_tree_node (dfa, NULL, NULL, &token);
+  if (BE (rest == NULL, 0))
+    return NULL;
+  dfa->nodes[rest->node_idx].duplicated = 1;
+
+  mb_tree = create_tree (dfa, first, rest, CONCAT, 0);
+  if (BE (mb_tree == NULL, 0))
+    return NULL;
+
+  rest = re_dfa_add_tree_node (dfa, NULL, NULL, &token);
+  if (BE (rest == NULL, 0))
+    return NULL;
+  dfa->nodes[rest->node_idx].duplicated = 1;
+  token.type = OP_DUP_ASTERISK;
+  rest_ast = re_dfa_add_tree_node (dfa, rest, NULL, &token);
+  if (BE (rest == NULL, 0))
+    return NULL;
+
+  mb_tree = create_tree (dfa, mb_tree, rest_ast, CONCAT, 0);
+  if (BE (mb_tree == NULL, 0))
+    return NULL;
+
+  /* TODO: this is hideous.  Parsing and creating DFA nodes should be
+     separated.  */
+  dfa->nodes[node_idx].type = OP_ALT;
+  dfa->nodes[node_idx].accept_mb = 0;
+  dfa->has_plural_match = 1;
+  alt_tree = create_tree (dfa, sb_tree, mb_tree, 0, node_idx);
+  alt_tree->parent = node->parent;
+  return alt_tree;
 }
 #endif
 
@@ -1316,7 +1410,6 @@ calc_first (dfa, node)
     case OP_DUP_ASTERISK:
     case OP_DUP_QUESTION:
 #ifdef RE_ENABLE_I18N
-    case OP_UTF8_PERIOD:
     case COMPLEX_BRACKET:
 #endif /* RE_ENABLE_I18N */
     case SIMPLE_BRACKET:


--- orig/lib/regex_internal.h
+++ mod/lib/regex_internal.h
@@ -176,7 +176,6 @@ typedef enum
   OP_PERIOD = 5,
 #ifdef RE_ENABLE_I18N
   COMPLEX_BRACKET = 6,
-  OP_UTF8_PERIOD = 7,
 #endif /* RE_ENABLE_I18N */
 
   /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used


--- orig/lib/regexec.c
+++ mod/lib/regexec.c
@@ -3533,16 +3533,6 @@ group_nodes_into_DFAstates (dfa, state, 
 	  if (dfa->syntax & RE_DOT_NOT_NULL)
 	    bitset_clear (accepts, '\0');
 	}
-#ifdef RE_ENABLE_I18N
-      else if (type == OP_UTF8_PERIOD)
-        {
-	  memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
-	  if (!(dfa->syntax & RE_DOT_NEWLINE))
-	    bitset_clear (accepts, '\n');
-	  if (dfa->syntax & RE_DOT_NOT_NULL)
-	    bitset_clear (accepts, '\0');
-        }
-#endif
       else
 	continue;
 
@@ -3692,57 +3682,6 @@ check_node_accept_bytes (dfa, node_idx, 
   int char_len, elem_len;
   int i;
 
-  if (BE (node->type == OP_UTF8_PERIOD, 0))
-    {
-      unsigned char c = re_string_byte_at (input, str_idx), d;
-      if (BE (c < 0xc2, 1))
-	return 0;
-
-      if (str_idx + 2 > input->len)
-	return 0;
-
-      d = re_string_byte_at (input, str_idx + 1);
-      if (c < 0xe0)
-	return (d < 0x80 || d > 0xbf) ? 0 : 2;
-      else if (c < 0xf0)
-	{
-	  char_len = 3;
-	  if (c == 0xe0 && d < 0xa0)
-	    return 0;
-	}
-      else if (c < 0xf8)
-	{
-	  char_len = 4;
-	  if (c == 0xf0 && d < 0x90)
-	    return 0;
-	}
-      else if (c < 0xfc)
-	{
-	  char_len = 5;
-	  if (c == 0xf8 && d < 0x88)
-	    return 0;
-	}
-      else if (c < 0xfe)
-	{
-	  char_len = 6;
-	  if (c == 0xfc && d < 0x84)
-	    return 0;
-	}
-      else
-	return 0;
-
-      if (str_idx + char_len > input->len)
-	return 0;
-
-      for (i = 1; i < char_len; ++i)
-	{
-	  d = re_string_byte_at (input, str_idx + i);
-	  if (d < 0x80 || d > 0xbf)
-	    return 0;
-	}
-      return char_len;
-    }
-
   char_len = re_string_char_size_at (input, str_idx);
   if (node->type == OP_PERIOD)
     {
@@ -4007,12 +3946,6 @@ check_node_accept (mctx, node, idx)
       return node->opr.c == ch;
     case SIMPLE_BRACKET:
       return bitset_contain (node->opr.sbcset, ch);
-#ifdef RE_ENABLE_I18N
-    case OP_UTF8_PERIOD:
-      if (ch >= 0x80)
-        return 0;
-      /* FALLTHROUGH */
-#endif
     case OP_PERIOD:
       return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
 	       || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));




Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]