regex/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56      internal_function;
  57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                            int *p_match_first) internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes)
  69      internal_function;
  70 static reg_errcode_t set_regs (const regex_t *preg,
  71                                const re_match_context_t *mctx,
  72                                size_t nmatch, regmatch_t *pmatch,
  73                                int fl_backtrack) internal_function;
  74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75      internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 int node_idx, int str_idx, int max_str_idx)
  81      internal_function;
  82 #endif /* RE_ENABLE_I18N */
  83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                            re_sift_context_t *sctx)
  85      internal_function;
  86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                           re_sift_context_t *sctx, int str_idx,
  88                                           re_node_set *cur_dest)
  89      internal_function;
  90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                               re_sift_context_t *sctx,
  92                                               int str_idx,
  93                                               re_node_set *dest_nodes)
  94      internal_function;
  95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                             re_node_set *dest_nodes,
  97                                             const re_node_set *candidates)
  98      internal_function;
  99 static int check_dst_limits (const re_match_context_t *mctx,
 100                              re_node_set *limits,
 101                              int dst_node, int dst_idx, int src_node,
 102                              int src_idx) internal_function;
 103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                         int boundaries, int subexp_idx,
 105                                         int from_node, int bkref_idx)
 106      internal_function;
 107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                       int limit, int subexp_idx,
 109                                       int node, int str_idx,
 110                                       int bkref_idx) internal_function;
 111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                           re_node_set *dest_nodes,
 113                                           const re_node_set *candidates,
 114                                           re_node_set *limits,
 115                                           struct re_backref_cache_entry *bkref_ents,
 116                                           int str_idx) internal_function;
 117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                         re_sift_context_t *sctx,
 119                                         int str_idx, const re_node_set *candidates)
 120      internal_function;
 121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                         re_dfastate_t **dst,
 123                                         re_dfastate_t **src, int num)
 124      internal_function;
 125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                          re_match_context_t *mctx) internal_function;
 127 static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                      re_match_context_t *mctx,
 129                                      re_dfastate_t *state) internal_function;
 130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                             re_match_context_t *mctx,
 132                                             re_dfastate_t *next_state)
 133      internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate)
 141      internal_function;
 142 #endif
 143 #ifdef RE_ENABLE_I18N
 144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                        re_dfastate_t *pstate)
 146      internal_function;
 147 #endif /* RE_ENABLE_I18N */
 148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                           const re_node_set *nodes)
 150      internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx)
 153      internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str)
 158      internal_function;
 159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                              int subexp_idx, int type) internal_function;
 161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                     state_array_t *path, int top_node,
 163                                     int top_str, int last_node, int last_str,
 164                                     int type) internal_function;
 165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                    int str_idx,
 167                                                    re_node_set *cur_nodes,
 168                                                    re_node_set *next_nodes)
 169      internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type)
 173      internal_function;
 174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                    re_node_set *dst_nodes,
 176                                                    int target, int ex_subexp,
 177                                                    int type) internal_function;
 178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                          re_node_set *cur_nodes, int cur_str,
 180                                          int subexp_num, int type)
 181      internal_function;
 182 static int build_trtable (const re_dfa_t *dfa,
 183                           re_dfastate_t *state) internal_function;
 184 #ifdef RE_ENABLE_I18N
 185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                     const re_string_t *input, int idx)
 187      internal_function;
 188 # ifdef _LIBC
 189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                    size_t name_len)
 191      internal_function;
 192 # endif /* _LIBC */
 193 #endif /* RE_ENABLE_I18N */
 194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                        const re_dfastate_t *state,
 196                                        re_node_set *states_node,
 197                                        bitset_t *states_ch) internal_function;
 198 static int check_node_accept (const re_match_context_t *mctx,
 199                               const re_token_t *node, int idx)
 200      internal_function;
 201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202      internal_function;
 203 \f
 204 /* Entry point for POSIX code.  */
 205
 206 /* regexec searches for a given pattern, specified by PREG, in the
 207    string STRING.
 208
 209    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211    least NMATCH elements, and we set them to the offsets of the
 212    corresponding matched substrings.
 213
 214    EFLAGS specifies `execution flags' which affect matching: if
 215    REG_NOTBOL is set, then ^ does not match at the beginning of the
 216    string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218    We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220 int
 221 regexec (preg, string, nmatch, pmatch, eflags)
 222     const regex_t *__restrict preg;
 223     const char *__restrict string;
 224     size_t nmatch;
 225     regmatch_t pmatch[];
 226     int eflags;
 227 {
 228   reg_errcode_t err;
 229   int start, length;
 230 #ifdef _LIBC
 231   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 232 #endif
 233
 234   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 235     return REG_BADPAT;
 236
 237   if (eflags & REG_STARTEND)
 238     {
 239       start = pmatch[0].rm_so;
 240       length = pmatch[0].rm_eo;
 241     }
 242   else
 243     {
 244       start = 0;
 245       length = strlen (string);
 246     }
 247
 248   __libc_lock_lock (dfa->lock);
 249   if (preg->no_sub)
 250     err = re_search_internal (preg, string, length, start, length - start,
 251                               length, 0, NULL, eflags);
 252   else
 253     err = re_search_internal (preg, string, length, start, length - start,
 254                               length, nmatch, pmatch, eflags);
 255   __libc_lock_unlock (dfa->lock);
 256   return err != REG_NOERROR;
 257 }
 258
 259 #ifdef _LIBC
 260 # include <shlib-compat.h>
 261 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 262
 263 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 264 __typeof__ (__regexec) __compat_regexec;
 265
 266 int
 267 attribute_compat_text_section
 268 __compat_regexec (const regex_t *__restrict preg,
 269                   const char *__restrict string, size_t nmatch,
 270                   regmatch_t pmatch[], int eflags)
 271 {
 272   return regexec (preg, string, nmatch, pmatch,
 273                   eflags & (REG_NOTBOL | REG_NOTEOL));
 274 }
 275 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 276 # endif
 277 #endif
 278
 279 /* Entry points for GNU code.  */
 280
 281 /* re_match, re_search, re_match_2, re_search_2
 282
 283    The former two functions operate on STRING with length LENGTH,
 284    while the latter two operate on concatenation of STRING1 and STRING2
 285    with lengths LENGTH1 and LENGTH2, respectively.
 286
 287    re_match() matches the compiled pattern in BUFP against the string,
 288    starting at index START.
 289
 290    re_search() first tries matching at index START, then it tries to match
 291    starting from index START + 1, and so on.  The last start position tried
 292    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 293    way as re_match().)
 294
 295    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 296    the first STOP characters of the concatenation of the strings should be
 297    considered.
 298
 299    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 300    and all groups are stored in REGS.  (For the "_2" variants, the offsets are
 301    computed relative to the concatenation, not relative to the individual
 302    strings.)
 303
 304    On success, re_match* functions return the length of the match, re_search*
 305    return the position of the start of the match.  Return value -1 means no
 306    match was found and -2 indicates an internal error.  */
 307
 308 int
 309 re_match (bufp, string, length, start, regs)
 310     struct re_pattern_buffer *bufp;
 311     const char *string;
 312     int length, start;
 313     struct re_registers *regs;
 314 {
 315   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 316 }
 317 #ifdef _LIBC
 318 weak_alias (__re_match, re_match)
 319 #endif
 320
 321 int
 322 re_search (bufp, string, length, start, range, regs)
 323     struct re_pattern_buffer *bufp;
 324     const char *string;
 325     int length, start, range;
 326     struct re_registers *regs;
 327 {
 328   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 329 }
 330 #ifdef _LIBC
 331 weak_alias (__re_search, re_search)
 332 #endif
 333
 334 int
 335 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 336     struct re_pattern_buffer *bufp;
 337     const char *string1, *string2;
 338     int length1, length2, start, stop;
 339     struct re_registers *regs;
 340 {
 341   return re_search_2_stub (bufp, string1, length1, string2, length2,
 342                            start, 0, regs, stop, 1);
 343 }
 344 #ifdef _LIBC
 345 weak_alias (__re_match_2, re_match_2)
 346 #endif
 347
 348 int
 349 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 350     struct re_pattern_buffer *bufp;
 351     const char *string1, *string2;
 352     int length1, length2, start, range, stop;
 353     struct re_registers *regs;
 354 {
 355   return re_search_2_stub (bufp, string1, length1, string2, length2,
 356                            start, range, regs, stop, 0);
 357 }
 358 #ifdef _LIBC
 359 weak_alias (__re_search_2, re_search_2)
 360 #endif
 361
 362 static int
 363 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 364                   stop, ret_len)
 365     struct re_pattern_buffer *bufp;
 366     const char *string1, *string2;
 367     int length1, length2, start, range, stop, ret_len;
 368     struct re_registers *regs;
 369 {
 370   const char *str;
 371   int rval;
 372   int len = length1 + length2;
 373   int free_str = 0;
 374
 375   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 376     return -2;
 377
 378   /* Concatenate the strings.  */
 379   if (length2 > 0)
 380     if (length1 > 0)
 381       {
 382         char *s = re_malloc (char, len);
 383
 384         if (BE (s == NULL, 0))
 385           return -2;
 386 #ifdef _LIBC
 387         memcpy (__mempcpy (s, string1, length1), string2, length2);
 388 #else
 389         memcpy (s, string1, length1);
 390         memcpy (s + length1, string2, length2);
 391 #endif
 392         str = s;
 393         free_str = 1;
 394       }
 395     else
 396       str = string2;
 397   else
 398     str = string1;
 399
 400   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 401                          ret_len);
 402   if (free_str)
 403     re_free ((char *) str);
 404   return rval;
 405 }
 406
 407 /* The parameters have the same meaning as those of re_search.
 408    Additional parameters:
 409    If RET_LEN is nonzero the length of the match is returned (re_match style);
 410    otherwise the position of the match is returned.  */
 411
 412 static int
 413 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 414     struct re_pattern_buffer *bufp;
 415     const char *string;
 416     int length, start, range, stop, ret_len;
 417     struct re_registers *regs;
 418 {
 419   reg_errcode_t result;
 420   regmatch_t *pmatch;
 421   int nregs, rval;
 422   int eflags = 0;
 423 #ifdef _LIBC
 424   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 425 #endif
 426
 427   /* Check for out-of-range.  */
 428   if (BE (start < 0 || start > length, 0))
 429     return -1;
 430   if (BE (start + range > length, 0))
 431     range = length - start;
 432   else if (BE (start + range < 0, 0))
 433     range = -start;
 434
 435   __libc_lock_lock (dfa->lock);
 436
 437   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 438   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 439
 440   /* Compile fastmap if we haven't yet.  */
 441   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 442     re_compile_fastmap (bufp);
 443
 444   if (BE (bufp->no_sub, 0))
 445     regs = NULL;
 446
 447   /* We need at least 1 register.  */
 448   if (regs == NULL)
 449     nregs = 1;
 450   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 451                regs->num_regs < bufp->re_nsub + 1, 0))
 452     {
 453       nregs = regs->num_regs;
 454       if (BE (nregs < 1, 0))
 455         {
 456           /* Nothing can be copied to regs.  */
 457           regs = NULL;
 458           nregs = 1;
 459         }
 460     }
 461   else
 462     nregs = bufp->re_nsub + 1;
 463   pmatch = re_malloc (regmatch_t, nregs);
 464   if (BE (pmatch == NULL, 0))
 465     {
 466       rval = -2;
 467       goto out;
 468     }
 469
 470   result = re_search_internal (bufp, string, length, start, range, stop,
 471                                nregs, pmatch, eflags);
 472
 473   rval = 0;
 474
 475   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 476   if (result != REG_NOERROR)
 477     rval = -1;
 478   else if (regs != NULL)
 479     {
 480       /* If caller wants register contents data back, copy them.  */
 481       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 482                                            bufp->regs_allocated);
 483       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 484         rval = -2;
 485     }
 486
 487   if (BE (rval == 0, 1))
 488     {
 489       if (ret_len)
 490         {
 491           assert (pmatch[0].rm_so == start);
 492           rval = pmatch[0].rm_eo - start;
 493         }
 494       else
 495         rval = pmatch[0].rm_so;
 496     }
 497   re_free (pmatch);
 498  out:
 499   __libc_lock_unlock (dfa->lock);
 500   return rval;
 501 }
 502
 503 static unsigned
 504 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 505     struct re_registers *regs;
 506     regmatch_t *pmatch;
 507     int nregs, regs_allocated;
 508 {
 509   int rval = REGS_REALLOCATE;
 510   int i;
 511   int need_regs = nregs + 1;
 512   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 513      uses.  */
 514
 515   /* Have the register data arrays been allocated?  */
 516   if (regs_allocated == REGS_UNALLOCATED)
 517     { /* No.  So allocate them with malloc.  */
 518       regs->start = re_malloc (regoff_t, need_regs);
 519       regs->end = re_malloc (regoff_t, need_regs);
 520       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 521         return REGS_UNALLOCATED;
 522       regs->num_regs = need_regs;
 523     }
 524   else if (regs_allocated == REGS_REALLOCATE)
 525     { /* Yes.  If we need more elements than were already
 526          allocated, reallocate them.  If we need fewer, just
 527          leave it alone.  */
 528       if (BE (need_regs > regs->num_regs, 0))
 529         {
 530           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 531           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 532           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 533             return REGS_UNALLOCATED;
 534           regs->start = new_start;
 535           regs->end = new_end;
 536           regs->num_regs = need_regs;
 537         }
 538     }
 539   else
 540     {
 541       assert (regs_allocated == REGS_FIXED);
 542       /* This function may not be called with REGS_FIXED and nregs too big.  */
 543       assert (regs->num_regs >= nregs);
 544       rval = REGS_FIXED;
 545     }
 546
 547   /* Copy the regs.  */
 548   for (i = 0; i < nregs; ++i)
 549     {
 550       regs->start[i] = pmatch[i].rm_so;
 551       regs->end[i] = pmatch[i].rm_eo;
 552     }
 553   for ( ; i < regs->num_regs; ++i)
 554     regs->start[i] = regs->end[i] = -1;
 555
 556   return rval;
 557 }
 558
 559 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 560    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 561    this memory for recording register information.  STARTS and ENDS
 562    must be allocated using the malloc library routine, and must each
 563    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 564
 565    If NUM_REGS == 0, then subsequent matches should allocate their own
 566    register data.
 567
 568    Unless this function is called, the first search or match using
 569    PATTERN_BUFFER will allocate its own register data, without
 570    freeing the old data.  */
 571
 572 void
 573 re_set_registers (bufp, regs, num_regs, starts, ends)
 574     struct re_pattern_buffer *bufp;
 575     struct re_registers *regs;
 576     unsigned num_regs;
 577     regoff_t *starts, *ends;
 578 {
 579   if (num_regs)
 580     {
 581       bufp->regs_allocated = REGS_REALLOCATE;
 582       regs->num_regs = num_regs;
 583       regs->start = starts;
 584       regs->end = ends;
 585     }
 586   else
 587     {
 588       bufp->regs_allocated = REGS_UNALLOCATED;
 589       regs->num_regs = 0;
 590       regs->start = regs->end = (regoff_t *) 0;
 591     }
 592 }
 593 #ifdef _LIBC
 594 weak_alias (__re_set_registers, re_set_registers)
 595 #endif
 596 \f
 597 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 598    them unless specifically requested.  */
 599
 600 #if defined _REGEX_RE_COMP || defined _LIBC
 601 int
 602 # ifdef _LIBC
 603 weak_function
 604 # endif
 605 re_exec (s)
 606      const char *s;
 607 {
 608   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 609 }
 610 #endif /* _REGEX_RE_COMP */
 611 \f
 612 /* Internal entry point.  */
 613
 614 /* Searches for a compiled pattern PREG in the string STRING, whose
 615    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 616    meanings as in regexec.  START and RANGE have the same meanings
 617    as in re_search.
 618    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 619    otherwise return the error code.
 620    Note: We assume front end functions already check ranges.
 621    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 622
 623 static reg_errcode_t
 624 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 625                     eflags)
 626     const regex_t *preg;
 627     const char *string;
 628     int length, start, range, stop, eflags;
 629     size_t nmatch;
 630     regmatch_t pmatch[];
 631 {
 632   reg_errcode_t err;
 633   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 634   int left_lim, right_lim, incr;
 635   int fl_longest_match, match_first, match_kind, match_last = -1;
 636   int extra_nmatch;
 637   int sb, ch;
 638 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 639   re_match_context_t mctx = { .dfa = dfa };
 640 #else
 641   re_match_context_t mctx;
 642 #endif
 643   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 644                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 645   RE_TRANSLATE_TYPE t = preg->translate;
 646
 647 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 648   memset (&mctx, '\0', sizeof (re_match_context_t));
 649   mctx.dfa = dfa;
 650 #endif
 651
 652   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 653   nmatch -= extra_nmatch;
 654
 655   /* Check whether the DFA has not been compiled.  */
 656   if (BE (preg->used == 0 || dfa->init_state == NULL
 657           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 658           || dfa->init_state_begbuf == NULL, 0))
 659     return REG_NOMATCH;
 660
 661 #ifdef DEBUG
 662   /* We assume front-end functions already check them.  */
 663   assert (start + range >= 0 && start + range <= length);
 664 #endif
 665
 666   /* If initial states with non-begbuf contexts have no elements,
 667      the regex must be anchored.  If preg->newline_anchor is set,
 668      we'll never use init_state_nl, so do not check it.  */
 669   if (dfa->init_state->nodes.nelem == 0
 670       && dfa->init_state_word->nodes.nelem == 0
 671       && (dfa->init_state_nl->nodes.nelem == 0
 672           || !preg->newline_anchor))
 673     {
 674       if (start != 0 && start + range != 0)
 675         return REG_NOMATCH;
 676       start = range = 0;
 677     }
 678
 679   /* We must check the longest matching, if nmatch > 0.  */
 680   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 681
 682   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 683                             preg->translate, preg->syntax & RE_ICASE, dfa);
 684   if (BE (err != REG_NOERROR, 0))
 685     goto free_return;
 686   mctx.input.stop = stop;
 687   mctx.input.raw_stop = stop;
 688   mctx.input.newline_anchor = preg->newline_anchor;
 689
 690   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 691   if (BE (err != REG_NOERROR, 0))
 692     goto free_return;
 693
 694   /* We will log all the DFA states through which the dfa pass,
 695      if nmatch > 1, or this dfa has "multibyte node", which is a
 696      back-reference or a node which can accept multibyte character or
 697      multi character collating element.  */
 698   if (nmatch > 1 || dfa->has_mb_node)
 699     {
 700       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 701       if (BE (mctx.state_log == NULL, 0))
 702         {
 703           err = REG_ESPACE;
 704           goto free_return;
 705         }
 706     }
 707   else
 708     mctx.state_log = NULL;
 709
 710   match_first = start;
 711   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 712                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 713
 714   /* Check incrementally whether or not the input string matches.  */
 715   incr = (range < 0) ? -1 : 1;
 716   left_lim = (range < 0) ? start + range : start;
 717   right_lim = (range < 0) ? start : start + range;
 718   sb = dfa->mb_cur_max == 1;
 719   match_kind =
 720     (fastmap
 721      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 722         | (range >= 0 ? 2 : 0)
 723         | (t != NULL ? 1 : 0))
 724      : 8);
 725
 726   for (;; match_first += incr)
 727     {
 728       err = REG_NOMATCH;
 729       if (match_first < left_lim || right_lim < match_first)
 730         goto free_return;
 731
 732       /* Advance as rapidly as possible through the string, until we
 733          find a plausible place to start matching.  This may be done
 734          with varying efficiency, so there are various possibilities:
 735          only the most common of them are specialized, in order to
 736          save on code size.  We use a switch statement for speed.  */
 737       switch (match_kind)
 738         {
 739         case 8:
 740           /* No fastmap.  */
 741           break;
 742
 743         case 7:
 744           /* Fastmap with single-byte translation, match forward.  */
 745           while (BE (match_first < right_lim, 1)
 746                  && !fastmap[t[(unsigned char) string[match_first]]])
 747             ++match_first;
 748           goto forward_match_found_start_or_reached_end;
 749
 750         case 6:
 751           /* Fastmap without translation, match forward.  */
 752           while (BE (match_first < right_lim, 1)
 753                  && !fastmap[(unsigned char) string[match_first]])
 754             ++match_first;
 755
 756         forward_match_found_start_or_reached_end:
 757           if (BE (match_first == right_lim, 0))
 758             {
 759               ch = match_first >= length
 760                        ? 0 : (unsigned char) string[match_first];
 761               if (!fastmap[t ? t[ch] : ch])
 762                 goto free_return;
 763             }
 764           break;
 765
 766         case 4:
 767         case 5:
 768           /* Fastmap without multi-byte translation, match backwards.  */
 769           while (match_first >= left_lim)
 770             {
 771               ch = match_first >= length
 772                        ? 0 : (unsigned char) string[match_first];
 773               if (fastmap[t ? t[ch] : ch])
 774                 break;
 775               --match_first;
 776             }
 777           if (match_first < left_lim)
 778             goto free_return;
 779           break;
 780
 781         default:
 782           /* In this case, we can't determine easily the current byte,
 783              since it might be a component byte of a multibyte
 784              character.  Then we use the constructed buffer instead.  */
 785           for (;;)
 786             {
 787               /* If MATCH_FIRST is out of the valid range, reconstruct the
 788                  buffers.  */
 789               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 790               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 791                 {
 792                   err = re_string_reconstruct (&mctx.input, match_first,
 793                                                eflags);
 794                   if (BE (err != REG_NOERROR, 0))
 795                     goto free_return;
 796
 797                   offset = match_first - mctx.input.raw_mbs_idx;
 798                 }
 799               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 800                  Note that MATCH_FIRST must not be smaller than 0.  */
 801               ch = (match_first >= length
 802                     ? 0 : re_string_byte_at (&mctx.input, offset));
 803               if (fastmap[ch])
 804                 break;
 805               match_first += incr;
 806               if (match_first < left_lim || match_first > right_lim)
 807                 {
 808                   err = REG_NOMATCH;
 809                   goto free_return;
 810                 }
 811             }
 812           break;
 813         }
 814
 815       /* Reconstruct the buffers so that the matcher can assume that
 816          the matching starts from the beginning of the buffer.  */
 817       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 818       if (BE (err != REG_NOERROR, 0))
 819         goto free_return;
 820
 821 #ifdef RE_ENABLE_I18N
 822      /* Don't consider this char as a possible match start if it part,
 823         yet isn't the head, of a multibyte character.  */
 824       if (!sb && !re_string_first_byte (&mctx.input, 0))
 825         continue;
 826 #endif
 827
 828       /* It seems to be appropriate one, then use the matcher.  */
 829       /* We assume that the matching starts from 0.  */
 830       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 831       match_last = check_matching (&mctx, fl_longest_match,
 832                                    range >= 0 ? &match_first : NULL);
 833       if (match_last != -1)
 834         {
 835           if (BE (match_last == -2, 0))
 836             {
 837               err = REG_ESPACE;
 838               goto free_return;
 839             }
 840           else
 841             {
 842               mctx.match_last = match_last;
 843               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 844                 {
 845                   re_dfastate_t *pstate = mctx.state_log[match_last];
 846                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 847                                                              match_last);
 848                 }
 849               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 850                   || dfa->nbackref)
 851                 {
 852                   err = prune_impossible_nodes (&mctx);
 853                   if (err == REG_NOERROR)
 854                     break;
 855                   if (BE (err != REG_NOMATCH, 0))
 856                     goto free_return;
 857                   match_last = -1;
 858                 }
 859               else
 860                 break; /* We found a match.  */
 861             }
 862         }
 863
 864       match_ctx_clean (&mctx);
 865     }
 866
 867 #ifdef DEBUG
 868   assert (match_last != -1);
 869   assert (err == REG_NOERROR);
 870 #endif
 871
 872   /* Set pmatch[] if we need.  */
 873   if (nmatch > 0)
 874     {
 875       int reg_idx;
 876
 877       /* Initialize registers.  */
 878       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 879         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 880
 881       /* Set the points where matching start/end.  */
 882       pmatch[0].rm_so = 0;
 883       pmatch[0].rm_eo = mctx.match_last;
 884
 885       if (!preg->no_sub && nmatch > 1)
 886         {
 887           err = set_regs (preg, &mctx, nmatch, pmatch,
 888                           dfa->has_plural_match && dfa->nbackref > 0);
 889           if (BE (err != REG_NOERROR, 0))
 890             goto free_return;
 891         }
 892
 893       /* At last, add the offset to the each registers, since we slided
 894          the buffers so that we could assume that the matching starts
 895          from 0.  */
 896       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 897         if (pmatch[reg_idx].rm_so != -1)
 898           {
 899 #ifdef RE_ENABLE_I18N
 900             if (BE (mctx.input.offsets_needed != 0, 0))
 901               {
 902                 pmatch[reg_idx].rm_so =
 903                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 904                    ? mctx.input.valid_raw_len
 905                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 906                 pmatch[reg_idx].rm_eo =
 907                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 908                    ? mctx.input.valid_raw_len
 909                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 910               }
 911 #else
 912             assert (mctx.input.offsets_needed == 0);
 913 #endif
 914             pmatch[reg_idx].rm_so += match_first;
 915             pmatch[reg_idx].rm_eo += match_first;
 916           }
 917       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 918         {
 919           pmatch[nmatch + reg_idx].rm_so = -1;
 920           pmatch[nmatch + reg_idx].rm_eo = -1;
 921         }
 922
 923       if (dfa->subexp_map)
 924         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 925           if (dfa->subexp_map[reg_idx] != reg_idx)
 926             {
 927               pmatch[reg_idx + 1].rm_so
 928                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 929               pmatch[reg_idx + 1].rm_eo
 930                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 931             }
 932     }
 933
 934  free_return:
 935   re_free (mctx.state_log);
 936   if (dfa->nbackref)
 937     match_ctx_free (&mctx);
 938   re_string_destruct (&mctx.input);
 939   return err;
 940 }
 941
 942 static reg_errcode_t
 943 prune_impossible_nodes (mctx)
 944      re_match_context_t *mctx;
 945 {
 946   const re_dfa_t *const dfa = mctx->dfa;
 947   int halt_node, match_last;
 948   reg_errcode_t ret;
 949   re_dfastate_t **sifted_states;
 950   re_dfastate_t **lim_states = NULL;
 951   re_sift_context_t sctx;
 952 #ifdef DEBUG
 953   assert (mctx->state_log != NULL);
 954 #endif
 955   match_last = mctx->match_last;
 956   halt_node = mctx->last_node;
 957   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 958   if (BE (sifted_states == NULL, 0))
 959     {
 960       ret = REG_ESPACE;
 961       goto free_return;
 962     }
 963   if (dfa->nbackref)
 964     {
 965       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 966       if (BE (lim_states == NULL, 0))
 967         {
 968           ret = REG_ESPACE;
 969           goto free_return;
 970         }
 971       while (1)
 972         {
 973           memset (lim_states, '\0',
 974                   sizeof (re_dfastate_t *) * (match_last + 1));
 975           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 976                          match_last);
 977           ret = sift_states_backward (mctx, &sctx);
 978           re_node_set_free (&sctx.limits);
 979           if (BE (ret != REG_NOERROR, 0))
 980               goto free_return;
 981           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 982             break;
 983           do
 984             {
 985               --match_last;
 986               if (match_last < 0)
 987                 {
 988                   ret = REG_NOMATCH;
 989                   goto free_return;
 990                 }
 991             } while (mctx->state_log[match_last] == NULL
 992                      || !mctx->state_log[match_last]->halt);
 993           halt_node = check_halt_state_context (mctx,
 994                                                 mctx->state_log[match_last],
 995                                                 match_last);
 996         }
 997       ret = merge_state_array (dfa, sifted_states, lim_states,
 998                                match_last + 1);
 999       re_free (lim_states);
1000       lim_states = NULL;
1001       if (BE (ret != REG_NOERROR, 0))
1002         goto free_return;
1003     }
1004   else
1005     {
1006       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1007       ret = sift_states_backward (mctx, &sctx);
1008       re_node_set_free (&sctx.limits);
1009       if (BE (ret != REG_NOERROR, 0))
1010         goto free_return;
1011     }
1012   re_free (mctx->state_log);
1013   mctx->state_log = sifted_states;
1014   sifted_states = NULL;
1015   mctx->last_node = halt_node;
1016   mctx->match_last = match_last;
1017   ret = REG_NOERROR;
1018  free_return:
1019   re_free (sifted_states);
1020   re_free (lim_states);
1021   return ret;
1022 }
1023
1024 /* Acquire an initial state and return it.
1025    We must select appropriate initial state depending on the context,
1026    since initial states may have constraints like "\<", "^", etc..  */
1027
1028 static inline re_dfastate_t *
1029 __attribute ((always_inline)) internal_function
1030 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1031                             int idx)
1032 {
1033   const re_dfa_t *const dfa = mctx->dfa;
1034   if (dfa->init_state->has_constraint)
1035     {
1036       unsigned int context;
1037       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1038       if (IS_WORD_CONTEXT (context))
1039         return dfa->init_state_word;
1040       else if (IS_ORDINARY_CONTEXT (context))
1041         return dfa->init_state;
1042       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1043         return dfa->init_state_begbuf;
1044       else if (IS_NEWLINE_CONTEXT (context))
1045         return dfa->init_state_nl;
1046       else if (IS_BEGBUF_CONTEXT (context))
1047         {
1048           /* It is relatively rare case, then calculate on demand.  */
1049           return re_acquire_state_context (err, dfa,
1050                                            dfa->init_state->entrance_nodes,
1051                                            context);
1052         }
1053       else
1054         /* Must not happen?  */
1055         return dfa->init_state;
1056     }
1057   else
1058     return dfa->init_state;
1059 }
1060
1061 /* Check whether the regular expression match input string INPUT or not,
1062    and return the index where the matching end, return -1 if not match,
1063    or return -2 in case of an error.
1064    FL_LONGEST_MATCH means we want the POSIX longest matching.
1065    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1066    next place where we may want to try matching.
1067    Note that the matcher assume that the maching starts from the current
1068    index of the buffer.  */
1069
1070 static int
1071 internal_function
1072 check_matching (re_match_context_t *mctx, int fl_longest_match,
1073                 int *p_match_first)
1074 {
1075   const re_dfa_t *const dfa = mctx->dfa;
1076   reg_errcode_t err;
1077   int match = 0;
1078   int match_last = -1;
1079   int cur_str_idx = re_string_cur_idx (&mctx->input);
1080   re_dfastate_t *cur_state;
1081   int at_init_state = p_match_first != NULL;
1082   int next_start_idx = cur_str_idx;
1083
1084   err = REG_NOERROR;
1085   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1086   /* An initial state must not be NULL (invalid).  */
1087   if (BE (cur_state == NULL, 0))
1088     {
1089       assert (err == REG_ESPACE);
1090       return -2;
1091     }
1092
1093   if (mctx->state_log != NULL)
1094     {
1095       mctx->state_log[cur_str_idx] = cur_state;
1096
1097       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1098          later.  E.g. Processing back references.  */
1099       if (BE (dfa->nbackref, 0))
1100         {
1101           at_init_state = 0;
1102           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1103           if (BE (err != REG_NOERROR, 0))
1104             return err;
1105
1106           if (cur_state->has_backref)
1107             {
1108               err = transit_state_bkref (mctx, &cur_state->nodes);
1109               if (BE (err != REG_NOERROR, 0))
1110                 return err;
1111             }
1112         }
1113     }
1114
1115   /* If the RE accepts NULL string.  */
1116   if (BE (cur_state->halt, 0))
1117     {
1118       if (!cur_state->has_constraint
1119           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1120         {
1121           if (!fl_longest_match)
1122             return cur_str_idx;
1123           else
1124             {
1125               match_last = cur_str_idx;
1126               match = 1;
1127             }
1128         }
1129     }
1130
1131   while (!re_string_eoi (&mctx->input))
1132     {
1133       re_dfastate_t *old_state = cur_state;
1134       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1135
1136       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1137           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1138               && mctx->input.valid_len < mctx->input.len))
1139         {
1140           err = extend_buffers (mctx);
1141           if (BE (err != REG_NOERROR, 0))
1142             {
1143               assert (err == REG_ESPACE);
1144               return -2;
1145             }
1146         }
1147
1148       cur_state = transit_state (&err, mctx, cur_state);
1149       if (mctx->state_log != NULL)
1150         cur_state = merge_state_with_log (&err, mctx, cur_state);
1151
1152       if (cur_state == NULL)
1153         {
1154           /* Reached the invalid state or an error.  Try to recover a valid
1155              state using the state log, if available and if we have not
1156              already found a valid (even if not the longest) match.  */
1157           if (BE (err != REG_NOERROR, 0))
1158             return -2;
1159
1160           if (mctx->state_log == NULL
1161               || (match && !fl_longest_match)
1162               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1163             break;
1164         }
1165
1166       if (BE (at_init_state, 0))
1167         {
1168           if (old_state == cur_state)
1169             next_start_idx = next_char_idx;
1170           else
1171             at_init_state = 0;
1172         }
1173
1174       if (cur_state->halt)
1175         {
1176           /* Reached a halt state.
1177              Check the halt state can satisfy the current context.  */
1178           if (!cur_state->has_constraint
1179               || check_halt_state_context (mctx, cur_state,
1180                                            re_string_cur_idx (&mctx->input)))
1181             {
1182               /* We found an appropriate halt state.  */
1183               match_last = re_string_cur_idx (&mctx->input);
1184               match = 1;
1185
1186               /* We found a match, do not modify match_first below.  */
1187               p_match_first = NULL;
1188               if (!fl_longest_match)
1189                 break;
1190             }
1191         }
1192     }
1193
1194   if (p_match_first)
1195     *p_match_first += next_start_idx;
1196
1197   return match_last;
1198 }
1199
1200 /* Check NODE match the current context.  */
1201
1202 static int
1203 internal_function
1204 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1205 {
1206   re_token_type_t type = dfa->nodes[node].type;
1207   unsigned int constraint = dfa->nodes[node].constraint;
1208   if (type != END_OF_RE)
1209     return 0;
1210   if (!constraint)
1211     return 1;
1212   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1213     return 0;
1214   return 1;
1215 }
1216
1217 /* Check the halt state STATE match the current context.
1218    Return 0 if not match, if the node, STATE has, is a halt node and
1219    match the context, return the node.  */
1220
1221 static int
1222 internal_function
1223 check_halt_state_context (const re_match_context_t *mctx,
1224                           const re_dfastate_t *state, int idx)
1225 {
1226   int i;
1227   unsigned int context;
1228 #ifdef DEBUG
1229   assert (state->halt);
1230 #endif
1231   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1232   for (i = 0; i < state->nodes.nelem; ++i)
1233     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1234       return state->nodes.elems[i];
1235   return 0;
1236 }
1237
1238 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1239    corresponding to the DFA).
1240    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1241    of errors.  */
1242
1243 static int
1244 internal_function
1245 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1246                    int *pidx, int node, re_node_set *eps_via_nodes,
1247                    struct re_fail_stack_t *fs)
1248 {
1249   const re_dfa_t *const dfa = mctx->dfa;
1250   int i, err;
1251   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1252     {
1253       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1254       re_node_set *edests = &dfa->edests[node];
1255       int dest_node;
1256       err = re_node_set_insert (eps_via_nodes, node);
1257       if (BE (err < 0, 0))
1258         return -2;
1259       /* Pick up a valid destination, or return -1 if none is found.  */
1260       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1261         {
1262           int candidate = edests->elems[i];
1263           if (!re_node_set_contains (cur_nodes, candidate))
1264             continue;
1265           if (dest_node == -1)
1266             dest_node = candidate;
1267
1268           else
1269             {
1270               /* In order to avoid infinite loop like "(a*)*", return the second
1271                  epsilon-transition if the first was already considered.  */
1272               if (re_node_set_contains (eps_via_nodes, dest_node))
1273                 return candidate;
1274
1275               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1276               else if (fs != NULL
1277                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1278                                            eps_via_nodes))
1279                 return -2;
1280
1281               /* We know we are going to exit.  */
1282               break;
1283             }
1284         }
1285       return dest_node;
1286     }
1287   else
1288     {
1289       int naccepted = 0;
1290       re_token_type_t type = dfa->nodes[node].type;
1291
1292 #ifdef RE_ENABLE_I18N
1293       if (dfa->nodes[node].accept_mb)
1294         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1295       else
1296 #endif /* RE_ENABLE_I18N */
1297       if (type == OP_BACK_REF)
1298         {
1299           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1300           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1301           if (fs != NULL)
1302             {
1303               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1304                 return -1;
1305               else if (naccepted)
1306                 {
1307                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1308                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1309                               naccepted) != 0)
1310                     return -1;
1311                 }
1312             }
1313
1314           if (naccepted == 0)
1315             {
1316               int dest_node;
1317               err = re_node_set_insert (eps_via_nodes, node);
1318               if (BE (err < 0, 0))
1319                 return -2;
1320               dest_node = dfa->edests[node].elems[0];
1321               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1322                                         dest_node))
1323                 return dest_node;
1324             }
1325         }
1326
1327       if (naccepted != 0
1328           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1329         {
1330           int dest_node = dfa->nexts[node];
1331           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1332           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1333                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1334                                                dest_node)))
1335             return -1;
1336           re_node_set_empty (eps_via_nodes);
1337           return dest_node;
1338         }
1339     }
1340   return -1;
1341 }
1342
1343 static reg_errcode_t
1344 internal_function
1345 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1346                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1347 {
1348   reg_errcode_t err;
1349   int num = fs->num++;
1350   if (fs->num == fs->alloc)
1351     {
1352       struct re_fail_stack_ent_t *new_array;
1353       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1354                                        * fs->alloc * 2));
1355       if (new_array == NULL)
1356         return REG_ESPACE;
1357       fs->alloc *= 2;
1358       fs->stack = new_array;
1359     }
1360   fs->stack[num].idx = str_idx;
1361   fs->stack[num].node = dest_node;
1362   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1363   if (fs->stack[num].regs == NULL)
1364     return REG_ESPACE;
1365   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1366   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1367   return err;
1368 }
1369
1370 static int
1371 internal_function
1372 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1373                 regmatch_t *regs, re_node_set *eps_via_nodes)
1374 {
1375   int num = --fs->num;
1376   assert (num >= 0);
1377   *pidx = fs->stack[num].idx;
1378   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1379   re_node_set_free (eps_via_nodes);
1380   re_free (fs->stack[num].regs);
1381   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1382   return fs->stack[num].node;
1383 }
1384
1385 /* Set the positions where the subexpressions are starts/ends to registers
1386    PMATCH.
1387    Note: We assume that pmatch[0] is already set, and
1388    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1389
1390 static reg_errcode_t
1391 internal_function
1392 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1393           regmatch_t *pmatch, int fl_backtrack)
1394 {
1395   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1396   int idx, cur_node;
1397   re_node_set eps_via_nodes;
1398   struct re_fail_stack_t *fs;
1399   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1400   regmatch_t *prev_idx_match;
1401   int prev_idx_match_malloced = 0;
1402
1403 #ifdef DEBUG
1404   assert (nmatch > 1);
1405   assert (mctx->state_log != NULL);
1406 #endif
1407   if (fl_backtrack)
1408     {
1409       fs = &fs_body;
1410       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1411       if (fs->stack == NULL)
1412         return REG_ESPACE;
1413     }
1414   else
1415     fs = NULL;
1416
1417   cur_node = dfa->init_node;
1418   re_node_set_init_empty (&eps_via_nodes);
1419
1420   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1421     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1422   else
1423     {
1424       prev_idx_match = re_malloc (regmatch_t, nmatch);
1425       if (prev_idx_match == NULL)
1426         {
1427           free_fail_stack_return (fs);
1428           return REG_ESPACE;
1429         }
1430       prev_idx_match_malloced = 1;
1431     }
1432   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1433
1434   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1435     {
1436       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1437
1438       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1439         {
1440           int reg_idx;
1441           if (fs)
1442             {
1443               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1444                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1445                   break;
1446               if (reg_idx == nmatch)
1447                 {
1448                   re_node_set_free (&eps_via_nodes);
1449                   if (prev_idx_match_malloced)
1450                     re_free (prev_idx_match);
1451                   return free_fail_stack_return (fs);
1452                 }
1453               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454                                          &eps_via_nodes);
1455             }
1456           else
1457             {
1458               re_node_set_free (&eps_via_nodes);
1459               if (prev_idx_match_malloced)
1460                 re_free (prev_idx_match);
1461               return REG_NOERROR;
1462             }
1463         }
1464
1465       /* Proceed to next node.  */
1466       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1467                                     &eps_via_nodes, fs);
1468
1469       if (BE (cur_node < 0, 0))
1470         {
1471           if (BE (cur_node == -2, 0))
1472             {
1473               re_node_set_free (&eps_via_nodes);
1474               if (prev_idx_match_malloced)
1475                 re_free (prev_idx_match);
1476               free_fail_stack_return (fs);
1477               return REG_ESPACE;
1478             }
1479           if (fs)
1480             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1481                                        &eps_via_nodes);
1482           else
1483             {
1484               re_node_set_free (&eps_via_nodes);
1485               if (prev_idx_match_malloced)
1486                 re_free (prev_idx_match);
1487               return REG_NOMATCH;
1488             }
1489         }
1490     }
1491   re_node_set_free (&eps_via_nodes);
1492   if (prev_idx_match_malloced)
1493     re_free (prev_idx_match);
1494   return free_fail_stack_return (fs);
1495 }
1496
1497 static reg_errcode_t
1498 internal_function
1499 free_fail_stack_return (struct re_fail_stack_t *fs)
1500 {
1501   if (fs)
1502     {
1503       int fs_idx;
1504       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1505         {
1506           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1507           re_free (fs->stack[fs_idx].regs);
1508         }
1509       re_free (fs->stack);
1510     }
1511   return REG_NOERROR;
1512 }
1513
1514 static void
1515 internal_function
1516 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1517              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1518 {
1519   int type = dfa->nodes[cur_node].type;
1520   if (type == OP_OPEN_SUBEXP)
1521     {
1522       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1523
1524       /* We are at the first node of this sub expression.  */
1525       if (reg_num < nmatch)
1526         {
1527           pmatch[reg_num].rm_so = cur_idx;
1528           pmatch[reg_num].rm_eo = -1;
1529         }
1530     }
1531   else if (type == OP_CLOSE_SUBEXP)
1532     {
1533       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1534       if (reg_num < nmatch)
1535         {
1536           /* We are at the last node of this sub expression.  */
1537           if (pmatch[reg_num].rm_so < cur_idx)
1538             {
1539               pmatch[reg_num].rm_eo = cur_idx;
1540               /* This is a non-empty match or we are not inside an optional
1541                  subexpression.  Accept this right away.  */
1542               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1543             }
1544           else
1545             {
1546               if (dfa->nodes[cur_node].opt_subexp
1547                   && prev_idx_match[reg_num].rm_so != -1)
1548                 /* We transited through an empty match for an optional
1549                    subexpression, like (a?)*, and this is not the subexp's
1550                    first match.  Copy back the old content of the registers
1551                    so that matches of an inner subexpression are undone as
1552                    well, like in ((a?))*.  */
1553                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1554               else
1555                 /* We completed a subexpression, but it may be part of
1556                    an optional one, so do not update PREV_IDX_MATCH.  */
1557                 pmatch[reg_num].rm_eo = cur_idx;
1558             }
1559         }
1560     }
1561 }
1562
/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
   and sifts the nodes in each state according to the following rules.
   The updated state_log will be written to STATE_LOG.
1566
1567    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1568      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1569         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1570         the LAST_NODE, we throw away the node `a'.
1571      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1572         string `s' and transit to `b':
1573         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1574            away the node `a'.
1575         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1576             thrown away, we throw away the node `a'.
1577      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1578         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1579            node `a'.
1580         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1581             we throw away the node `a'.  */
1582
/* Nonzero iff STATE is not NULL and its node set contains NODE.
   STATE is evaluated twice.  */
#define STATE_NODE_CONTAINS(state,node) \
  ((state) != NULL \
   && re_node_set_contains (&(state)->nodes, (node)) != 0)
1585
1586 static reg_errcode_t
1587 internal_function
1588 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1589 {
1590   reg_errcode_t err;
1591   int null_cnt = 0;
1592   int str_idx = sctx->last_str_idx;
1593   re_node_set cur_dest;
1594
1595 #ifdef DEBUG
1596   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1597 #endif
1598
1599   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1600      transit to the last_node and the last_node itself.  */
1601   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1602   if (BE (err != REG_NOERROR, 0))
1603     return err;
1604   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1605   if (BE (err != REG_NOERROR, 0))
1606     goto free_return;
1607
1608   /* Then check each states in the state_log.  */
1609   while (str_idx > 0)
1610     {
1611       /* Update counters.  */
1612       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1613       if (null_cnt > mctx->max_mb_elem_len)
1614         {
1615           memset (sctx->sifted_states, '\0',
1616                   sizeof (re_dfastate_t *) * str_idx);
1617           re_node_set_free (&cur_dest);
1618           return REG_NOERROR;
1619         }
1620       re_node_set_empty (&cur_dest);
1621       --str_idx;
1622
1623       if (mctx->state_log[str_idx])
1624         {
1625           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1626           if (BE (err != REG_NOERROR, 0))
1627             goto free_return;
1628         }
1629
1630       /* Add all the nodes which satisfy the following conditions:
1631          - It can epsilon transit to a node in CUR_DEST.
1632          - It is in CUR_SRC.
1633          And update state_log.  */
1634       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1635       if (BE (err != REG_NOERROR, 0))
1636         goto free_return;
1637     }
1638   err = REG_NOERROR;
1639  free_return:
1640   re_node_set_free (&cur_dest);
1641   return err;
1642 }
1643
1644 static reg_errcode_t
1645 internal_function
1646 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1647                      int str_idx, re_node_set *cur_dest)
1648 {
1649   const re_dfa_t *const dfa = mctx->dfa;
1650   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1651   int i;
1652
1653   /* Then build the next sifted state.
1654      We build the next sifted state on `cur_dest', and update
1655      `sifted_states[str_idx]' with `cur_dest'.
1656      Note:
1657      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1658      `cur_src' points the node_set of the old `state_log[str_idx]'
1659      (with the epsilon nodes pre-filtered out).  */
1660   for (i = 0; i < cur_src->nelem; i++)
1661     {
1662       int prev_node = cur_src->elems[i];
1663       int naccepted = 0;
1664       int ret;
1665
1666 #ifdef DEBUG
1667       re_token_type_t type = dfa->nodes[prev_node].type;
1668       assert (!IS_EPSILON_NODE (type));
1669 #endif
1670 #ifdef RE_ENABLE_I18N
1671       /* If the node may accept `multi byte'.  */
1672       if (dfa->nodes[prev_node].accept_mb)
1673         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1674                                          str_idx, sctx->last_str_idx);
1675 #endif /* RE_ENABLE_I18N */
1676
1677       /* We don't check backreferences here.
1678          See update_cur_sifted_state().  */
1679       if (!naccepted
1680           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1681           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1682                                   dfa->nexts[prev_node]))
1683         naccepted = 1;
1684
1685       if (naccepted == 0)
1686         continue;
1687
1688       if (sctx->limits.nelem)
1689         {
1690           int to_idx = str_idx + naccepted;
1691           if (check_dst_limits (mctx, &sctx->limits,
1692                                 dfa->nexts[prev_node], to_idx,
1693                                 prev_node, str_idx))
1694             continue;
1695         }
1696       ret = re_node_set_insert (cur_dest, prev_node);
1697       if (BE (ret == -1, 0))
1698         return REG_ESPACE;
1699     }
1700
1701   return REG_NOERROR;
1702 }
1703
1704 /* Helper functions.  */
1705
1706 static reg_errcode_t
1707 internal_function
1708 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1709 {
1710   int top = mctx->state_log_top;
1711
1712   if (next_state_log_idx >= mctx->input.bufs_len
1713       || (next_state_log_idx >= mctx->input.valid_len
1714           && mctx->input.valid_len < mctx->input.len))
1715     {
1716       reg_errcode_t err;
1717       err = extend_buffers (mctx);
1718       if (BE (err != REG_NOERROR, 0))
1719         return err;
1720     }
1721
1722   if (top < next_state_log_idx)
1723     {
1724       memset (mctx->state_log + top + 1, '\0',
1725               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1726       mctx->state_log_top = next_state_log_idx;
1727     }
1728   return REG_NOERROR;
1729 }
1730
1731 static reg_errcode_t
1732 internal_function
1733 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1734                    re_dfastate_t **src, int num)
1735 {
1736   int st_idx;
1737   reg_errcode_t err;
1738   for (st_idx = 0; st_idx < num; ++st_idx)
1739     {
1740       if (dst[st_idx] == NULL)
1741         dst[st_idx] = src[st_idx];
1742       else if (src[st_idx] != NULL)
1743         {
1744           re_node_set merged_set;
1745           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1746                                         &src[st_idx]->nodes);
1747           if (BE (err != REG_NOERROR, 0))
1748             return err;
1749           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1750           re_node_set_free (&merged_set);
1751           if (BE (err != REG_NOERROR, 0))
1752             return err;
1753         }
1754     }
1755   return REG_NOERROR;
1756 }
1757
1758 static reg_errcode_t
1759 internal_function
1760 update_cur_sifted_state (const re_match_context_t *mctx,
1761                          re_sift_context_t *sctx, int str_idx,
1762                          re_node_set *dest_nodes)
1763 {
1764   const re_dfa_t *const dfa = mctx->dfa;
1765   reg_errcode_t err = REG_NOERROR;
1766   const re_node_set *candidates;
1767   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1768                 : &mctx->state_log[str_idx]->nodes);
1769
1770   if (dest_nodes->nelem == 0)
1771     sctx->sifted_states[str_idx] = NULL;
1772   else
1773     {
1774       if (candidates)
1775         {
1776           /* At first, add the nodes which can epsilon transit to a node in
1777              DEST_NODE.  */
1778           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1779           if (BE (err != REG_NOERROR, 0))
1780             return err;
1781
1782           /* Then, check the limitations in the current sift_context.  */
1783           if (sctx->limits.nelem)
1784             {
1785               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1786                                          mctx->bkref_ents, str_idx);
1787               if (BE (err != REG_NOERROR, 0))
1788                 return err;
1789             }
1790         }
1791
1792       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1793       if (BE (err != REG_NOERROR, 0))
1794         return err;
1795     }
1796
1797   if (candidates && mctx->state_log[str_idx]->has_backref)
1798     {
1799       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1800       if (BE (err != REG_NOERROR, 0))
1801         return err;
1802     }
1803   return REG_NOERROR;
1804 }
1805
1806 static reg_errcode_t
1807 internal_function
1808 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1809                        const re_node_set *candidates)
1810 {
1811   reg_errcode_t err = REG_NOERROR;
1812   int i;
1813
1814   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1815   if (BE (err != REG_NOERROR, 0))
1816     return err;
1817
1818   if (!state->inveclosure.alloc)
1819     {
1820       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1821       if (BE (err != REG_NOERROR, 0))
1822         return REG_ESPACE;
1823       for (i = 0; i < dest_nodes->nelem; i++)
1824         re_node_set_merge (&state->inveclosure,
1825                            dfa->inveclosures + dest_nodes->elems[i]);
1826     }
1827   return re_node_set_add_intersect (dest_nodes, candidates,
1828                                     &state->inveclosure);
1829 }
1830
1831 static reg_errcode_t
1832 internal_function
1833 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1834                        const re_node_set *candidates)
1835 {
1836     int ecl_idx;
1837     reg_errcode_t err;
1838     re_node_set *inv_eclosure = dfa->inveclosures + node;
1839     re_node_set except_nodes;
1840     re_node_set_init_empty (&except_nodes);
1841     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1842       {
1843         int cur_node = inv_eclosure->elems[ecl_idx];
1844         if (cur_node == node)
1845           continue;
1846         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1847           {
1848             int edst1 = dfa->edests[cur_node].elems[0];
1849             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1850                          ? dfa->edests[cur_node].elems[1] : -1);
1851             if ((!re_node_set_contains (inv_eclosure, edst1)
1852                  && re_node_set_contains (dest_nodes, edst1))
1853                 || (edst2 > 0
1854                     && !re_node_set_contains (inv_eclosure, edst2)
1855                     && re_node_set_contains (dest_nodes, edst2)))
1856               {
1857                 err = re_node_set_add_intersect (&except_nodes, candidates,
1858                                                  dfa->inveclosures + cur_node);
1859                 if (BE (err != REG_NOERROR, 0))
1860                   {
1861                     re_node_set_free (&except_nodes);
1862                     return err;
1863                   }
1864               }
1865           }
1866       }
1867     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1868       {
1869         int cur_node = inv_eclosure->elems[ecl_idx];
1870         if (!re_node_set_contains (&except_nodes, cur_node))
1871           {
1872             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1873             re_node_set_remove_at (dest_nodes, idx);
1874           }
1875       }
1876     re_node_set_free (&except_nodes);
1877     return REG_NOERROR;
1878 }
1879
1880 static int
1881 internal_function
1882 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1883                   int dst_node, int dst_idx, int src_node, int src_idx)
1884 {
1885   const re_dfa_t *const dfa = mctx->dfa;
1886   int lim_idx, src_pos, dst_pos;
1887
1888   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1889   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1890   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1891     {
1892       int subexp_idx;
1893       struct re_backref_cache_entry *ent;
1894       ent = mctx->bkref_ents + limits->elems[lim_idx];
1895       subexp_idx = dfa->nodes[ent->node].opr.idx;
1896
1897       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1898                                            subexp_idx, dst_node, dst_idx,
1899                                            dst_bkref_idx);
1900       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1901                                            subexp_idx, src_node, src_idx,
1902                                            src_bkref_idx);
1903
1904       /* In case of:
1905          <src> <dst> ( <subexp> )
1906          ( <subexp> ) <src> <dst>
1907          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1908       if (src_pos == dst_pos)
1909         continue; /* This is unrelated limitation.  */
1910       else
1911         return 1;
1912     }
1913   return 0;
1914 }
1915
1916 static int
1917 internal_function
1918 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1919                              int subexp_idx, int from_node, int bkref_idx)
1920 {
1921   const re_dfa_t *const dfa = mctx->dfa;
1922   const re_node_set *eclosures = dfa->eclosures + from_node;
1923   int node_idx;
1924
1925   /* Else, we are on the boundary: examine the nodes on the epsilon
1926      closure.  */
1927   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1928     {
1929       int node = eclosures->elems[node_idx];
1930       switch (dfa->nodes[node].type)
1931         {
1932         case OP_BACK_REF:
1933           if (bkref_idx != -1)
1934             {
1935               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1936               do
1937                 {
1938                   int dst, cpos;
1939
1940                   if (ent->node != node)
1941                     continue;
1942
1943                   if (subexp_idx < BITSET_WORD_BITS
1944                       && !(ent->eps_reachable_subexps_map
1945                            & ((bitset_word_t) 1 << subexp_idx)))
1946                     continue;
1947
1948                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1949                      OP_CLOSE_SUBEXP cases below.  But, if the
1950                      destination node is the same node as the source
1951                      node, don't recurse because it would cause an
1952                      infinite loop: a regex that exhibits this behavior
1953                      is ()\1*\1*  */
1954                   dst = dfa->edests[node].elems[0];
1955                   if (dst == from_node)
1956                     {
1957                       if (boundaries & 1)
1958                         return -1;
1959                       else /* if (boundaries & 2) */
1960                         return 0;
1961                     }
1962
1963                   cpos =
1964                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1965                                                  dst, bkref_idx);
1966                   if (cpos == -1 /* && (boundaries & 1) */)
1967                     return -1;
1968                   if (cpos == 0 && (boundaries & 2))
1969                     return 0;
1970
1971                   if (subexp_idx < BITSET_WORD_BITS)
1972                     ent->eps_reachable_subexps_map
1973                       &= ~((bitset_word_t) 1 << subexp_idx);
1974                 }
1975               while (ent++->more);
1976             }
1977           break;
1978
1979         case OP_OPEN_SUBEXP:
1980           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1981             return -1;
1982           break;
1983
1984         case OP_CLOSE_SUBEXP:
1985           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1986             return 0;
1987           break;
1988
1989         default:
1990             break;
1991         }
1992     }
1993
1994   return (boundaries & 2) ? 1 : 0;
1995 }
1996
1997 static int
1998 internal_function
1999 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2000                            int subexp_idx, int from_node, int str_idx,
2001                            int bkref_idx)
2002 {
2003   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2004   int boundaries;
2005
2006   /* If we are outside the range of the subexpression, return -1 or 1.  */
2007   if (str_idx < lim->subexp_from)
2008     return -1;
2009
2010   if (lim->subexp_to < str_idx)
2011     return 1;
2012
2013   /* If we are within the subexpression, return 0.  */
2014   boundaries = (str_idx == lim->subexp_from);
2015   boundaries |= (str_idx == lim->subexp_to) << 1;
2016   if (boundaries == 0)
2017     return 0;
2018
2019   /* Else, examine epsilon closure.  */
2020   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2021                                       from_node, bkref_idx);
2022 }
2023
2024 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2025    which are against limitations from DEST_NODES. */
2026
2027 static reg_errcode_t
2028 internal_function
2029 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2030                      const re_node_set *candidates, re_node_set *limits,
2031                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2032 {
2033   reg_errcode_t err;
2034   int node_idx, lim_idx;
2035
2036   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2037     {
2038       int subexp_idx;
2039       struct re_backref_cache_entry *ent;
2040       ent = bkref_ents + limits->elems[lim_idx];
2041
2042       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2043         continue; /* This is unrelated limitation.  */
2044
2045       subexp_idx = dfa->nodes[ent->node].opr.idx;
2046       if (ent->subexp_to == str_idx)
2047         {
2048           int ops_node = -1;
2049           int cls_node = -1;
2050           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2051             {
2052               int node = dest_nodes->elems[node_idx];
2053               re_token_type_t type = dfa->nodes[node].type;
2054               if (type == OP_OPEN_SUBEXP
2055                   && subexp_idx == dfa->nodes[node].opr.idx)
2056                 ops_node = node;
2057               else if (type == OP_CLOSE_SUBEXP
2058                        && subexp_idx == dfa->nodes[node].opr.idx)
2059                 cls_node = node;
2060             }
2061
2062           /* Check the limitation of the open subexpression.  */
2063           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2064           if (ops_node >= 0)
2065             {
2066               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2067                                            candidates);
2068               if (BE (err != REG_NOERROR, 0))
2069                 return err;
2070             }
2071
2072           /* Check the limitation of the close subexpression.  */
2073           if (cls_node >= 0)
2074             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2075               {
2076                 int node = dest_nodes->elems[node_idx];
2077                 if (!re_node_set_contains (dfa->inveclosures + node,
2078                                            cls_node)
2079                     && !re_node_set_contains (dfa->eclosures + node,
2080                                               cls_node))
2081                   {
2082                     /* It is against this limitation.
2083                        Remove it form the current sifted state.  */
2084                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2085                                                  candidates);
2086                     if (BE (err != REG_NOERROR, 0))
2087                       return err;
2088                     --node_idx;
2089                   }
2090               }
2091         }
2092       else /* (ent->subexp_to != str_idx)  */
2093         {
2094           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2095             {
2096               int node = dest_nodes->elems[node_idx];
2097               re_token_type_t type = dfa->nodes[node].type;
2098               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2099                 {
2100                   if (subexp_idx != dfa->nodes[node].opr.idx)
2101                     continue;
2102                   /* It is against this limitation.
2103                      Remove it form the current sifted state.  */
2104                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2105                                                candidates);
2106                   if (BE (err != REG_NOERROR, 0))
2107                     return err;
2108                 }
2109             }
2110         }
2111     }
2112   return REG_NOERROR;
2113 }
2114
2115 static reg_errcode_t
2116 internal_function
2117 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2118                    int str_idx, const re_node_set *candidates)
2119 {
2120   const re_dfa_t *const dfa = mctx->dfa;
2121   reg_errcode_t err;
2122   int node_idx, node;
2123   re_sift_context_t local_sctx;
2124   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2125
2126   if (first_idx == -1)
2127     return REG_NOERROR;
2128
2129   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2130
2131   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2132     {
2133       int enabled_idx;
2134       re_token_type_t type;
2135       struct re_backref_cache_entry *entry;
2136       node = candidates->elems[node_idx];
2137       type = dfa->nodes[node].type;
2138       /* Avoid infinite loop for the REs like "()\1+".  */
2139       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2140         continue;
2141       if (type != OP_BACK_REF)
2142         continue;
2143
2144       entry = mctx->bkref_ents + first_idx;
2145       enabled_idx = first_idx;
2146       do
2147         {
2148           int subexp_len;
2149           int to_idx;
2150           int dst_node;
2151           int ret;
2152           re_dfastate_t *cur_state;
2153
2154           if (entry->node != node)
2155             continue;
2156           subexp_len = entry->subexp_to - entry->subexp_from;
2157           to_idx = str_idx + subexp_len;
2158           dst_node = (subexp_len ? dfa->nexts[node]
2159                       : dfa->edests[node].elems[0]);
2160
2161           if (to_idx > sctx->last_str_idx
2162               || sctx->sifted_states[to_idx] == NULL
2163               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2164               || check_dst_limits (mctx, &sctx->limits, node,
2165                                    str_idx, dst_node, to_idx))
2166             continue;
2167
2168           if (local_sctx.sifted_states == NULL)
2169             {
2170               local_sctx = *sctx;
2171               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2172               if (BE (err != REG_NOERROR, 0))
2173                 goto free_return;
2174             }
2175           local_sctx.last_node = node;
2176           local_sctx.last_str_idx = str_idx;
2177           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2178           if (BE (ret < 0, 0))
2179             {
2180               err = REG_ESPACE;
2181               goto free_return;
2182             }
2183           cur_state = local_sctx.sifted_states[str_idx];
2184           err = sift_states_backward (mctx, &local_sctx);
2185           if (BE (err != REG_NOERROR, 0))
2186             goto free_return;
2187           if (sctx->limited_states != NULL)
2188             {
2189               err = merge_state_array (dfa, sctx->limited_states,
2190                                        local_sctx.sifted_states,
2191                                        str_idx + 1);
2192               if (BE (err != REG_NOERROR, 0))
2193                 goto free_return;
2194             }
2195           local_sctx.sifted_states[str_idx] = cur_state;
2196           re_node_set_remove (&local_sctx.limits, enabled_idx);
2197
2198           /* mctx->bkref_ents may have changed, reload the pointer.  */
2199           entry = mctx->bkref_ents + enabled_idx;
2200         }
2201       while (enabled_idx++, entry++->more);
2202     }
2203   err = REG_NOERROR;
2204  free_return:
2205   if (local_sctx.sifted_states != NULL)
2206     {
2207       re_node_set_free (&local_sctx.limits);
2208     }
2209
2210   return err;
2211 }
2212
2213
#ifdef RE_ENABLE_I18N
/* Return the byte count reported by check_node_accept_bytes for node
   NODE_IDX at STR_IDX, or 0 when that count is positive, the
   destination index does not exceed MAX_STR_IDX, and the node's
   successor is missing from the sifted state at the destination.  */
static int
internal_function
sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
                     int node_idx, int str_idx, int max_str_idx)
{
  const re_dfa_t *const dfa = mctx->dfa;
  int accepted = check_node_accept_bytes (dfa, node_idx, &mctx->input,
                                          str_idx);

  /* Nothing accepted: hand the result back unchanged.  */
  if (accepted <= 0)
    return accepted;

  /* Destinations past MAX_STR_IDX are not examined.  */
  if (str_idx + accepted > max_str_idx)
    return accepted;

  /* The successor survived sifting, so the transition stands.  */
  if (STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + accepted],
                           dfa->nexts[node_idx]))
    return accepted;

  /* The destination was thrown away; treat the input as not accepted.  */
  return 0;
}
#endif /* RE_ENABLE_I18N */
2236
2237 \f
2238 /* Functions for state transition.  */
2239
2240 /* Return the next state to which the current state STATE will transit by
2241    accepting the current input byte, and update STATE_LOG if necessary.
2242    If STATE can accept a multibyte char/collating element/back reference
2243    update the destination of STATE_LOG.  */
2244
2245 static re_dfastate_t *
2246 internal_function
2247 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2248                re_dfastate_t *state)
2249 {
2250   re_dfastate_t **trtable;
2251   unsigned char ch;
2252
2253 #ifdef RE_ENABLE_I18N
2254   /* If the current state can accept multibyte.  */
2255   if (BE (state->accept_mb, 0))
2256     {
2257       *err = transit_state_mb (mctx, state);
2258       if (BE (*err != REG_NOERROR, 0))
2259         return NULL;
2260     }
2261 #endif /* RE_ENABLE_I18N */
2262
2263   /* Then decide the next state with the single byte.  */
2264 #if 0
2265   if (0)
2266     /* don't use transition table  */
2267     return transit_state_sb (err, mctx, state);
2268 #endif
2269
2270   /* Use transition table  */
2271   ch = re_string_fetch_byte (&mctx->input);
2272   for (;;)
2273     {
2274       trtable = state->trtable;
2275       if (BE (trtable != NULL, 1))
2276         return trtable[ch];
2277
2278       trtable = state->word_trtable;
2279       if (BE (trtable != NULL, 1))
2280         {
2281           unsigned int context;
2282           context
2283             = re_string_context_at (&mctx->input,
2284                                     re_string_cur_idx (&mctx->input) - 1,
2285                                     mctx->eflags);
2286           if (IS_WORD_CONTEXT (context))
2287             return trtable[ch + SBC_MAX];
2288           else
2289             return trtable[ch];
2290         }
2291
2292       if (!build_trtable (mctx->dfa, state))
2293         {
2294           *err = REG_ESPACE;
2295           return NULL;
2296         }
2297
2298       /* Retry, we now have a transition table.  */
2299     }
2300 }
2301
2302 /* Update the state_log if we need */
2303 re_dfastate_t *
2304 internal_function
2305 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2306                       re_dfastate_t *next_state)
2307 {
2308   const re_dfa_t *const dfa = mctx->dfa;
2309   int cur_idx = re_string_cur_idx (&mctx->input);
2310
2311   if (cur_idx > mctx->state_log_top)
2312     {
2313       mctx->state_log[cur_idx] = next_state;
2314       mctx->state_log_top = cur_idx;
2315     }
2316   else if (mctx->state_log[cur_idx] == 0)
2317     {
2318       mctx->state_log[cur_idx] = next_state;
2319     }
2320   else
2321     {
2322       re_dfastate_t *pstate;
2323       unsigned int context;
2324       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2325       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2326          the destination of a multibyte char/collating element/
2327          back reference.  Then the next state is the union set of
2328          these destinations and the results of the transition table.  */
2329       pstate = mctx->state_log[cur_idx];
2330       log_nodes = pstate->entrance_nodes;
2331       if (next_state != NULL)
2332         {
2333           table_nodes = next_state->entrance_nodes;
2334           *err = re_node_set_init_union (&next_nodes, table_nodes,
2335                                              log_nodes);
2336           if (BE (*err != REG_NOERROR, 0))
2337             return NULL;
2338         }
2339       else
2340         next_nodes = *log_nodes;
2341       /* Note: We already add the nodes of the initial state,
2342          then we don't need to add them here.  */
2343
2344       context = re_string_context_at (&mctx->input,
2345                                       re_string_cur_idx (&mctx->input) - 1,
2346                                       mctx->eflags);
2347       next_state = mctx->state_log[cur_idx]
2348         = re_acquire_state_context (err, dfa, &next_nodes, context);
2349       /* We don't need to check errors here, since the return value of
2350          this function is next_state and ERR is already set.  */
2351
2352       if (table_nodes != NULL)
2353         re_node_set_free (&next_nodes);
2354     }
2355
2356   if (BE (dfa->nbackref, 0) && next_state != NULL)
2357     {
2358       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2359          later.  We must check them here, since the back references in the
2360          next state might use them.  */
2361       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2362                                         cur_idx);
2363       if (BE (*err != REG_NOERROR, 0))
2364         return NULL;
2365
2366       /* If the next state has back references.  */
2367       if (next_state->has_backref)
2368         {
2369           *err = transit_state_bkref (mctx, &next_state->nodes);
2370           if (BE (*err != REG_NOERROR, 0))
2371             return NULL;
2372           next_state = mctx->state_log[cur_idx];
2373         }
2374     }
2375
2376   return next_state;
2377 }
2378
2379 /* Skip bytes in the input that correspond to part of a
2380    multi-byte match, then look in the log for a state
2381    from which to restart matching.  */
2382 re_dfastate_t *
2383 internal_function
2384 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2385 {
2386   re_dfastate_t *cur_state;
2387   do
2388     {
2389       int max = mctx->state_log_top;
2390       int cur_str_idx = re_string_cur_idx (&mctx->input);
2391
2392       do
2393         {
2394           if (++cur_str_idx > max)
2395             return NULL;
2396           re_string_skip_bytes (&mctx->input, 1);
2397         }
2398       while (mctx->state_log[cur_str_idx] == NULL);
2399
2400       cur_state = merge_state_with_log (err, mctx, NULL);
2401     }
2402   while (*err == REG_NOERROR && cur_state == NULL);
2403   return cur_state;
2404 }
2405
2406 /* Helper functions for transit_state.  */
2407
2408 /* From the node set CUR_NODES, pick up the nodes whose types are
2409    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2410    expression. And register them to use them later for evaluating the
2411    correspoding back references.  */
2412
2413 static reg_errcode_t
2414 internal_function
2415 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2416                            int str_idx)
2417 {
2418   const re_dfa_t *const dfa = mctx->dfa;
2419   int node_idx;
2420   reg_errcode_t err;
2421
2422   /* TODO: This isn't efficient.
2423            Because there might be more than one nodes whose types are
2424            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2425            nodes.
2426            E.g. RE: (a){2}  */
2427   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2428     {
2429       int node = cur_nodes->elems[node_idx];
2430       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2431           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2432           && (dfa->used_bkref_map
2433               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2434         {
2435           err = match_ctx_add_subtop (mctx, node, str_idx);
2436           if (BE (err != REG_NOERROR, 0))
2437             return err;
2438         }
2439     }
2440   return REG_NOERROR;
2441 }
2442
#if 0
/* Compute the state reached from STATE by consuming the input byte at the
   current position, then advance the input by one byte.
   If building the destination node set fails, NULL is returned with the
   error stored in *ERR.  This function is currently compiled out.  */

static re_dfastate_t *
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
                  re_dfastate_t *state)
{
  const re_dfa_t *const dfa = mctx->dfa;
  const int str_idx = re_string_cur_idx (&mctx->input);
  re_node_set dests;
  re_dfastate_t *result;
  unsigned int context;
  int i;

  *err = re_node_set_alloc (&dests, state->nodes.nelem + 1);
  if (BE (*err != REG_NOERROR, 0))
    return NULL;

  /* Collect the epsilon closures of the successors of every node that
     accepts the current byte.  */
  for (i = 0; i < state->nodes.nelem; ++i)
    {
      const int node = state->nodes.elems[i];

      if (!check_node_accept (mctx, dfa->nodes + node, str_idx))
        continue;
      *err = re_node_set_merge (&dests, dfa->eclosures + dfa->nexts[node]);
      if (BE (*err != REG_NOERROR, 0))
        {
          re_node_set_free (&dests);
          return NULL;
        }
    }

  context = re_string_context_at (&mctx->input, str_idx, mctx->eflags);
  /* No error check is needed here: the result is returned as is and
     *ERR has already been set by the callee.  */
  result = re_acquire_state_context (err, dfa, &dests, context);

  re_node_set_free (&dests);
  re_string_skip_bytes (&mctx->input, 1);
  return result;
}
#endif
2484
#ifdef RE_ENABLE_I18N
/* For every node of PSTATE that may accept a multi-byte element, find out
   how many bytes it accepts at the current input position and merge the
   epsilon closure of its successor into the state logged at the position
   just past those bytes.
   Return REG_NOERROR, or the first error encountered.  */

static reg_errcode_t
internal_function
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
{
  const re_dfa_t *const dfa = mctx->dfa;
  int n;

  for (n = 0; n < pstate->nodes.nelem; ++n)
    {
      const int node = pstate->nodes.elems[n];
      re_node_set merged, *closure;
      re_dfastate_t *prev;
      reg_errcode_t err;
      unsigned int context;
      int cur_idx, naccepted, dest_idx;

      if (!dfa->nodes[node].accept_mb)
        continue;

      cur_idx = re_string_cur_idx (&mctx->input);
      if (dfa->nodes[node].constraint)
        {
          context = re_string_context_at (&mctx->input, cur_idx,
                                          mctx->eflags);
          if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[node].constraint,
                                           context))
            continue;
        }

      /* Number of bytes this node accepts here; zero means no match.  */
      naccepted = check_node_accept_bytes (dfa, node, &mctx->input, cur_idx);
      if (naccepted == 0)
        continue;

      dest_idx = cur_idx + naccepted;
      /* Remember the longest multi-byte element seen so far.  */
      if (mctx->max_mb_elem_len < naccepted)
        mctx->max_mb_elem_len = naccepted;

      err = clean_state_log_if_needed (mctx, dest_idx);
      if (BE (err != REG_NOERROR, 0))
        return err;
#ifdef DEBUG
      assert (dfa->nexts[node] != -1);
#endif
      closure = dfa->eclosures + dfa->nexts[node];

      prev = mctx->state_log[dest_idx];
      if (prev != NULL)
        {
          err = re_node_set_init_union (&merged, prev->entrance_nodes,
                                        closure);
          if (BE (err != REG_NOERROR, 0))
            return err;
        }
      else
        /* Shallow copy that borrows CLOSURE's storage; it must not be
           freed below.  */
        merged = *closure;

      context = re_string_context_at (&mctx->input, dest_idx - 1,
                                      mctx->eflags);
      mctx->state_log[dest_idx]
        = re_acquire_state_context (&err, dfa, &merged, context);
      /* Only the union built above owns its element array.  */
      if (prev != NULL)
        re_node_set_free (&merged);
      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
        return err;
    }
  return REG_NOERROR;
}
#endif /* RE_ENABLE_I18N */
2555
2556 static reg_errcode_t
2557 internal_function
2558 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2559 {
2560   const re_dfa_t *const dfa = mctx->dfa;
2561   reg_errcode_t err;
2562   int i;
2563   int cur_str_idx = re_string_cur_idx (&mctx->input);
2564
2565   for (i = 0; i < nodes->nelem; ++i)
2566     {
2567       int dest_str_idx, prev_nelem, bkc_idx;
2568       int node_idx = nodes->elems[i];
2569       unsigned int context;
2570       const re_token_t *node = dfa->nodes + node_idx;
2571       re_node_set *new_dest_nodes;
2572
2573       /* Check whether `node' is a backreference or not.  */
2574       if (node->type != OP_BACK_REF)
2575         continue;
2576
2577       if (node->constraint)
2578         {
2579           context = re_string_context_at (&mctx->input, cur_str_idx,
2580                                           mctx->eflags);
2581           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2582             continue;
2583         }
2584
2585       /* `node' is a backreference.
2586          Check the substring which the substring matched.  */
2587       bkc_idx = mctx->nbkref_ents;
2588       err = get_subexp (mctx, node_idx, cur_str_idx);
2589       if (BE (err != REG_NOERROR, 0))
2590         goto free_return;
2591
2592       /* And add the epsilon closures (which is `new_dest_nodes') of
2593          the backreference to appropriate state_log.  */
2594 #ifdef DEBUG
2595       assert (dfa->nexts[node_idx] != -1);
2596 #endif
2597       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2598         {
2599           int subexp_len;
2600           re_dfastate_t *dest_state;
2601           struct re_backref_cache_entry *bkref_ent;
2602           bkref_ent = mctx->bkref_ents + bkc_idx;
2603           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2604             continue;
2605           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2606           new_dest_nodes = (subexp_len == 0
2607                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2608                             : dfa->eclosures + dfa->nexts[node_idx]);
2609           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2610                           - bkref_ent->subexp_from);
2611           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2612                                           mctx->eflags);
2613           dest_state = mctx->state_log[dest_str_idx];
2614           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2615                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2616           /* Add `new_dest_node' to state_log.  */
2617           if (dest_state == NULL)
2618             {
2619               mctx->state_log[dest_str_idx]
2620                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2621                                             context);
2622               if (BE (mctx->state_log[dest_str_idx] == NULL
2623                       && err != REG_NOERROR, 0))
2624                 goto free_return;
2625             }
2626           else
2627             {
2628               re_node_set dest_nodes;
2629               err = re_node_set_init_union (&dest_nodes,
2630                                             dest_state->entrance_nodes,
2631                                             new_dest_nodes);
2632               if (BE (err != REG_NOERROR, 0))
2633                 {
2634                   re_node_set_free (&dest_nodes);
2635                   goto free_return;
2636                 }
2637               mctx->state_log[dest_str_idx]
2638                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2639               re_node_set_free (&dest_nodes);
2640               if (BE (mctx->state_log[dest_str_idx] == NULL
2641                       && err != REG_NOERROR, 0))
2642                 goto free_return;
2643             }
2644           /* We need to check recursively if the backreference can epsilon
2645              transit.  */
2646           if (subexp_len == 0
2647               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2648             {
2649               err = check_subexp_matching_top (mctx, new_dest_nodes,
2650                                                cur_str_idx);
2651               if (BE (err != REG_NOERROR, 0))
2652                 goto free_return;
2653               err = transit_state_bkref (mctx, new_dest_nodes);
2654               if (BE (err != REG_NOERROR, 0))
2655                 goto free_return;
2656             }
2657         }
2658     }
2659   err = REG_NOERROR;
2660  free_return:
2661   return err;
2662 }
2663
2664 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2665    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2666    Note that we might collect inappropriate candidates here.
2667    However, the cost of checking them strictly here is too high, then we
2668    delay these checking for prune_impossible_nodes().  */
2669
2670 static reg_errcode_t
2671 internal_function
2672 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2673 {
2674   const re_dfa_t *const dfa = mctx->dfa;
2675   int subexp_num, sub_top_idx;
2676   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2677   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2678   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2679   if (cache_idx != -1)
2680     {
2681       const struct re_backref_cache_entry *entry
2682         = mctx->bkref_ents + cache_idx;
2683       do
2684         if (entry->node == bkref_node)
2685           return REG_NOERROR; /* We already checked it.  */
2686       while (entry++->more);
2687     }
2688
2689   subexp_num = dfa->nodes[bkref_node].opr.idx;
2690
2691   /* For each sub expression  */
2692   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2693     {
2694       reg_errcode_t err;
2695       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2696       re_sub_match_last_t *sub_last;
2697       int sub_last_idx, sl_str, bkref_str_off;
2698
2699       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2700         continue; /* It isn't related.  */
2701
2702       sl_str = sub_top->str_idx;
2703       bkref_str_off = bkref_str_idx;
2704       /* At first, check the last node of sub expressions we already
2705          evaluated.  */
2706       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2707         {
2708           int sl_str_diff;
2709           sub_last = sub_top->lasts[sub_last_idx];
2710           sl_str_diff = sub_last->str_idx - sl_str;
2711           /* The matched string by the sub expression match with the substring
2712              at the back reference?  */
2713           if (sl_str_diff > 0)
2714             {
2715               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2716                 {
2717                   /* Not enough chars for a successful match.  */
2718                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2719                     break;
2720
2721                   err = clean_state_log_if_needed (mctx,
2722                                                    bkref_str_off
2723                                                    + sl_str_diff);
2724                   if (BE (err != REG_NOERROR, 0))
2725                     return err;
2726                   buf = (const char *) re_string_get_buffer (&mctx->input);
2727                 }
2728               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2729                 /* We don't need to search this sub expression any more.  */
2730                 break;
2731             }
2732           bkref_str_off += sl_str_diff;
2733           sl_str += sl_str_diff;
2734           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2735                                 bkref_str_idx);
2736
2737           /* Reload buf, since the preceding call might have reallocated
2738              the buffer.  */
2739           buf = (const char *) re_string_get_buffer (&mctx->input);
2740
2741           if (err == REG_NOMATCH)
2742             continue;
2743           if (BE (err != REG_NOERROR, 0))
2744             return err;
2745         }
2746
2747       if (sub_last_idx < sub_top->nlasts)
2748         continue;
2749       if (sub_last_idx > 0)
2750         ++sl_str;
2751       /* Then, search for the other last nodes of the sub expression.  */
2752       for (; sl_str <= bkref_str_idx; ++sl_str)
2753         {
2754           int cls_node, sl_str_off;
2755           const re_node_set *nodes;
2756           sl_str_off = sl_str - sub_top->str_idx;
2757           /* The matched string by the sub expression match with the substring
2758              at the back reference?  */
2759           if (sl_str_off > 0)
2760             {
2761               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2762                 {
2763                   /* If we are at the end of the input, we cannot match.  */
2764                   if (bkref_str_off >= mctx->input.len)
2765                     break;
2766
2767                   err = extend_buffers (mctx);
2768                   if (BE (err != REG_NOERROR, 0))
2769                     return err;
2770
2771                   buf = (const char *) re_string_get_buffer (&mctx->input);
2772                 }
2773               if (buf [bkref_str_off++] != buf[sl_str - 1])
2774                 break; /* We don't need to search this sub expression
2775                           any more.  */
2776             }
2777           if (mctx->state_log[sl_str] == NULL)
2778             continue;
2779           /* Does this state have a ')' of the sub expression?  */
2780           nodes = &mctx->state_log[sl_str]->nodes;
2781           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2782                                        OP_CLOSE_SUBEXP);
2783           if (cls_node == -1)
2784             continue; /* No.  */
2785           if (sub_top->path == NULL)
2786             {
2787               sub_top->path = calloc (sizeof (state_array_t),
2788                                       sl_str - sub_top->str_idx + 1);
2789               if (sub_top->path == NULL)
2790                 return REG_ESPACE;
2791             }
2792           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2793              in the current context?  */
2794           err = check_arrival (mctx, sub_top->path, sub_top->node,
2795                                sub_top->str_idx, cls_node, sl_str,
2796                                OP_CLOSE_SUBEXP);
2797           if (err == REG_NOMATCH)
2798               continue;
2799           if (BE (err != REG_NOERROR, 0))
2800               return err;
2801           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2802           if (BE (sub_last == NULL, 0))
2803             return REG_ESPACE;
2804           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2805                                 bkref_str_idx);
2806           if (err == REG_NOMATCH)
2807             continue;
2808         }
2809     }
2810   return REG_NOERROR;
2811 }
2812
2813 /* Helper functions for get_subexp().  */
2814
2815 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2816    If it can arrive, register the sub expression expressed with SUB_TOP
2817    and SUB_LAST.  */
2818
2819 static reg_errcode_t
2820 internal_function
2821 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2822                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2823 {
2824   reg_errcode_t err;
2825   int to_idx;
2826   /* Can the subexpression arrive the back reference?  */
2827   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2828                        sub_last->str_idx, bkref_node, bkref_str,
2829                        OP_OPEN_SUBEXP);
2830   if (err != REG_NOERROR)
2831     return err;
2832   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2833                              sub_last->str_idx);
2834   if (BE (err != REG_NOERROR, 0))
2835     return err;
2836   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2837   return clean_state_log_if_needed (mctx, to_idx);
2838 }
2839
2840 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2841    Search '(' if FL_OPEN, or search ')' otherwise.
2842    TODO: This function isn't efficient...
2843          Because there might be more than one nodes whose types are
2844          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2845          nodes.
2846          E.g. RE: (a){2}  */
2847
2848 static int
2849 internal_function
2850 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2851                   int subexp_idx, int type)
2852 {
2853   int cls_idx;
2854   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2855     {
2856       int cls_node = nodes->elems[cls_idx];
2857       const re_token_t *node = dfa->nodes + cls_node;
2858       if (node->type == type
2859           && node->opr.idx == subexp_idx)
2860         return cls_node;
2861     }
2862   return -1;
2863 }
2864
2865 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2866    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2867    heavily reused.
2868    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2869
2870 static reg_errcode_t
2871 internal_function
2872 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2873                int top_str, int last_node, int last_str, int type)
2874 {
2875   const re_dfa_t *const dfa = mctx->dfa;
2876   reg_errcode_t err = REG_NOERROR;
2877   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2878   re_dfastate_t *cur_state = NULL;
2879   re_node_set *cur_nodes, next_nodes;
2880   re_dfastate_t **backup_state_log;
2881   unsigned int context;
2882
2883   subexp_num = dfa->nodes[top_node].opr.idx;
2884   /* Extend the buffer if we need.  */
2885   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2886     {
2887       re_dfastate_t **new_array;
2888       int old_alloc = path->alloc;
2889       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2890       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2891       if (BE (new_array == NULL, 0))
2892         {
2893           path->alloc = old_alloc;
2894           return REG_ESPACE;
2895         }
2896       path->array = new_array;
2897       memset (new_array + old_alloc, '\0',
2898               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2899     }
2900
2901   str_idx = path->next_idx ?: top_str;
2902
2903   /* Temporary modify MCTX.  */
2904   backup_state_log = mctx->state_log;
2905   backup_cur_idx = mctx->input.cur_idx;
2906   mctx->state_log = path->array;
2907   mctx->input.cur_idx = str_idx;
2908
2909   /* Setup initial node set.  */
2910   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2911   if (str_idx == top_str)
2912     {
2913       err = re_node_set_init_1 (&next_nodes, top_node);
2914       if (BE (err != REG_NOERROR, 0))
2915         return err;
2916       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2917       if (BE (err != REG_NOERROR, 0))
2918         {
2919           re_node_set_free (&next_nodes);
2920           return err;
2921         }
2922     }
2923   else
2924     {
2925       cur_state = mctx->state_log[str_idx];
2926       if (cur_state && cur_state->has_backref)
2927         {
2928           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2929           if (BE (err != REG_NOERROR, 0))
2930             return err;
2931         }
2932       else
2933         re_node_set_init_empty (&next_nodes);
2934     }
2935   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2936     {
2937       if (next_nodes.nelem)
2938         {
2939           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2940                                     subexp_num, type);
2941           if (BE (err != REG_NOERROR, 0))
2942             {
2943               re_node_set_free (&next_nodes);
2944               return err;
2945             }
2946         }
2947       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2948       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2949         {
2950           re_node_set_free (&next_nodes);
2951           return err;
2952         }
2953       mctx->state_log[str_idx] = cur_state;
2954     }
2955
2956   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2957     {
2958       re_node_set_empty (&next_nodes);
2959       if (mctx->state_log[str_idx + 1])
2960         {
2961           err = re_node_set_merge (&next_nodes,
2962                                    &mctx->state_log[str_idx + 1]->nodes);
2963           if (BE (err != REG_NOERROR, 0))
2964             {
2965               re_node_set_free (&next_nodes);
2966               return err;
2967             }
2968         }
2969       if (cur_state)
2970         {
2971           err = check_arrival_add_next_nodes (mctx, str_idx,
2972                                               &cur_state->non_eps_nodes,
2973                                               &next_nodes);
2974           if (BE (err != REG_NOERROR, 0))
2975             {
2976               re_node_set_free (&next_nodes);
2977               return err;
2978             }
2979         }
2980       ++str_idx;
2981       if (next_nodes.nelem)
2982         {
2983           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2984           if (BE (err != REG_NOERROR, 0))
2985             {
2986               re_node_set_free (&next_nodes);
2987               return err;
2988             }
2989           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2990                                     subexp_num, type);
2991           if (BE (err != REG_NOERROR, 0))
2992             {
2993               re_node_set_free (&next_nodes);
2994               return err;
2995             }
2996         }
2997       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2998       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2999       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3000         {
3001           re_node_set_free (&next_nodes);
3002           return err;
3003         }
3004       mctx->state_log[str_idx] = cur_state;
3005       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3006     }
3007   re_node_set_free (&next_nodes);
3008   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3009                : &mctx->state_log[last_str]->nodes);
3010   path->next_idx = str_idx;
3011
3012   /* Fix MCTX.  */
3013   mctx->state_log = backup_state_log;
3014   mctx->input.cur_idx = backup_cur_idx;
3015
3016   /* Then check the current node set has the node LAST_NODE.  */
3017   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3018     return REG_NOERROR;
3019
3020   return REG_NOMATCH;
3021 }
3022
3023 /* Helper functions for check_arrival.  */
3024
3025 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3026    to NEXT_NODES.
3027    TODO: This function is similar to the functions transit_state*(),
3028          however this function has many additional works.
3029          Can't we unify them?  */
3030
3031 static reg_errcode_t
3032 internal_function
3033 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3034                               re_node_set *cur_nodes, re_node_set *next_nodes)
3035 {
3036   const re_dfa_t *const dfa = mctx->dfa;
3037   int result;
3038   int cur_idx;
3039 #ifdef RE_ENABLE_I18N
3040   reg_errcode_t err = REG_NOERROR;
3041 #endif
3042   re_node_set union_set;
3043   re_node_set_init_empty (&union_set);
3044   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3045     {
3046       int naccepted = 0;
3047       int cur_node = cur_nodes->elems[cur_idx];
3048 #ifdef DEBUG
3049       re_token_type_t type = dfa->nodes[cur_node].type;
3050       assert (!IS_EPSILON_NODE (type));
3051 #endif
3052 #ifdef RE_ENABLE_I18N
3053       /* If the node may accept `multi byte'.  */
3054       if (dfa->nodes[cur_node].accept_mb)
3055         {
3056           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3057                                                str_idx);
3058           if (naccepted > 1)
3059             {
3060               re_dfastate_t *dest_state;
3061               int next_node = dfa->nexts[cur_node];
3062               int next_idx = str_idx + naccepted;
3063               dest_state = mctx->state_log[next_idx];
3064               re_node_set_empty (&union_set);
3065               if (dest_state)
3066                 {
3067                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3068                   if (BE (err != REG_NOERROR, 0))
3069                     {
3070                       re_node_set_free (&union_set);
3071                       return err;
3072                     }
3073                 }
3074               result = re_node_set_insert (&union_set, next_node);
3075               if (BE (result < 0, 0))
3076                 {
3077                   re_node_set_free (&union_set);
3078                   return REG_ESPACE;
3079                 }
3080               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3081                                                             &union_set);
3082               if (BE (mctx->state_log[next_idx] == NULL
3083                       && err != REG_NOERROR, 0))
3084                 {
3085                   re_node_set_free (&union_set);
3086                   return err;
3087                 }
3088             }
3089         }
3090 #endif /* RE_ENABLE_I18N */
3091       if (naccepted
3092           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3093         {
3094           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3095           if (BE (result < 0, 0))
3096             {
3097               re_node_set_free (&union_set);
3098               return REG_ESPACE;
3099             }
3100         }
3101     }
3102   re_node_set_free (&union_set);
3103   return REG_NOERROR;
3104 }
3105
3106 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3107    CUR_NODES, however exclude the nodes which are:
3108     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3109     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3110 */
3111
3112 static reg_errcode_t
3113 internal_function
3114 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3115                           int ex_subexp, int type)
3116 {
3117   reg_errcode_t err;
3118   int idx, outside_node;
3119   re_node_set new_nodes;
3120 #ifdef DEBUG
3121   assert (cur_nodes->nelem);
3122 #endif
3123   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3124   if (BE (err != REG_NOERROR, 0))
3125     return err;
3126   /* Create a new node set NEW_NODES with the nodes which are epsilon
3127      closures of the node in CUR_NODES.  */
3128
3129   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3130     {
3131       int cur_node = cur_nodes->elems[idx];
3132       const re_node_set *eclosure = dfa->eclosures + cur_node;
3133       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3134       if (outside_node == -1)
3135         {
3136           /* There are no problematic nodes, just merge them.  */
3137           err = re_node_set_merge (&new_nodes, eclosure);
3138           if (BE (err != REG_NOERROR, 0))
3139             {
3140               re_node_set_free (&new_nodes);
3141               return err;
3142             }
3143         }
3144       else
3145         {
3146           /* There are problematic nodes, re-calculate incrementally.  */
3147           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3148                                               ex_subexp, type);
3149           if (BE (err != REG_NOERROR, 0))
3150             {
3151               re_node_set_free (&new_nodes);
3152               return err;
3153             }
3154         }
3155     }
3156   re_node_set_free (cur_nodes);
3157   *cur_nodes = new_nodes;
3158   return REG_NOERROR;
3159 }
3160
3161 /* Helper function for check_arrival_expand_ecl.
3162    Check incrementally the epsilon closure of TARGET, and if it isn't
3163    problematic append it to DST_NODES.  */
3164
3165 static reg_errcode_t
3166 internal_function
3167 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3168                               int target, int ex_subexp, int type)
3169 {
3170   int cur_node;
3171   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3172     {
3173       int err;
3174
3175       if (dfa->nodes[cur_node].type == type
3176           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3177         {
3178           if (type == OP_CLOSE_SUBEXP)
3179             {
3180               err = re_node_set_insert (dst_nodes, cur_node);
3181               if (BE (err == -1, 0))
3182                 return REG_ESPACE;
3183             }
3184           break;
3185         }
3186       err = re_node_set_insert (dst_nodes, cur_node);
3187       if (BE (err == -1, 0))
3188         return REG_ESPACE;
3189       if (dfa->edests[cur_node].nelem == 0)
3190         break;
3191       if (dfa->edests[cur_node].nelem == 2)
3192         {
3193           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3194                                               dfa->edests[cur_node].elems[1],
3195                                               ex_subexp, type);
3196           if (BE (err != REG_NOERROR, 0))
3197             return err;
3198         }
3199       cur_node = dfa->edests[cur_node].elems[0];
3200     }
3201   return REG_NOERROR;
3202 }
3203
3204
3205 /* For all the back references in the current state, calculate the
3206    destination of the back references by the appropriate entry
3207    in MCTX->BKREF_ENTS.  */
3208
3209 static reg_errcode_t
3210 internal_function
3211 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3212                     int cur_str, int subexp_num, int type)
3213 {
3214   const re_dfa_t *const dfa = mctx->dfa;
3215   reg_errcode_t err;
3216   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3217   struct re_backref_cache_entry *ent;
3218
3219   if (cache_idx_start == -1)
3220     return REG_NOERROR;
3221
3222  restart:
3223   ent = mctx->bkref_ents + cache_idx_start;
3224   do
3225     {
3226       int to_idx, next_node;
3227
3228       /* Is this entry ENT is appropriate?  */
3229       if (!re_node_set_contains (cur_nodes, ent->node))
3230         continue; /* No.  */
3231
3232       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3233       /* Calculate the destination of the back reference, and append it
3234          to MCTX->STATE_LOG.  */
3235       if (to_idx == cur_str)
3236         {
3237           /* The backreference did epsilon transit, we must re-check all the
3238              node in the current state.  */
3239           re_node_set new_dests;
3240           reg_errcode_t err2, err3;
3241           next_node = dfa->edests[ent->node].elems[0];
3242           if (re_node_set_contains (cur_nodes, next_node))
3243             continue;
3244           err = re_node_set_init_1 (&new_dests, next_node);
3245           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3246           err3 = re_node_set_merge (cur_nodes, &new_dests);
3247           re_node_set_free (&new_dests);
3248           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3249                   || err3 != REG_NOERROR, 0))
3250             {
3251               err = (err != REG_NOERROR ? err
3252                      : (err2 != REG_NOERROR ? err2 : err3));
3253               return err;
3254             }
3255           /* TODO: It is still inefficient...  */
3256           goto restart;
3257         }
3258       else
3259         {
3260           re_node_set union_set;
3261           next_node = dfa->nexts[ent->node];
3262           if (mctx->state_log[to_idx])
3263             {
3264               int ret;
3265               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3266                                         next_node))
3267                 continue;
3268               err = re_node_set_init_copy (&union_set,
3269                                            &mctx->state_log[to_idx]->nodes);
3270               ret = re_node_set_insert (&union_set, next_node);
3271               if (BE (err != REG_NOERROR || ret < 0, 0))
3272                 {
3273                   re_node_set_free (&union_set);
3274                   err = err != REG_NOERROR ? err : REG_ESPACE;
3275                   return err;
3276                 }
3277             }
3278           else
3279             {
3280               err = re_node_set_init_1 (&union_set, next_node);
3281               if (BE (err != REG_NOERROR, 0))
3282                 return err;
3283             }
3284           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3285           re_node_set_free (&union_set);
3286           if (BE (mctx->state_log[to_idx] == NULL
3287                   && err != REG_NOERROR, 0))
3288             return err;
3289         }
3290     }
3291   while (ent++->more);
3292   return REG_NOERROR;
3293 }
3294
3295 /* Build transition table for the state.
3296    Return 1 if succeeded, otherwise return NULL.  */
3297
3298 static int
3299 internal_function
3300 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3301 {
3302   reg_errcode_t err;
3303   int i, j, ch, need_word_trtable = 0;
3304   bitset_word_t elem, mask;
3305   bool dests_node_malloced = false;
3306   bool dest_states_malloced = false;
3307   int ndests; /* Number of the destination states from `state'.  */
3308   re_dfastate_t **trtable;
3309   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3310   re_node_set follows, *dests_node;
3311   bitset_t *dests_ch;
3312   bitset_t acceptable;
3313
3314   struct dests_alloc
3315   {
3316     re_node_set dests_node[SBC_MAX];
3317     bitset_t dests_ch[SBC_MAX];
3318   } *dests_alloc;
3319
3320   /* We build DFA states which corresponds to the destination nodes
3321      from `state'.  `dests_node[i]' represents the nodes which i-th
3322      destination state contains, and `dests_ch[i]' represents the
3323      characters which i-th destination state accepts.  */
3324   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3325     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3326   else
3327     {
3328       dests_alloc = re_malloc (struct dests_alloc, 1);
3329       if (BE (dests_alloc == NULL, 0))
3330         return 0;
3331       dests_node_malloced = true;
3332     }
3333   dests_node = dests_alloc->dests_node;
3334   dests_ch = dests_alloc->dests_ch;
3335
3336   /* Initialize transiton table.  */
3337   state->word_trtable = state->trtable = NULL;
3338
3339   /* At first, group all nodes belonging to `state' into several
3340      destinations.  */
3341   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3342   if (BE (ndests <= 0, 0))
3343     {
3344       if (dests_node_malloced)
3345         free (dests_alloc);
3346       /* Return 0 in case of an error, 1 otherwise.  */
3347       if (ndests == 0)
3348         {
3349           state->trtable = (re_dfastate_t **)
3350             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3351           return 1;
3352         }
3353       return 0;
3354     }
3355
3356   err = re_node_set_alloc (&follows, ndests + 1);
3357   if (BE (err != REG_NOERROR, 0))
3358     goto out_free;
3359
3360   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3361                          + ndests * 3 * sizeof (re_dfastate_t *)))
3362     dest_states = (re_dfastate_t **)
3363       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3364   else
3365     {
3366       dest_states = (re_dfastate_t **)
3367         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3368       if (BE (dest_states == NULL, 0))
3369         {
3370 out_free:
3371           if (dest_states_malloced)
3372             free (dest_states);
3373           re_node_set_free (&follows);
3374           for (i = 0; i < ndests; ++i)
3375             re_node_set_free (dests_node + i);
3376           if (dests_node_malloced)
3377             free (dests_alloc);
3378           return 0;
3379         }
3380       dest_states_malloced = true;
3381     }
3382   dest_states_word = dest_states + ndests;
3383   dest_states_nl = dest_states_word + ndests;
3384   bitset_empty (acceptable);
3385
3386   /* Then build the states for all destinations.  */
3387   for (i = 0; i < ndests; ++i)
3388     {
3389       int next_node;
3390       re_node_set_empty (&follows);
3391       /* Merge the follows of this destination states.  */
3392       for (j = 0; j < dests_node[i].nelem; ++j)
3393         {
3394           next_node = dfa->nexts[dests_node[i].elems[j]];
3395           if (next_node != -1)
3396             {
3397               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3398               if (BE (err != REG_NOERROR, 0))
3399                 goto out_free;
3400             }
3401         }
3402       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3403       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3404         goto out_free;
3405       /* If the new state has context constraint,
3406          build appropriate states for these contexts.  */
3407       if (dest_states[i]->has_constraint)
3408         {
3409           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3410                                                           CONTEXT_WORD);
3411           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3412             goto out_free;
3413
3414           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3415             need_word_trtable = 1;
3416
3417           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3418                                                         CONTEXT_NEWLINE);
3419           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3420             goto out_free;
3421         }
3422       else
3423         {
3424           dest_states_word[i] = dest_states[i];
3425           dest_states_nl[i] = dest_states[i];
3426         }
3427       bitset_merge (acceptable, dests_ch[i]);
3428     }
3429
3430   if (!BE (need_word_trtable, 0))
3431     {
3432       /* We don't care about whether the following character is a word
3433          character, or we are in a single-byte character set so we can
3434          discern by looking at the character code: allocate a
3435          256-entry transition table.  */
3436       trtable = state->trtable =
3437         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3438       if (BE (trtable == NULL, 0))
3439         goto out_free;
3440
3441       /* For all characters ch...:  */
3442       for (i = 0; i < BITSET_WORDS; ++i)
3443         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3444              elem;
3445              mask <<= 1, elem >>= 1, ++ch)
3446           if (BE (elem & 1, 0))
3447             {
3448               /* There must be exactly one destination which accepts
3449                  character ch.  See group_nodes_into_DFAstates.  */
3450               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3451                 ;
3452
3453               /* j-th destination accepts the word character ch.  */
3454               if (dfa->word_char[i] & mask)
3455                 trtable[ch] = dest_states_word[j];
3456               else
3457                 trtable[ch] = dest_states[j];
3458             }
3459     }
3460   else
3461     {
3462       /* We care about whether the following character is a word
3463          character, and we are in a multi-byte character set: discern
3464          by looking at the character code: build two 256-entry
3465          transition tables, one starting at trtable[0] and one
3466          starting at trtable[SBC_MAX].  */
3467       trtable = state->word_trtable =
3468         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3469       if (BE (trtable == NULL, 0))
3470         goto out_free;
3471
3472       /* For all characters ch...:  */
3473       for (i = 0; i < BITSET_WORDS; ++i)
3474         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3475              elem;
3476              mask <<= 1, elem >>= 1, ++ch)
3477           if (BE (elem & 1, 0))
3478             {
3479               /* There must be exactly one destination which accepts
3480                  character ch.  See group_nodes_into_DFAstates.  */
3481               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3482                 ;
3483
3484               /* j-th destination accepts the word character ch.  */
3485               trtable[ch] = dest_states[j];
3486               trtable[ch + SBC_MAX] = dest_states_word[j];
3487             }
3488     }
3489
3490   /* new line */
3491   if (bitset_contain (acceptable, NEWLINE_CHAR))
3492     {
3493       /* The current state accepts newline character.  */
3494       for (j = 0; j < ndests; ++j)
3495         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3496           {
3497             /* k-th destination accepts newline character.  */
3498             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3499             if (need_word_trtable)
3500               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3501             /* There must be only one destination which accepts
3502                newline.  See group_nodes_into_DFAstates.  */
3503             break;
3504           }
3505     }
3506
3507   if (dest_states_malloced)
3508     free (dest_states);
3509
3510   re_node_set_free (&follows);
3511   for (i = 0; i < ndests; ++i)
3512     re_node_set_free (dests_node + i);
3513
3514   if (dests_node_malloced)
3515     free (dests_alloc);
3516
3517   return 1;
3518 }
3519
3520 /* Group all nodes belonging to STATE into several destinations.
3521    Then for all destinations, set the nodes belonging to the destination
3522    to DESTS_NODE[i] and set the characters accepted by the destination
3523    to DEST_CH[i].  This function return the number of destinations.  */
3524
3525 static int
3526 internal_function
3527 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3528                             re_node_set *dests_node, bitset_t *dests_ch)
3529 {
3530   reg_errcode_t err;
3531   int result;
3532   int i, j, k;
3533   int ndests; /* Number of the destinations from `state'.  */
3534   bitset_t accepts; /* Characters a node can accept.  */
3535   const re_node_set *cur_nodes = &state->nodes;
3536   bitset_empty (accepts);
3537   ndests = 0;
3538
3539   /* For all the nodes belonging to `state',  */
3540   for (i = 0; i < cur_nodes->nelem; ++i)
3541     {
3542       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3543       re_token_type_t type = node->type;
3544       unsigned int constraint = node->constraint;
3545
3546       /* Enumerate all single byte character this node can accept.  */
3547       if (type == CHARACTER)
3548         bitset_set (accepts, node->opr.c);
3549       else if (type == SIMPLE_BRACKET)
3550         {
3551           bitset_merge (accepts, node->opr.sbcset);
3552         }
3553       else if (type == OP_PERIOD)
3554         {
3555 #ifdef RE_ENABLE_I18N
3556           if (dfa->mb_cur_max > 1)
3557             bitset_merge (accepts, dfa->sb_char);
3558           else
3559 #endif
3560             bitset_set_all (accepts);
3561           if (!(dfa->syntax & RE_DOT_NEWLINE))
3562             bitset_clear (accepts, '\n');
3563           if (dfa->syntax & RE_DOT_NOT_NULL)
3564             bitset_clear (accepts, '\0');
3565         }
3566 #ifdef RE_ENABLE_I18N
3567       else if (type == OP_UTF8_PERIOD)
3568         {
3569           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3570           if (!(dfa->syntax & RE_DOT_NEWLINE))
3571             bitset_clear (accepts, '\n');
3572           if (dfa->syntax & RE_DOT_NOT_NULL)
3573             bitset_clear (accepts, '\0');
3574         }
3575 #endif
3576       else
3577         continue;
3578
3579       /* Check the `accepts' and sift the characters which are not
3580          match it the context.  */
3581       if (constraint)
3582         {
3583           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3584             {
3585               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3586               bitset_empty (accepts);
3587               if (accepts_newline)
3588                 bitset_set (accepts, NEWLINE_CHAR);
3589               else
3590                 continue;
3591             }
3592           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3593             {
3594               bitset_empty (accepts);
3595               continue;
3596             }
3597
3598           if (constraint & NEXT_WORD_CONSTRAINT)
3599             {
3600               bitset_word_t any_set = 0;
3601               if (type == CHARACTER && !node->word_char)
3602                 {
3603                   bitset_empty (accepts);
3604                   continue;
3605                 }
3606 #ifdef RE_ENABLE_I18N
3607               if (dfa->mb_cur_max > 1)
3608                 for (j = 0; j < BITSET_WORDS; ++j)
3609                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3610               else
3611 #endif
3612                 for (j = 0; j < BITSET_WORDS; ++j)
3613                   any_set |= (accepts[j] &= dfa->word_char[j]);
3614               if (!any_set)
3615                 continue;
3616             }
3617           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3618             {
3619               bitset_word_t any_set = 0;
3620               if (type == CHARACTER && node->word_char)
3621                 {
3622                   bitset_empty (accepts);
3623                   continue;
3624                 }
3625 #ifdef RE_ENABLE_I18N
3626               if (dfa->mb_cur_max > 1)
3627                 for (j = 0; j < BITSET_WORDS; ++j)
3628                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3629               else
3630 #endif
3631                 for (j = 0; j < BITSET_WORDS; ++j)
3632                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3633               if (!any_set)
3634                 continue;
3635             }
3636         }
3637
3638       /* Then divide `accepts' into DFA states, or create a new
3639          state.  Above, we make sure that accepts is not empty.  */
3640       for (j = 0; j < ndests; ++j)
3641         {
3642           bitset_t intersec; /* Intersection sets, see below.  */
3643           bitset_t remains;
3644           /* Flags, see below.  */
3645           bitset_word_t has_intersec, not_subset, not_consumed;
3646
3647           /* Optimization, skip if this state doesn't accept the character.  */
3648           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3649             continue;
3650
3651           /* Enumerate the intersection set of this state and `accepts'.  */
3652           has_intersec = 0;
3653           for (k = 0; k < BITSET_WORDS; ++k)
3654             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3655           /* And skip if the intersection set is empty.  */
3656           if (!has_intersec)
3657             continue;
3658
3659           /* Then check if this state is a subset of `accepts'.  */
3660           not_subset = not_consumed = 0;
3661           for (k = 0; k < BITSET_WORDS; ++k)
3662             {
3663               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3664               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3665             }
3666
3667           /* If this state isn't a subset of `accepts', create a
3668              new group state, which has the `remains'. */
3669           if (not_subset)
3670             {
3671               bitset_copy (dests_ch[ndests], remains);
3672               bitset_copy (dests_ch[j], intersec);
3673               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3674               if (BE (err != REG_NOERROR, 0))
3675                 goto error_return;
3676               ++ndests;
3677             }
3678
3679           /* Put the position in the current group. */
3680           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3681           if (BE (result < 0, 0))
3682             goto error_return;
3683
3684           /* If all characters are consumed, go to next node. */
3685           if (!not_consumed)
3686             break;
3687         }
3688       /* Some characters remain, create a new group. */
3689       if (j == ndests)
3690         {
3691           bitset_copy (dests_ch[ndests], accepts);
3692           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3693           if (BE (err != REG_NOERROR, 0))
3694             goto error_return;
3695           ++ndests;
3696           bitset_empty (accepts);
3697         }
3698     }
3699   return ndests;
3700  error_return:
3701   for (j = 0; j < ndests; ++j)
3702     re_node_set_free (dests_node + j);
3703   return -1;
3704 }
3705
3706 #ifdef RE_ENABLE_I18N
3707 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3708    Return the number of the bytes the node accepts.
3709    STR_IDX is the current index of the input string.
3710
3711    This function handles the nodes which can accept one character, or
3712    one collating element like '.', '[a-z]', opposite to the other nodes
3713    can only accept one byte.  */
3714
3715 static int
3716 internal_function
3717 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3718                          const re_string_t *input, int str_idx)
3719 {
3720   const re_token_t *node = dfa->nodes + node_idx;
3721   int char_len, elem_len;
3722   int i;
3723
3724   if (BE (node->type == OP_UTF8_PERIOD, 0))
3725     {
3726       unsigned char c = re_string_byte_at (input, str_idx), d;
3727       if (BE (c < 0xc2, 1))
3728         return 0;
3729
3730       if (str_idx + 2 > input->len)
3731         return 0;
3732
3733       d = re_string_byte_at (input, str_idx + 1);
3734       if (c < 0xe0)
3735         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3736       else if (c < 0xf0)
3737         {
3738           char_len = 3;
3739           if (c == 0xe0 && d < 0xa0)
3740             return 0;
3741         }
3742       else if (c < 0xf8)
3743         {
3744           char_len = 4;
3745           if (c == 0xf0 && d < 0x90)
3746             return 0;
3747         }
3748       else if (c < 0xfc)
3749         {
3750           char_len = 5;
3751           if (c == 0xf8 && d < 0x88)
3752             return 0;
3753         }
3754       else if (c < 0xfe)
3755         {
3756           char_len = 6;
3757           if (c == 0xfc && d < 0x84)
3758             return 0;
3759         }
3760       else
3761         return 0;
3762
3763       if (str_idx + char_len > input->len)
3764         return 0;
3765
3766       for (i = 1; i < char_len; ++i)
3767         {
3768           d = re_string_byte_at (input, str_idx + i);
3769           if (d < 0x80 || d > 0xbf)
3770             return 0;
3771         }
3772       return char_len;
3773     }
3774
3775   char_len = re_string_char_size_at (input, str_idx);
3776   if (node->type == OP_PERIOD)
3777     {
3778       if (char_len <= 1)
3779         return 0;
3780       /* FIXME: I don't think this if is needed, as both '\n'
3781          and '\0' are char_len == 1.  */
3782       /* '.' accepts any one character except the following two cases.  */
3783       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3784            re_string_byte_at (input, str_idx) == '\n') ||
3785           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3786            re_string_byte_at (input, str_idx) == '\0'))
3787         return 0;
3788       return char_len;
3789     }
3790
3791   elem_len = re_string_elem_size_at (input, str_idx);
3792   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3793     return 0;
3794
3795   if (node->type == COMPLEX_BRACKET)
3796     {
3797       const re_charset_t *cset = node->opr.mbcset;
3798 # ifdef _LIBC
3799       const unsigned char *pin
3800         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3801       int j;
3802       uint32_t nrules;
3803 # endif /* _LIBC */
3804       int match_len = 0;
3805       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3806                     ? re_string_wchar_at (input, str_idx) : 0);
3807
3808       /* match with multibyte character?  */
3809       for (i = 0; i < cset->nmbchars; ++i)
3810         if (wc == cset->mbchars[i])
3811           {
3812             match_len = char_len;
3813             goto check_node_accept_bytes_match;
3814           }
3815       /* match with character_class?  */
3816       for (i = 0; i < cset->nchar_classes; ++i)
3817         {
3818           wctype_t wt = cset->char_classes[i];
3819           if (__iswctype (wc, wt))
3820             {
3821               match_len = char_len;
3822               goto check_node_accept_bytes_match;
3823             }
3824         }
3825
3826 # ifdef _LIBC
3827       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3828       if (nrules != 0)
3829         {
3830           unsigned int in_collseq = 0;
3831           const int32_t *table, *indirect;
3832           const unsigned char *weights, *extra;
3833           const char *collseqwc;
3834           /* This #include defines a local function!  */
3835 #  include <locale/weight.h>
3836
3837           /* match with collating_symbol?  */
3838           if (cset->ncoll_syms)
3839             extra = (const unsigned char *)
3840               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3841           for (i = 0; i < cset->ncoll_syms; ++i)
3842             {
3843               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3844               /* Compare the length of input collating element and
3845                  the length of current collating element.  */
3846               if (*coll_sym != elem_len)
3847                 continue;
3848               /* Compare each bytes.  */
3849               for (j = 0; j < *coll_sym; j++)
3850                 if (pin[j] != coll_sym[1 + j])
3851                   break;
3852               if (j == *coll_sym)
3853                 {
3854                   /* Match if every bytes is equal.  */
3855                   match_len = j;
3856                   goto check_node_accept_bytes_match;
3857                 }
3858             }
3859
3860           if (cset->nranges)
3861             {
3862               if (elem_len <= char_len)
3863                 {
3864                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3865                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3866                 }
3867               else
3868                 in_collseq = find_collation_sequence_value (pin, elem_len);
3869             }
3870           /* match with range expression?  */
3871           for (i = 0; i < cset->nranges; ++i)
3872             if (cset->range_starts[i] <= in_collseq
3873                 && in_collseq <= cset->range_ends[i])
3874               {
3875                 match_len = elem_len;
3876                 goto check_node_accept_bytes_match;
3877               }
3878
3879           /* match with equivalence_class?  */
3880           if (cset->nequiv_classes)
3881             {
3882               const unsigned char *cp = pin;
3883               table = (const int32_t *)
3884                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3885               weights = (const unsigned char *)
3886                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3887               extra = (const unsigned char *)
3888                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3889               indirect = (const int32_t *)
3890                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3891               int32_t idx = findidx (&cp);
3892               if (idx > 0)
3893                 for (i = 0; i < cset->nequiv_classes; ++i)
3894                   {
3895                     int32_t equiv_class_idx = cset->equiv_classes[i];
3896                     size_t weight_len = weights[idx & 0xffffff];
3897                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3898                         && (idx >> 24) == (equiv_class_idx >> 24))
3899                       {
3900                         int cnt = 0;
3901
3902                         idx &= 0xffffff;
3903                         equiv_class_idx &= 0xffffff;
3904
3905                         while (cnt <= weight_len
3906                                && (weights[equiv_class_idx + 1 + cnt]
3907                                    == weights[idx + 1 + cnt]))
3908                           ++cnt;
3909                         if (cnt > weight_len)
3910                           {
3911                             match_len = elem_len;
3912                             goto check_node_accept_bytes_match;
3913                           }
3914                       }
3915                   }
3916             }
3917         }
3918       else
3919 # endif /* _LIBC */
3920         {
3921           /* match with range expression?  */
3922 #if __GNUC__ >= 2
3923           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3924 #else
3925           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3926           cmp_buf[2] = wc;
3927 #endif
3928           for (i = 0; i < cset->nranges; ++i)
3929             {
3930               cmp_buf[0] = cset->range_starts[i];
3931               cmp_buf[4] = cset->range_ends[i];
3932               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3933                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3934                 {
3935                   match_len = char_len;
3936                   goto check_node_accept_bytes_match;
3937                 }
3938             }
3939         }
3940     check_node_accept_bytes_match:
3941       if (!cset->non_match)
3942         return match_len;
3943       else
3944         {
3945           if (match_len > 0)
3946             return 0;
3947           else
3948             return (elem_len > char_len) ? elem_len : char_len;
3949         }
3950     }
3951   return 0;
3952 }
3953
3954 # ifdef _LIBC
3955 static unsigned int
3956 internal_function
3957 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3958 {
3959   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3960   if (nrules == 0)
3961     {
3962       if (mbs_len == 1)
3963         {
3964           /* No valid character.  Match it as a single byte character.  */
3965           const unsigned char *collseq = (const unsigned char *)
3966             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3967           return collseq[mbs[0]];
3968         }
3969       return UINT_MAX;
3970     }
3971   else
3972     {
3973       int32_t idx;
3974       const unsigned char *extra = (const unsigned char *)
3975         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3976       int32_t extrasize = (const unsigned char *)
3977         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3978
3979       for (idx = 0; idx < extrasize;)
3980         {
3981           int mbs_cnt, found = 0;
3982           int32_t elem_mbs_len;
3983           /* Skip the name of collating element name.  */
3984           idx = idx + extra[idx] + 1;
3985           elem_mbs_len = extra[idx++];
3986           if (mbs_len == elem_mbs_len)
3987             {
3988               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3989                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3990                   break;
3991               if (mbs_cnt == elem_mbs_len)
3992                 /* Found the entry.  */
3993                 found = 1;
3994             }
3995           /* Skip the byte sequence of the collating element.  */
3996           idx += elem_mbs_len;
3997           /* Adjust for the alignment.  */
3998           idx = (idx + 3) & ~3;
3999           /* Skip the collation sequence value.  */
4000           idx += sizeof (uint32_t);
4001           /* Skip the wide char sequence of the collating element.  */
4002           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4003           /* If we found the entry, return the sequence value.  */
4004           if (found)
4005             return *(uint32_t *) (extra + idx);
4006           /* Skip the collation sequence value.  */
4007           idx += sizeof (uint32_t);
4008         }
4009       return UINT_MAX;
4010     }
4011 }
4012 # endif /* _LIBC */
4013 #endif /* RE_ENABLE_I18N */
4014
4015 /* Check whether the node accepts the byte which is IDX-th
4016    byte of the INPUT.  */
4017
4018 static int
4019 internal_function
4020 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4021                    int idx)
4022 {
4023   unsigned char ch;
4024   ch = re_string_byte_at (&mctx->input, idx);
4025   switch (node->type)
4026     {
4027     case CHARACTER:
4028       if (node->opr.c != ch)
4029         return 0;
4030       break;
4031
4032     case SIMPLE_BRACKET:
4033       if (!bitset_contain (node->opr.sbcset, ch))
4034         return 0;
4035       break;
4036
4037 #ifdef RE_ENABLE_I18N
4038     case OP_UTF8_PERIOD:
4039       if (ch >= 0x80)
4040         return 0;
4041       /* FALLTHROUGH */
4042 #endif
4043     case OP_PERIOD:
4044       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4045           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4046         return 0;
4047       break;
4048
4049     default:
4050       return 0;
4051     }
4052
4053   if (node->constraint)
4054     {
4055       /* The node has constraints.  Check whether the current context
4056          satisfies the constraints.  */
4057       unsigned int context = re_string_context_at (&mctx->input, idx,
4058                                                    mctx->eflags);
4059       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4060         return 0;
4061     }
4062
4063   return 1;
4064 }
4065
4066 /* Extend the buffers, if the buffers have run out.  */
4067
4068 static reg_errcode_t
4069 internal_function
4070 extend_buffers (re_match_context_t *mctx)
4071 {
4072   reg_errcode_t ret;
4073   re_string_t *pstr = &mctx->input;
4074
4075   /* Double the lengthes of the buffers.  */
4076   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4077   if (BE (ret != REG_NOERROR, 0))
4078     return ret;
4079
4080   if (mctx->state_log != NULL)
4081     {
4082       /* And double the length of state_log.  */
4083       /* XXX We have no indication of the size of this buffer.  If this
4084          allocation fail we have no indication that the state_log array
4085          does not have the right size.  */
4086       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4087                                               pstr->bufs_len + 1);
4088       if (BE (new_array == NULL, 0))
4089         return REG_ESPACE;
4090       mctx->state_log = new_array;
4091     }
4092
4093   /* Then reconstruct the buffers.  */
4094   if (pstr->icase)
4095     {
4096 #ifdef RE_ENABLE_I18N
4097       if (pstr->mb_cur_max > 1)
4098         {
4099           ret = build_wcs_upper_buffer (pstr);
4100           if (BE (ret != REG_NOERROR, 0))
4101             return ret;
4102         }
4103       else
4104 #endif /* RE_ENABLE_I18N  */
4105         build_upper_buffer (pstr);
4106     }
4107   else
4108     {
4109 #ifdef RE_ENABLE_I18N
4110       if (pstr->mb_cur_max > 1)
4111         build_wcs_buffer (pstr);
4112       else
4113 #endif /* RE_ENABLE_I18N  */
4114         {
4115           if (pstr->trans != NULL)
4116             re_string_translate_buffer (pstr);
4117         }
4118     }
4119   return REG_NOERROR;
4120 }
4121
4122 \f
4123 /* Functions for matching context.  */
4124
4125 /* Initialize MCTX.  */
4126
4127 static reg_errcode_t
4128 internal_function
4129 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4130 {
4131   mctx->eflags = eflags;
4132   mctx->match_last = -1;
4133   if (n > 0)
4134     {
4135       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4136       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4137       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4138         return REG_ESPACE;
4139     }
4140   /* Already zero-ed by the caller.
4141      else
4142        mctx->bkref_ents = NULL;
4143      mctx->nbkref_ents = 0;
4144      mctx->nsub_tops = 0;  */
4145   mctx->abkref_ents = n;
4146   mctx->max_mb_elem_len = 1;
4147   mctx->asub_tops = n;
4148   return REG_NOERROR;
4149 }
4150
4151 /* Clean the entries which depend on the current input in MCTX.
4152    This function must be invoked when the matcher changes the start index
4153    of the input, or changes the input string.  */
4154
4155 static void
4156 internal_function
4157 match_ctx_clean (re_match_context_t *mctx)
4158 {
4159   int st_idx;
4160   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4161     {
4162       int sl_idx;
4163       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4164       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4165         {
4166           re_sub_match_last_t *last = top->lasts[sl_idx];
4167           re_free (last->path.array);
4168           re_free (last);
4169         }
4170       re_free (top->lasts);
4171       if (top->path)
4172         {
4173           re_free (top->path->array);
4174           re_free (top->path);
4175         }
4176       free (top);
4177     }
4178
4179   mctx->nsub_tops = 0;
4180   mctx->nbkref_ents = 0;
4181 }
4182
4183 /* Free all the memory associated with MCTX.  */
4184
4185 static void
4186 internal_function
4187 match_ctx_free (re_match_context_t *mctx)
4188 {
4189   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4190   match_ctx_clean (mctx);
4191   re_free (mctx->sub_tops);
4192   re_free (mctx->bkref_ents);
4193 }
4194
4195 /* Add a new backreference entry to MCTX.
4196    Note that we assume that caller never call this function with duplicate
4197    entry, and call with STR_IDX which isn't smaller than any existing entry.
4198 */
4199
4200 static reg_errcode_t
4201 internal_function
4202 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4203                      int to)
4204 {
4205   if (mctx->nbkref_ents >= mctx->abkref_ents)
4206     {
4207       struct re_backref_cache_entry* new_entry;
4208       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4209                               mctx->abkref_ents * 2);
4210       if (BE (new_entry == NULL, 0))
4211         {
4212           re_free (mctx->bkref_ents);
4213           return REG_ESPACE;
4214         }
4215       mctx->bkref_ents = new_entry;
4216       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4217               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4218       mctx->abkref_ents *= 2;
4219     }
4220   if (mctx->nbkref_ents > 0
4221       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4222     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4223
4224   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4225   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4226   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4227   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4228
4229   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4230      If bit N is clear, means that this entry won't epsilon-transition to
4231      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4232      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4233      such node.
4234
4235      A backreference does not epsilon-transition unless it is empty, so set
4236      to all zeros if FROM != TO.  */
4237   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4238     = (from == to ? ~0 : 0);
4239
4240   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4241   if (mctx->max_mb_elem_len < to - from)
4242     mctx->max_mb_elem_len = to - from;
4243   return REG_NOERROR;
4244 }
4245
4246 /* Search for the first entry which has the same str_idx, or -1 if none is
4247    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4248
4249 static int
4250 internal_function
4251 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4252 {
4253   int left, right, mid, last;
4254   last = right = mctx->nbkref_ents;
4255   for (left = 0; left < right;)
4256     {
4257       mid = (left + right) / 2;
4258       if (mctx->bkref_ents[mid].str_idx < str_idx)
4259         left = mid + 1;
4260       else
4261         right = mid;
4262     }
4263   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4264     return left;
4265   else
4266     return -1;
4267 }
4268
4269 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4270    at STR_IDX.  */
4271
4272 static reg_errcode_t
4273 internal_function
4274 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4275 {
4276 #ifdef DEBUG
4277   assert (mctx->sub_tops != NULL);
4278   assert (mctx->asub_tops > 0);
4279 #endif
4280   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4281     {
4282       int new_asub_tops = mctx->asub_tops * 2;
4283       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4284                                                    re_sub_match_top_t *,
4285                                                    new_asub_tops);
4286       if (BE (new_array == NULL, 0))
4287         return REG_ESPACE;
4288       mctx->sub_tops = new_array;
4289       mctx->asub_tops = new_asub_tops;
4290     }
4291   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4292   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4293     return REG_ESPACE;
4294   mctx->sub_tops[mctx->nsub_tops]->node = node;
4295   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4296   return REG_NOERROR;
4297 }
4298
4299 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4300    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4301
4302 static re_sub_match_last_t *
4303 internal_function
4304 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4305 {
4306   re_sub_match_last_t *new_entry;
4307   if (BE (subtop->nlasts == subtop->alasts, 0))
4308     {
4309       int new_alasts = 2 * subtop->alasts + 1;
4310       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4311                                                     re_sub_match_last_t *,
4312                                                     new_alasts);
4313       if (BE (new_array == NULL, 0))
4314         return NULL;
4315       subtop->lasts = new_array;
4316       subtop->alasts = new_alasts;
4317     }
4318   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4319   if (BE (new_entry != NULL, 1))
4320     {
4321       subtop->lasts[subtop->nlasts] = new_entry;
4322       new_entry->node = node;
4323       new_entry->str_idx = str_idx;
4324       ++subtop->nlasts;
4325     }
4326   return new_entry;
4327 }
4328
4329 static void
4330 internal_function
4331 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4332                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4333 {
4334   sctx->sifted_states = sifted_sts;
4335   sctx->limited_states = limited_sts;
4336   sctx->last_node = last_node;
4337   sctx->last_str_idx = last_str_idx;
4338   re_node_set_init_empty (&sctx->limits);
4339 }