root/maint/gnulib/lib/regcomp.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. re_compile_pattern
  2. weak_alias
  3. weak_alias
  4. weak_alias
  5. re_compile_fastmap_iter
  6. regcomp
  7. libc_hidden_def
  8. free_dfa_content
  9. regfree
  10. libc_hidden_def
  11. libc_freeres_fn
  12. re_compile_internal
  13. init_dfa
  14. init_word_char
  15. free_workarea_compile
  16. create_initial_state
  17. optimize_utf8
  18. analyze
  19. postorder
  20. preorder
  21. optimize_subexps
  22. lower_subexps
  23. lower_subexp
  24. calc_first
  25. calc_next
  26. link_nfa_nodes
  27. duplicate_node_closure
  28. search_duplicated_node
  29. duplicate_node
  30. calc_inveclosure
  31. calc_eclosure
  32. calc_eclosure_iter
  33. fetch_token
  34. peek_token
  35. peek_token_bracket
  36. parse
  37. parse_reg_exp
  38. parse_branch
  39. parse_expression
  40. parse_sub_exp
  41. parse_dup_op
  42. parse_byte
  43. build_range_exp
  44. build_collating_symbol
  45. seek_collating_symbol_entry
  46. lookup_collation_sequence_value
  47. build_range_exp
  48. build_collating_symbol
  49. parse_bracket_exp
  50. parse_bracket_element
  51. parse_bracket_symbol
  52. build_equiv_class
  53. build_charclass
  54. build_charclass_op
  55. fetch_number
  56. free_charset
  57. create_tree
  58. create_token_tree
  59. mark_opt_subexp
  60. free_token
  61. free_tree
  62. duplicate_tree

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5 
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10 
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15 
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19 
  20 #ifdef _LIBC
  21 # include <locale/weight.h>
  22 #endif
  23 
  24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  25                                           size_t length, reg_syntax_t syntax);
  26 static void re_compile_fastmap_iter (regex_t *bufp,
  27                                      const re_dfastate_t *init_state,
  28                                      char *fastmap);
  29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  30 static void free_charset (re_charset_t *cset);
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax);
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  89                                         re_charset_t *mbcset,
  90                                         Idx *equiv_class_alloc,
  91                                         const unsigned char *name);
  92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  93                                       bitset_t sbcset,
  94                                       re_charset_t *mbcset,
  95                                       Idx *char_class_alloc,
  96                                       const char *class_name,
  97                                       reg_syntax_t syntax);
  98 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
  99                                        RE_TRANSLATE_TYPE trans,
 100                                        const char *class_name,
 101                                        const char *extra,
 102                                        bool non_match, reg_errcode_t *err);
 103 static bin_tree_t *create_tree (re_dfa_t *dfa,
 104                                 bin_tree_t *left, bin_tree_t *right,
 105                                 re_token_type_t type);
 106 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 107                                       bin_tree_t *left, bin_tree_t *right,
 108                                       const re_token_t *token);
 109 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 110 static void free_token (re_token_t *node);
 111 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 112 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 113 
 114 /* This table gives an error message for each of the error codes listed
 115    in regex.h.  Obviously the order here has to be same as there.
 116    POSIX doesn't require that we do anything for REG_NOERROR,
 117    but why not be nice?  */
 118 
 119 static const char __re_error_msgid[] =
 120   {
 121 #define REG_NOERROR_IDX 0
 122     gettext_noop ("Success")    /* REG_NOERROR */
 123     "\0"
 124 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 125     gettext_noop ("No match")   /* REG_NOMATCH */
 126     "\0"
 127 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 128     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 129     "\0"
 130 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 131     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 132     "\0"
 133 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 134     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 135     "\0"
 136 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 137     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 138     "\0"
 139 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 140     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 141     "\0"
 142 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 143     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
 144     "\0"
 145 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
 146     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 147     "\0"
 148 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 149     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 150     "\0"
 151 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 152     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 153     "\0"
 154 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 155     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 156     "\0"
 157 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 158     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 159     "\0"
 160 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 161     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 162     "\0"
 163 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 164     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 165     "\0"
 166 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 167     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 168     "\0"
 169 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 170     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 171   };
 172 
 173 static const size_t __re_error_msgid_idx[] =
 174   {
 175     REG_NOERROR_IDX,
 176     REG_NOMATCH_IDX,
 177     REG_BADPAT_IDX,
 178     REG_ECOLLATE_IDX,
 179     REG_ECTYPE_IDX,
 180     REG_EESCAPE_IDX,
 181     REG_ESUBREG_IDX,
 182     REG_EBRACK_IDX,
 183     REG_EPAREN_IDX,
 184     REG_EBRACE_IDX,
 185     REG_BADBR_IDX,
 186     REG_ERANGE_IDX,
 187     REG_ESPACE_IDX,
 188     REG_BADRPT_IDX,
 189     REG_EEND_IDX,
 190     REG_ESIZE_IDX,
 191     REG_ERPAREN_IDX
 192   };
 193 
 194 /* Entry points for GNU code.  */
 195 
 196 /* re_compile_pattern is the GNU regular expression compiler: it
 197    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 198    Returns 0 if the pattern was valid, otherwise an error string.
 199 
 200    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 201    are set in BUFP on entry.  */
 202 
 203 const char *
 204 re_compile_pattern (const char *pattern, size_t length,
     /* [previous][next][first][last][top][bottom][index][help] */
 205                     struct re_pattern_buffer *bufp)
 206 {
 207   reg_errcode_t ret;
 208 
 209   /* And GNU code determines whether or not to get register information
 210      by passing null for the REGS argument to re_match, etc., not by
 211      setting no_sub, unless RE_NO_SUB is set.  */
 212   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 213 
 214   /* Match anchors at newline.  */
 215   bufp->newline_anchor = 1;
 216 
 217   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 218 
 219   if (!ret)
 220     return NULL;
 221   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 222 }
 223 weak_alias (__re_compile_pattern, re_compile_pattern)
     /* [previous][next][first][last][top][bottom][index][help] */
 224 
 225 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 226    also be assigned to arbitrarily: each pattern buffer stores its own
 227    syntax, so it can be changed between regex compilations.  */
 228 /* This has no initializer because initialized variables in Emacs
 229    become read-only after dumping.  */
 230 reg_syntax_t re_syntax_options;
 231 
 232 
 233 /* Specify the precise syntax of regexps for compilation.  This provides
 234    for compatibility for various utilities which historically have
 235    different, incompatible syntaxes.
 236 
 237    The argument SYNTAX is a bit mask comprised of the various bits
 238    defined in regex.h.  We return the old syntax.  */
 239 
 240 reg_syntax_t
 241 re_set_syntax (reg_syntax_t syntax)
 242 {
 243   reg_syntax_t ret = re_syntax_options;
 244 
 245   re_syntax_options = syntax;
 246   return ret;
 247 }
 248 weak_alias (__re_set_syntax, re_set_syntax)
     /* [previous][next][first][last][top][bottom][index][help] */
 249 
 250 int
 251 re_compile_fastmap (struct re_pattern_buffer *bufp)
 252 {
 253   re_dfa_t *dfa = bufp->buffer;
 254   char *fastmap = bufp->fastmap;
 255 
 256   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 257   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 258   if (dfa->init_state != dfa->init_state_word)
 259     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 260   if (dfa->init_state != dfa->init_state_nl)
 261     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 262   if (dfa->init_state != dfa->init_state_begbuf)
 263     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 264   bufp->fastmap_accurate = 1;
 265   return 0;
 266 }
 267 weak_alias (__re_compile_fastmap, re_compile_fastmap)
     /* [previous][next][first][last][top][bottom][index][help] */
 268 
 269 static __always_inline void
 270 re_set_fastmap (char *fastmap, bool icase, int ch)
 271 {
 272   fastmap[ch] = 1;
 273   if (icase)
 274     fastmap[tolower (ch)] = 1;
 275 }
 276 
 277 /* Helper function for re_compile_fastmap.
 278    Compile fastmap for the initial_state INIT_STATE.  */
 279 
 280 static void
 281 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
     /* [previous][next][first][last][top][bottom][index][help] */
 282                          char *fastmap)
 283 {
 284   re_dfa_t *dfa = bufp->buffer;
 285   Idx node_cnt;
 286   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 287   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 288     {
 289       Idx node = init_state->nodes.elems[node_cnt];
 290       re_token_type_t type = dfa->nodes[node].type;
 291 
 292       if (type == CHARACTER)
 293         {
 294           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 295           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 296             {
 297               unsigned char buf[MB_LEN_MAX];
 298               unsigned char *p;
 299               wchar_t wc;
 300               mbstate_t state;
 301 
 302               p = buf;
 303               *p++ = dfa->nodes[node].opr.c;
 304               while (++node < dfa->nodes_len
 305                      && dfa->nodes[node].type == CHARACTER
 306                      && dfa->nodes[node].mb_partial)
 307                 *p++ = dfa->nodes[node].opr.c;
 308               memset (&state, '\0', sizeof (state));
 309               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 310                              &state) == p - buf
 311                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 312                       != (size_t) -1))
 313                 re_set_fastmap (fastmap, false, buf[0]);
 314             }
 315         }
 316       else if (type == SIMPLE_BRACKET)
 317         {
 318           int i, ch;
 319           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 320             {
 321               int j;
 322               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 323               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 324                 if (w & ((bitset_word_t) 1 << j))
 325                   re_set_fastmap (fastmap, icase, ch);
 326             }
 327         }
 328       else if (type == COMPLEX_BRACKET)
 329         {
 330           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 331           Idx i;
 332 
 333 #ifdef _LIBC
 334           /* See if we have to try all bytes which start multiple collation
 335              elements.
 336              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 337                   collation element, and don't catch 'b' since 'b' is
 338                   the only collation element which starts from 'b' (and
 339                   it is caught by SIMPLE_BRACKET).  */
 340               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 341                   && (cset->ncoll_syms || cset->nranges))
 342                 {
 343                   const int32_t *table = (const int32_t *)
 344                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 345                   for (i = 0; i < SBC_MAX; ++i)
 346                     if (table[i] < 0)
 347                       re_set_fastmap (fastmap, icase, i);
 348                 }
 349 #endif /* _LIBC */
 350 
 351           /* See if we have to start the match at all multibyte characters,
 352              i.e. where we would not find an invalid sequence.  This only
 353              applies to multibyte character sets; for single byte character
 354              sets, the SIMPLE_BRACKET again suffices.  */
 355           if (dfa->mb_cur_max > 1
 356               && (cset->nchar_classes || cset->non_match || cset->nranges
 357 #ifdef _LIBC
 358                   || cset->nequiv_classes
 359 #endif /* _LIBC */
 360                  ))
 361             {
 362               unsigned char c = 0;
 363               do
 364                 {
 365                   mbstate_t mbs;
 366                   memset (&mbs, 0, sizeof (mbs));
 367                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 368                     re_set_fastmap (fastmap, false, (int) c);
 369                 }
 370               while (++c != 0);
 371             }
 372 
 373           else
 374             {
 375               /* ... Else catch all bytes which can start the mbchars.  */
 376               for (i = 0; i < cset->nmbchars; ++i)
 377                 {
 378                   char buf[256];
 379                   mbstate_t state;
 380                   memset (&state, '\0', sizeof (state));
 381                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 382                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 383                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 384                     {
 385                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 386                           != (size_t) -1)
 387                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 388                     }
 389                 }
 390             }
 391         }
 392       else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
 393         {
 394           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 395           if (type == END_OF_RE)
 396             bufp->can_be_null = 1;
 397           return;
 398         }
 399     }
 400 }
 401 
 402 /* Entry point for POSIX code.  */
 403 /* regcomp takes a regular expression as a string and compiles it.
 404 
 405    PREG is a regex_t *.  We do not expect any fields to be initialized,
 406    since POSIX says we shouldn't.  Thus, we set
 407 
 408      'buffer' to the compiled pattern;
 409      'used' to the length of the compiled pattern;
 410      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 411        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 412        RE_SYNTAX_POSIX_BASIC;
 413      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 414      'fastmap' to an allocated space for the fastmap;
 415      'fastmap_accurate' to zero;
 416      're_nsub' to the number of subexpressions in PATTERN.
 417 
 418    PATTERN is the address of the pattern string.
 419 
 420    CFLAGS is a series of bits which affect compilation.
 421 
 422      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 423      use POSIX basic syntax.
 424 
 425      If REG_NEWLINE is set, then . and [^...] don't match newline.
 426      Also, regexec will try a match beginning after every newline.
 427 
 428      If REG_ICASE is set, then we considers upper- and lowercase
 429      versions of letters to be equivalent when matching.
 430 
 431      If REG_NOSUB is set, then when PREG is passed to regexec, that
 432      routine will report only success or failure, and nothing about the
 433      registers.
 434 
 435    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 436    the return codes and their meanings.)  */
 437 
 438 int
 439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
     /* [previous][next][first][last][top][bottom][index][help] */
 440 {
 441   reg_errcode_t ret;
 442   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 443                          : RE_SYNTAX_POSIX_BASIC);
 444 
 445   preg->buffer = NULL;
 446   preg->allocated = 0;
 447   preg->used = 0;
 448 
 449   /* Try to allocate space for the fastmap.  */
 450   preg->fastmap = re_malloc (char, SBC_MAX);
 451   if (__glibc_unlikely (preg->fastmap == NULL))
 452     return REG_ESPACE;
 453 
 454   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 455 
 456   /* If REG_NEWLINE is set, newlines are treated differently.  */
 457   if (cflags & REG_NEWLINE)
 458     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 459       syntax &= ~RE_DOT_NEWLINE;
 460       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 461       /* It also changes the matching behavior.  */
 462       preg->newline_anchor = 1;
 463     }
 464   else
 465     preg->newline_anchor = 0;
 466   preg->no_sub = !!(cflags & REG_NOSUB);
 467   preg->translate = NULL;
 468 
 469   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 470 
 471   /* POSIX doesn't distinguish between an unmatched open-group and an
 472      unmatched close-group: both are REG_EPAREN.  */
 473   if (ret == REG_ERPAREN)
 474     ret = REG_EPAREN;
 475 
 476   /* We have already checked preg->fastmap != NULL.  */
 477   if (__glibc_likely (ret == REG_NOERROR))
 478     /* Compute the fastmap now, since regexec cannot modify the pattern
 479        buffer.  This function never fails in this implementation.  */
 480     (void) re_compile_fastmap (preg);
 481   else
 482     {
 483       /* Some error occurred while compiling the expression.  */
 484       re_free (preg->fastmap);
 485       preg->fastmap = NULL;
 486     }
 487 
 488   return (int) ret;
 489 }
 490 libc_hidden_def (__regcomp)
     /* [previous][next][first][last][top][bottom][index][help] */
 491 weak_alias (__regcomp, regcomp)
 492 
 493 /* Returns a message corresponding to an error code, ERRCODE, returned
 494    from either regcomp or regexec.   We don't use PREG here.  */
 495 
 496 size_t
 497 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
 498           size_t errbuf_size)
 499 {
 500   const char *msg;
 501   size_t msg_size;
 502   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
 503 
 504   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
 505     /* Only error codes returned by the rest of the code should be passed
 506        to this routine.  If we are given anything else, or if other regex
 507        code generates an invalid error code, then the program has a bug.
 508        Dump core so we can fix it.  */
 509     abort ();
 510 
 511   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 512 
 513   msg_size = strlen (msg) + 1; /* Includes the null.  */
 514 
 515   if (__glibc_likely (errbuf_size != 0))
 516     {
 517       size_t cpy_size = msg_size;
 518       if (__glibc_unlikely (msg_size > errbuf_size))
 519         {
 520           cpy_size = errbuf_size - 1;
 521           errbuf[cpy_size] = '\0';
 522         }
 523       memcpy (errbuf, msg, cpy_size);
 524     }
 525 
 526   return msg_size;
 527 }
 528 weak_alias (__regerror, regerror)
 529 
 530 
 531 /* This static array is used for the map to single-byte characters when
 532    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 533    it the same all the time.  UTF-8 is the preferred encoding so this is
 534    a worthwhile optimization.  */
 535 static const bitset_t utf8_sb_map =
 536 {
 537   /* Set the first 128 bits.  */
 538 #if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
 539   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 540 #else
 541 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 542 #  error "bitset_word_t is narrower than 32 bits"
 543 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 544   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 545 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 546   BITSET_WORD_MAX, BITSET_WORD_MAX,
 547 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 548   BITSET_WORD_MAX,
 549 # endif
 550   (BITSET_WORD_MAX
 551    >> (SBC_MAX % BITSET_WORD_BITS == 0
 552        ? 0
 553        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 554 #endif
 555 };
 556 
 557 
 558 static void
 559 free_dfa_content (re_dfa_t *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
 560 {
 561   Idx i, j;
 562 
 563   if (dfa->nodes)
 564     for (i = 0; i < dfa->nodes_len; ++i)
 565       free_token (dfa->nodes + i);
 566   re_free (dfa->nexts);
 567   for (i = 0; i < dfa->nodes_len; ++i)
 568     {
 569       if (dfa->eclosures != NULL)
 570         re_node_set_free (dfa->eclosures + i);
 571       if (dfa->inveclosures != NULL)
 572         re_node_set_free (dfa->inveclosures + i);
 573       if (dfa->edests != NULL)
 574         re_node_set_free (dfa->edests + i);
 575     }
 576   re_free (dfa->edests);
 577   re_free (dfa->eclosures);
 578   re_free (dfa->inveclosures);
 579   re_free (dfa->nodes);
 580 
 581   if (dfa->state_table)
 582     for (i = 0; i <= dfa->state_hash_mask; ++i)
 583       {
 584         struct re_state_table_entry *entry = dfa->state_table + i;
 585         for (j = 0; j < entry->num; ++j)
 586           {
 587             re_dfastate_t *state = entry->array[j];
 588             free_state (state);
 589           }
 590         re_free (entry->array);
 591       }
 592   re_free (dfa->state_table);
 593   if (dfa->sb_char != utf8_sb_map)
 594     re_free (dfa->sb_char);
 595   re_free (dfa->subexp_map);
 596 #ifdef DEBUG
 597   re_free (dfa->re_str);
 598 #endif
 599 
 600   re_free (dfa);
 601 }
 602 
 603 
 604 /* Free dynamically allocated space used by PREG.  */
 605 
 606 void
 607 regfree (regex_t *preg)
     /* [previous][next][first][last][top][bottom][index][help] */
 608 {
 609   re_dfa_t *dfa = preg->buffer;
 610   if (__glibc_likely (dfa != NULL))
 611     {
 612       lock_fini (dfa->lock);
 613       free_dfa_content (dfa);
 614     }
 615   preg->buffer = NULL;
 616   preg->allocated = 0;
 617 
 618   re_free (preg->fastmap);
 619   preg->fastmap = NULL;
 620 
 621   re_free (preg->translate);
 622   preg->translate = NULL;
 623 }
 624 libc_hidden_def (__regfree)
     /* [previous][next][first][last][top][bottom][index][help] */
 625 weak_alias (__regfree, regfree)
 626 
 627 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 628    them unless specifically requested.  */
 629 
 630 #if defined _REGEX_RE_COMP || defined _LIBC
 631 
 632 /* BSD has one and only one pattern buffer.  */
 633 static struct re_pattern_buffer re_comp_buf;
 634 
 635 char *
 636 # ifdef _LIBC
 637 /* Make these definitions weak in libc, so POSIX programs can redefine
 638    these names if they don't use our functions, and still use
 639    regcomp/regexec above without link errors.  */
 640 weak_function
 641 # endif
 642 re_comp (const char *s)
 643 {
 644   reg_errcode_t ret;
 645   char *fastmap;
 646 
 647   if (!s)
 648     {
 649       if (!re_comp_buf.buffer)
 650         return gettext ("No previous regular expression");
 651       return 0;
 652     }
 653 
 654   if (re_comp_buf.buffer)
 655     {
 656       fastmap = re_comp_buf.fastmap;
 657       re_comp_buf.fastmap = NULL;
 658       __regfree (&re_comp_buf);
 659       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 660       re_comp_buf.fastmap = fastmap;
 661     }
 662 
 663   if (re_comp_buf.fastmap == NULL)
 664     {
 665       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
 666       if (re_comp_buf.fastmap == NULL)
 667         return (char *) gettext (__re_error_msgid
 668                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 669     }
 670 
 671   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 672      don't need to initialize the pattern buffer fields which affect it.  */
 673 
 674   /* Match anchors at newlines.  */
 675   re_comp_buf.newline_anchor = 1;
 676 
 677   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 678 
 679   if (!ret)
 680     return NULL;
 681 
 682   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 683   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 684 }
 685 
 686 #ifdef _LIBC
 687 libc_freeres_fn (free_mem)
     /* [previous][next][first][last][top][bottom][index][help] */
 688 {
 689   __regfree (&re_comp_buf);
 690 }
 691 #endif
 692 
 693 #endif /* _REGEX_RE_COMP */
 694 
 695 /* Internal entry point.
 696    Compile the regular expression PATTERN, whose length is LENGTH.
 697    SYNTAX indicate regular expression's syntax.  */
 698 
 699 static reg_errcode_t
 700 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
     /* [previous][next][first][last][top][bottom][index][help] */
 701                      reg_syntax_t syntax)
 702 {
 703   reg_errcode_t err = REG_NOERROR;
 704   re_dfa_t *dfa;
 705   re_string_t regexp;
 706 
 707   /* Initialize the pattern buffer.  */
 708   preg->fastmap_accurate = 0;
 709   preg->syntax = syntax;
 710   preg->not_bol = preg->not_eol = 0;
 711   preg->used = 0;
 712   preg->re_nsub = 0;
 713   preg->can_be_null = 0;
 714   preg->regs_allocated = REGS_UNALLOCATED;
 715 
 716   /* Initialize the dfa.  */
 717   dfa = preg->buffer;
 718   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
 719     {
 720       /* If zero allocated, but buffer is non-null, try to realloc
 721          enough space.  This loses if buffer's address is bogus, but
 722          that is the user's responsibility.  If ->buffer is NULL this
 723          is a simple allocation.  */
 724       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 725       if (dfa == NULL)
 726         return REG_ESPACE;
 727       preg->allocated = sizeof (re_dfa_t);
 728       preg->buffer = dfa;
 729     }
 730   preg->used = sizeof (re_dfa_t);
 731 
 732   err = init_dfa (dfa, length);
 733   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
 734     err = REG_ESPACE;
 735   if (__glibc_unlikely (err != REG_NOERROR))
 736     {
 737       free_dfa_content (dfa);
 738       preg->buffer = NULL;
 739       preg->allocated = 0;
 740       return err;
 741     }
 742 #ifdef DEBUG
 743   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 744   dfa->re_str = re_malloc (char, length + 1);
 745   strncpy (dfa->re_str, pattern, length + 1);
 746 #endif
 747 
 748   err = re_string_construct (&regexp, pattern, length, preg->translate,
 749                              (syntax & RE_ICASE) != 0, dfa);
 750   if (__glibc_unlikely (err != REG_NOERROR))
 751     {
 752     re_compile_internal_free_return:
 753       free_workarea_compile (preg);
 754       re_string_destruct (&regexp);
 755       lock_fini (dfa->lock);
 756       free_dfa_content (dfa);
 757       preg->buffer = NULL;
 758       preg->allocated = 0;
 759       return err;
 760     }
 761 
 762   /* Parse the regular expression, and build a structure tree.  */
 763   preg->re_nsub = 0;
 764   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 765   if (__glibc_unlikely (dfa->str_tree == NULL))
 766     goto re_compile_internal_free_return;
 767 
 768   /* Analyze the tree and create the nfa.  */
 769   err = analyze (preg);
 770   if (__glibc_unlikely (err != REG_NOERROR))
 771     goto re_compile_internal_free_return;
 772 
 773   /* If possible, do searching in single byte encoding to speed things up.  */
 774   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 775     optimize_utf8 (dfa);
 776 
 777   /* Then create the initial state of the dfa.  */
 778   err = create_initial_state (dfa);
 779 
 780   /* Release work areas.  */
 781   free_workarea_compile (preg);
 782   re_string_destruct (&regexp);
 783 
 784   if (__glibc_unlikely (err != REG_NOERROR))
 785     {
 786       lock_fini (dfa->lock);
 787       free_dfa_content (dfa);
 788       preg->buffer = NULL;
 789       preg->allocated = 0;
 790     }
 791 
 792   return err;
 793 }
 794 
 795 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 796    as the initial length of some arrays.  */
 797 
 798 static reg_errcode_t
 799 init_dfa (re_dfa_t *dfa, size_t pat_len)
     /* [previous][next][first][last][top][bottom][index][help] */
 800 {
 801   __re_size_t table_size;
 802 #ifndef _LIBC
 803   const char *codeset_name;
 804 #endif
 805   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 806   size_t max_object_size =
 807     MAX (sizeof (struct re_state_table_entry),
 808          MAX (sizeof (re_token_t),
 809               MAX (sizeof (re_node_set),
 810                    MAX (sizeof (regmatch_t),
 811                         max_i18n_object_size))));
 812 
 813   memset (dfa, '\0', sizeof (re_dfa_t));
 814 
 815   /* Force allocation of str_tree_storage the first time.  */
 816   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 817 
 818   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 819      calculation below, and for similar doubling calculations
 820      elsewhere.  And it's <= rather than <, because some of the
 821      doubling calculations add 1 afterwards.  */
 822   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
 823                         <= pat_len))
 824     return REG_ESPACE;
 825 
 826   dfa->nodes_alloc = pat_len + 1;
 827   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 828 
 829   /*  table_size = 2 ^ ceil(log pat_len) */
 830   for (table_size = 1; ; table_size <<= 1)
 831     if (table_size > pat_len)
 832       break;
 833 
 834   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 835   dfa->state_hash_mask = table_size - 1;
 836 
 837   dfa->mb_cur_max = MB_CUR_MAX;
 838 #ifdef _LIBC
 839   if (dfa->mb_cur_max == 6
 840       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 841     dfa->is_utf8 = 1;
 842   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 843                        != 0);
 844 #else
 845   codeset_name = nl_langinfo (CODESET);
 846   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 847       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 848       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 849       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 850     dfa->is_utf8 = 1;
 851 
 852   /* We check exhaustively in the loop below if this charset is a
 853      superset of ASCII.  */
 854   dfa->map_notascii = 0;
 855 #endif
 856 
 857   if (dfa->mb_cur_max > 1)
 858     {
 859       if (dfa->is_utf8)
 860         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 861       else
 862         {
 863           int i, j, ch;
 864 
 865           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 866           if (__glibc_unlikely (dfa->sb_char == NULL))
 867             return REG_ESPACE;
 868 
 869           /* Set the bits corresponding to single byte chars.  */
 870           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 871             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 872               {
 873                 wint_t wch = __btowc (ch);
 874                 if (wch != WEOF)
 875                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 876 #ifndef _LIBC
 877                 if (isascii (ch) && wch != ch)
 878                   dfa->map_notascii = 1;
 879 #endif
 880               }
 881         }
 882     }
 883 
 884   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
 885     return REG_ESPACE;
 886   return REG_NOERROR;
 887 }
 888 
 889 /* Initialize WORD_CHAR table, which indicate which character is
 890    "word".  In this case "word" means that it is the word construction
 891    character used by some operators like "\<", "\>", etc.  */
 892 
 893 static void
 894 init_word_char (re_dfa_t *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
 895 {
 896   int i = 0;
 897   int j;
 898   int ch = 0;
 899   dfa->word_ops_used = 1;
 900   if (__glibc_likely (dfa->map_notascii == 0))
 901     {
 902       bitset_word_t bits0 = 0x00000000;
 903       bitset_word_t bits1 = 0x03ff0000;
 904       bitset_word_t bits2 = 0x87fffffe;
 905       bitset_word_t bits3 = 0x07fffffe;
 906       if (BITSET_WORD_BITS == 64)
 907         {
 908           /* Pacify gcc -Woverflow on 32-bit platformns.  */
 909           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 910           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 911           i = 2;
 912         }
 913       else if (BITSET_WORD_BITS == 32)
 914         {
 915           dfa->word_char[0] = bits0;
 916           dfa->word_char[1] = bits1;
 917           dfa->word_char[2] = bits2;
 918           dfa->word_char[3] = bits3;
 919           i = 4;
 920         }
 921       else
 922         goto general_case;
 923       ch = 128;
 924 
 925       if (__glibc_likely (dfa->is_utf8))
 926         {
 927           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 928           return;
 929         }
 930     }
 931 
 932  general_case:
 933   for (; i < BITSET_WORDS; ++i)
 934     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 935       if (isalnum (ch) || ch == '_')
 936         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 937 }
 938 
 939 /* Free the work area which are only used while compiling.  */
 940 
 941 static void
 942 free_workarea_compile (regex_t *preg)
     /* [previous][next][first][last][top][bottom][index][help] */
 943 {
 944   re_dfa_t *dfa = preg->buffer;
 945   bin_tree_storage_t *storage, *next;
 946   for (storage = dfa->str_tree_storage; storage; storage = next)
 947     {
 948       next = storage->next;
 949       re_free (storage);
 950     }
 951   dfa->str_tree_storage = NULL;
 952   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 953   dfa->str_tree = NULL;
 954   re_free (dfa->org_indices);
 955   dfa->org_indices = NULL;
 956 }
 957 
 958 /* Create initial states for all contexts.  */
 959 
 960 static reg_errcode_t
 961 create_initial_state (re_dfa_t *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
 962 {
 963   Idx first, i;
 964   reg_errcode_t err;
 965   re_node_set init_nodes;
 966 
 967   /* Initial states have the epsilon closure of the node which is
 968      the first node of the regular expression.  */
 969   first = dfa->str_tree->first->node_idx;
 970   dfa->init_node = first;
 971   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 972   if (__glibc_unlikely (err != REG_NOERROR))
 973     return err;
 974 
 975   /* The back-references which are in initial states can epsilon transit,
 976      since in this case all of the subexpressions can be null.
 977      Then we add epsilon closures of the nodes which are the next nodes of
 978      the back-references.  */
 979   if (dfa->nbackref > 0)
 980     for (i = 0; i < init_nodes.nelem; ++i)
 981       {
 982         Idx node_idx = init_nodes.elems[i];
 983         re_token_type_t type = dfa->nodes[node_idx].type;
 984 
 985         Idx clexp_idx;
 986         if (type != OP_BACK_REF)
 987           continue;
 988         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 989           {
 990             re_token_t *clexp_node;
 991             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 992             if (clexp_node->type == OP_CLOSE_SUBEXP
 993                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 994               break;
 995           }
 996         if (clexp_idx == init_nodes.nelem)
 997           continue;
 998 
 999         if (type == OP_BACK_REF)
1000           {
1001             Idx dest_idx = dfa->edests[node_idx].elems[0];
1002             if (!re_node_set_contains (&init_nodes, dest_idx))
1003               {
1004                 reg_errcode_t merge_err
1005                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1006                 if (merge_err != REG_NOERROR)
1007                   return merge_err;
1008                 i = 0;
1009               }
1010           }
1011       }
1012 
1013   /* It must be the first time to invoke acquire_state.  */
1014   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1015   /* We don't check ERR here, since the initial state must not be NULL.  */
1016   if (__glibc_unlikely (dfa->init_state == NULL))
1017     return err;
1018   if (dfa->init_state->has_constraint)
1019     {
1020       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1021                                                        CONTEXT_WORD);
1022       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1023                                                      CONTEXT_NEWLINE);
1024       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1025                                                          &init_nodes,
1026                                                          CONTEXT_NEWLINE
1027                                                          | CONTEXT_BEGBUF);
1028       if (__glibc_unlikely (dfa->init_state_word == NULL
1029                             || dfa->init_state_nl == NULL
1030                             || dfa->init_state_begbuf == NULL))
1031         return err;
1032     }
1033   else
1034     dfa->init_state_word = dfa->init_state_nl
1035       = dfa->init_state_begbuf = dfa->init_state;
1036 
1037   re_node_set_free (&init_nodes);
1038   return REG_NOERROR;
1039 }
1040 
1041 /* If it is possible to do searching in single byte encoding instead of UTF-8
1042    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1043    DFA nodes where needed.  */
1044 
1045 static void
1046 optimize_utf8 (re_dfa_t *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
1047 {
1048   Idx node;
1049   int i;
1050   bool mb_chars = false;
1051   bool has_period = false;
1052 
1053   for (node = 0; node < dfa->nodes_len; ++node)
1054     switch (dfa->nodes[node].type)
1055       {
1056       case CHARACTER:
1057         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1058           mb_chars = true;
1059         break;
1060       case ANCHOR:
1061         switch (dfa->nodes[node].opr.ctx_type)
1062           {
1063           case LINE_FIRST:
1064           case LINE_LAST:
1065           case BUF_FIRST:
1066           case BUF_LAST:
1067             break;
1068           default:
1069             /* Word anchors etc. cannot be handled.  It's okay to test
1070                opr.ctx_type since constraints (for all DFA nodes) are
1071                created by ORing one or more opr.ctx_type values.  */
1072             return;
1073           }
1074         break;
1075       case OP_PERIOD:
1076         has_period = true;
1077         break;
1078       case OP_BACK_REF:
1079       case OP_ALT:
1080       case END_OF_RE:
1081       case OP_DUP_ASTERISK:
1082       case OP_OPEN_SUBEXP:
1083       case OP_CLOSE_SUBEXP:
1084         break;
1085       case COMPLEX_BRACKET:
1086         return;
1087       case SIMPLE_BRACKET:
1088         /* Just double check.  */
1089         {
1090           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1091                         ? 0
1092                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1093           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1094             {
1095               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096                 return;
1097               rshift = 0;
1098             }
1099         }
1100         break;
1101       default:
1102         abort ();
1103       }
1104 
1105   if (mb_chars || has_period)
1106     for (node = 0; node < dfa->nodes_len; ++node)
1107       {
1108         if (dfa->nodes[node].type == CHARACTER
1109             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1110           dfa->nodes[node].mb_partial = 0;
1111         else if (dfa->nodes[node].type == OP_PERIOD)
1112           dfa->nodes[node].type = OP_UTF8_PERIOD;
1113       }
1114 
1115   /* The search can be in single byte locale.  */
1116   dfa->mb_cur_max = 1;
1117   dfa->is_utf8 = 0;
1118   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119 }
1120 
1121 /* Analyze the structure tree, and calculate "first", "next", "edest",
1122    "eclosure", and "inveclosure".  */
1123 
1124 static reg_errcode_t
1125 analyze (regex_t *preg)
     /* [previous][next][first][last][top][bottom][index][help] */
1126 {
1127   re_dfa_t *dfa = preg->buffer;
1128   reg_errcode_t ret;
1129 
1130   /* Allocate arrays.  */
1131   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1132   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1133   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1136                         || dfa->edests == NULL || dfa->eclosures == NULL))
1137     return REG_ESPACE;
1138 
1139   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1140   if (dfa->subexp_map != NULL)
1141     {
1142       Idx i;
1143       for (i = 0; i < preg->re_nsub; i++)
1144         dfa->subexp_map[i] = i;
1145       preorder (dfa->str_tree, optimize_subexps, dfa);
1146       for (i = 0; i < preg->re_nsub; i++)
1147         if (dfa->subexp_map[i] != i)
1148           break;
1149       if (i == preg->re_nsub)
1150         {
1151           re_free (dfa->subexp_map);
1152           dfa->subexp_map = NULL;
1153         }
1154     }
1155 
1156   ret = postorder (dfa->str_tree, lower_subexps, preg);
1157   if (__glibc_unlikely (ret != REG_NOERROR))
1158     return ret;
1159   ret = postorder (dfa->str_tree, calc_first, dfa);
1160   if (__glibc_unlikely (ret != REG_NOERROR))
1161     return ret;
1162   preorder (dfa->str_tree, calc_next, dfa);
1163   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164   if (__glibc_unlikely (ret != REG_NOERROR))
1165     return ret;
1166   ret = calc_eclosure (dfa);
1167   if (__glibc_unlikely (ret != REG_NOERROR))
1168     return ret;
1169 
1170   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1172   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173       || dfa->nbackref)
1174     {
1175       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176       if (__glibc_unlikely (dfa->inveclosures == NULL))
1177         return REG_ESPACE;
1178       ret = calc_inveclosure (dfa);
1179     }
1180 
1181   return ret;
1182 }
1183 
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185    implement parse tree visits.  Instead, we use parent pointers and
1186    some hairy code in these two functions.  */
1187 static reg_errcode_t
1188 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
     /* [previous][next][first][last][top][bottom][index][help] */
1189            void *extra)
1190 {
1191   bin_tree_t *node, *prev;
1192 
1193   for (node = root; ; )
1194     {
1195       /* Descend down the tree, preferably to the left (or to the right
1196          if that's the only child).  */
1197       while (node->left || node->right)
1198         if (node->left)
1199           node = node->left;
1200         else
1201           node = node->right;
1202 
1203       do
1204         {
1205           reg_errcode_t err = fn (extra, node);
1206           if (__glibc_unlikely (err != REG_NOERROR))
1207             return err;
1208           if (node->parent == NULL)
1209             return REG_NOERROR;
1210           prev = node;
1211           node = node->parent;
1212         }
1213       /* Go up while we have a node that is reached from the right.  */
1214       while (node->right == prev || node->right == NULL);
1215       node = node->right;
1216     }
1217 }
1218 
1219 static reg_errcode_t
1220 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
     /* [previous][next][first][last][top][bottom][index][help] */
1221           void *extra)
1222 {
1223   bin_tree_t *node;
1224 
1225   for (node = root; ; )
1226     {
1227       reg_errcode_t err = fn (extra, node);
1228       if (__glibc_unlikely (err != REG_NOERROR))
1229         return err;
1230 
1231       /* Go to the left node, or up and to the right.  */
1232       if (node->left)
1233         node = node->left;
1234       else
1235         {
1236           bin_tree_t *prev = NULL;
1237           while (node->right == prev || node->right == NULL)
1238             {
1239               prev = node;
1240               node = node->parent;
1241               if (!node)
1242                 return REG_NOERROR;
1243             }
1244           node = node->right;
1245         }
1246     }
1247 }
1248 
1249 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1250    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1251    backreferences as well.  Requires a preorder visit.  */
1252 static reg_errcode_t
1253 optimize_subexps (void *extra, bin_tree_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
1254 {
1255   re_dfa_t *dfa = (re_dfa_t *) extra;
1256 
1257   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1258     {
1259       int idx = node->token.opr.idx;
1260       node->token.opr.idx = dfa->subexp_map[idx];
1261       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1262     }
1263 
1264   else if (node->token.type == SUBEXP
1265            && node->left && node->left->token.type == SUBEXP)
1266     {
1267       Idx other_idx = node->left->token.opr.idx;
1268 
1269       node->left = node->left->left;
1270       if (node->left)
1271         node->left->parent = node;
1272 
1273       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1274       if (other_idx < BITSET_WORD_BITS)
1275         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1276     }
1277 
1278   return REG_NOERROR;
1279 }
1280 
1281 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1282    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1283 static reg_errcode_t
1284 lower_subexps (void *extra, bin_tree_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
1285 {
1286   regex_t *preg = (regex_t *) extra;
1287   reg_errcode_t err = REG_NOERROR;
1288 
1289   if (node->left && node->left->token.type == SUBEXP)
1290     {
1291       node->left = lower_subexp (&err, preg, node->left);
1292       if (node->left)
1293         node->left->parent = node;
1294     }
1295   if (node->right && node->right->token.type == SUBEXP)
1296     {
1297       node->right = lower_subexp (&err, preg, node->right);
1298       if (node->right)
1299         node->right->parent = node;
1300     }
1301 
1302   return err;
1303 }
1304 
1305 static bin_tree_t *
1306 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
1307 {
1308   re_dfa_t *dfa = preg->buffer;
1309   bin_tree_t *body = node->left;
1310   bin_tree_t *op, *cls, *tree1, *tree;
1311 
1312   if (preg->no_sub
1313       /* We do not optimize empty subexpressions, because otherwise we may
1314          have bad CONCAT nodes with NULL children.  This is obviously not
1315          very common, so we do not lose much.  An example that triggers
1316          this case is the sed "script" /\(\)/x.  */
1317       && node->left != NULL
1318       && (node->token.opr.idx >= BITSET_WORD_BITS
1319           || !(dfa->used_bkref_map
1320                & ((bitset_word_t) 1 << node->token.opr.idx))))
1321     return node->left;
1322 
1323   /* Convert the SUBEXP node to the concatenation of an
1324      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1325   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1326   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1327   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1328   tree = create_tree (dfa, op, tree1, CONCAT);
1329   if (__glibc_unlikely (tree == NULL || tree1 == NULL
1330                         || op == NULL || cls == NULL))
1331     {
1332       *err = REG_ESPACE;
1333       return NULL;
1334     }
1335 
1336   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1337   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1338   return tree;
1339 }
1340 
1341 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1342    nodes.  Requires a postorder visit.  */
1343 static reg_errcode_t
1344 calc_first (void *extra, bin_tree_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
1345 {
1346   re_dfa_t *dfa = (re_dfa_t *) extra;
1347   if (node->token.type == CONCAT)
1348     {
1349       node->first = node->left->first;
1350       node->node_idx = node->left->node_idx;
1351     }
1352   else
1353     {
1354       node->first = node;
1355       node->node_idx = re_dfa_add_node (dfa, node->token);
1356       if (__glibc_unlikely (node->node_idx == -1))
1357         return REG_ESPACE;
1358       if (node->token.type == ANCHOR)
1359         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1360     }
1361   return REG_NOERROR;
1362 }
1363 
1364 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1365 static reg_errcode_t
1366 calc_next (void *extra, bin_tree_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
1367 {
1368   switch (node->token.type)
1369     {
1370     case OP_DUP_ASTERISK:
1371       node->left->next = node;
1372       break;
1373     case CONCAT:
1374       node->left->next = node->right->first;
1375       node->right->next = node->next;
1376       break;
1377     default:
1378       if (node->left)
1379         node->left->next = node->next;
1380       if (node->right)
1381         node->right->next = node->next;
1382       break;
1383     }
1384   return REG_NOERROR;
1385 }
1386 
1387 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1388 static reg_errcode_t
1389 link_nfa_nodes (void *extra, bin_tree_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
1390 {
1391   re_dfa_t *dfa = (re_dfa_t *) extra;
1392   Idx idx = node->node_idx;
1393   reg_errcode_t err = REG_NOERROR;
1394 
1395   switch (node->token.type)
1396     {
1397     case CONCAT:
1398       break;
1399 
1400     case END_OF_RE:
1401       DEBUG_ASSERT (node->next == NULL);
1402       break;
1403 
1404     case OP_DUP_ASTERISK:
1405     case OP_ALT:
1406       {
1407         Idx left, right;
1408         dfa->has_plural_match = 1;
1409         if (node->left != NULL)
1410           left = node->left->first->node_idx;
1411         else
1412           left = node->next->node_idx;
1413         if (node->right != NULL)
1414           right = node->right->first->node_idx;
1415         else
1416           right = node->next->node_idx;
1417         DEBUG_ASSERT (left > -1);
1418         DEBUG_ASSERT (right > -1);
1419         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1420       }
1421       break;
1422 
1423     case ANCHOR:
1424     case OP_OPEN_SUBEXP:
1425     case OP_CLOSE_SUBEXP:
1426       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1427       break;
1428 
1429     case OP_BACK_REF:
1430       dfa->nexts[idx] = node->next->node_idx;
1431       if (node->token.type == OP_BACK_REF)
1432         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1433       break;
1434 
1435     default:
1436       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1437       dfa->nexts[idx] = node->next->node_idx;
1438       break;
1439     }
1440 
1441   return err;
1442 }
1443 
1444 /* Duplicate the epsilon closure of the node ROOT_NODE.
1445    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1446    to their own constraint.  */
1447 
1448 static reg_errcode_t
1449 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
     /* [previous][next][first][last][top][bottom][index][help] */
1450                         Idx root_node, unsigned int init_constraint)
1451 {
1452   Idx org_node, clone_node;
1453   bool ok;
1454   unsigned int constraint = init_constraint;
1455   for (org_node = top_org_node, clone_node = top_clone_node;;)
1456     {
1457       Idx org_dest, clone_dest;
1458       if (dfa->nodes[org_node].type == OP_BACK_REF)
1459         {
1460           /* If the back reference epsilon-transit, its destination must
1461              also have the constraint.  Then duplicate the epsilon closure
1462              of the destination of the back reference, and store it in
1463              edests of the back reference.  */
1464           org_dest = dfa->nexts[org_node];
1465           re_node_set_empty (dfa->edests + clone_node);
1466           clone_dest = duplicate_node (dfa, org_dest, constraint);
1467           if (__glibc_unlikely (clone_dest == -1))
1468             return REG_ESPACE;
1469           dfa->nexts[clone_node] = dfa->nexts[org_node];
1470           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1471           if (__glibc_unlikely (! ok))
1472             return REG_ESPACE;
1473         }
1474       else if (dfa->edests[org_node].nelem == 0)
1475         {
1476           /* In case of the node can't epsilon-transit, don't duplicate the
1477              destination and store the original destination as the
1478              destination of the node.  */
1479           dfa->nexts[clone_node] = dfa->nexts[org_node];
1480           break;
1481         }
1482       else if (dfa->edests[org_node].nelem == 1)
1483         {
1484           /* In case of the node can epsilon-transit, and it has only one
1485              destination.  */
1486           org_dest = dfa->edests[org_node].elems[0];
1487           re_node_set_empty (dfa->edests + clone_node);
1488           /* If the node is root_node itself, it means the epsilon closure
1489              has a loop.  Then tie it to the destination of the root_node.  */
1490           if (org_node == root_node && clone_node != org_node)
1491             {
1492               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1493               if (__glibc_unlikely (! ok))
1494                 return REG_ESPACE;
1495               break;
1496             }
1497           /* In case the node has another constraint, append it.  */
1498           constraint |= dfa->nodes[org_node].constraint;
1499           clone_dest = duplicate_node (dfa, org_dest, constraint);
1500           if (__glibc_unlikely (clone_dest == -1))
1501             return REG_ESPACE;
1502           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503           if (__glibc_unlikely (! ok))
1504             return REG_ESPACE;
1505         }
1506       else /* dfa->edests[org_node].nelem == 2 */
1507         {
1508           /* In case of the node can epsilon-transit, and it has two
1509              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1510           org_dest = dfa->edests[org_node].elems[0];
1511           re_node_set_empty (dfa->edests + clone_node);
1512           /* Search for a duplicated node which satisfies the constraint.  */
1513           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1514           if (clone_dest == -1)
1515             {
1516               /* There is no such duplicated node, create a new one.  */
1517               reg_errcode_t err;
1518               clone_dest = duplicate_node (dfa, org_dest, constraint);
1519               if (__glibc_unlikely (clone_dest == -1))
1520                 return REG_ESPACE;
1521               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1522               if (__glibc_unlikely (! ok))
1523                 return REG_ESPACE;
1524               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1525                                             root_node, constraint);
1526               if (__glibc_unlikely (err != REG_NOERROR))
1527                 return err;
1528             }
1529           else
1530             {
1531               /* There is a duplicated node which satisfies the constraint,
1532                  use it to avoid infinite loop.  */
1533               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534               if (__glibc_unlikely (! ok))
1535                 return REG_ESPACE;
1536             }
1537 
1538           org_dest = dfa->edests[org_node].elems[1];
1539           clone_dest = duplicate_node (dfa, org_dest, constraint);
1540           if (__glibc_unlikely (clone_dest == -1))
1541             return REG_ESPACE;
1542           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543           if (__glibc_unlikely (! ok))
1544             return REG_ESPACE;
1545         }
1546       org_node = org_dest;
1547       clone_node = clone_dest;
1548     }
1549   return REG_NOERROR;
1550 }
1551 
1552 /* Search for a node which is duplicated from the node ORG_NODE, and
1553    satisfies the constraint CONSTRAINT.  */
1554 
1555 static Idx
1556 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
     /* [previous][next][first][last][top][bottom][index][help] */
1557                         unsigned int constraint)
1558 {
1559   Idx idx;
1560   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1561     {
1562       if (org_node == dfa->org_indices[idx]
1563           && constraint == dfa->nodes[idx].constraint)
1564         return idx; /* Found.  */
1565     }
1566   return -1; /* Not found.  */
1567 }
1568 
1569 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1570    Return the index of the new node, or -1 if insufficient storage is
1571    available.  */
1572 
1573 static Idx
1574 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
     /* [previous][next][first][last][top][bottom][index][help] */
1575 {
1576   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1577   if (__glibc_likely (dup_idx != -1))
1578     {
1579       dfa->nodes[dup_idx].constraint = constraint;
1580       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1581       dfa->nodes[dup_idx].duplicated = 1;
1582 
1583       /* Store the index of the original node.  */
1584       dfa->org_indices[dup_idx] = org_idx;
1585     }
1586   return dup_idx;
1587 }
1588 
1589 static reg_errcode_t
1590 calc_inveclosure (re_dfa_t *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
1591 {
1592   Idx src, idx;
1593   bool ok;
1594   for (idx = 0; idx < dfa->nodes_len; ++idx)
1595     re_node_set_init_empty (dfa->inveclosures + idx);
1596 
1597   for (src = 0; src < dfa->nodes_len; ++src)
1598     {
1599       Idx *elems = dfa->eclosures[src].elems;
1600       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1601         {
1602           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1603           if (__glibc_unlikely (! ok))
1604             return REG_ESPACE;
1605         }
1606     }
1607 
1608   return REG_NOERROR;
1609 }
1610 
1611 /* Calculate "eclosure" for all the node in DFA.  */
1612 
1613 static reg_errcode_t
1614 calc_eclosure (re_dfa_t *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
1615 {
1616   Idx node_idx;
1617   bool incomplete;
1618   DEBUG_ASSERT (dfa->nodes_len > 0);
1619   incomplete = false;
1620   /* For each nodes, calculate epsilon closure.  */
1621   for (node_idx = 0; ; ++node_idx)
1622     {
1623       reg_errcode_t err;
1624       re_node_set eclosure_elem;
1625       if (node_idx == dfa->nodes_len)
1626         {
1627           if (!incomplete)
1628             break;
1629           incomplete = false;
1630           node_idx = 0;
1631         }
1632 
1633       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1634 
1635       /* If we have already calculated, skip it.  */
1636       if (dfa->eclosures[node_idx].nelem != 0)
1637         continue;
1638       /* Calculate epsilon closure of 'node_idx'.  */
1639       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1640       if (__glibc_unlikely (err != REG_NOERROR))
1641         return err;
1642 
1643       if (dfa->eclosures[node_idx].nelem == 0)
1644         {
1645           incomplete = true;
1646           re_node_set_free (&eclosure_elem);
1647         }
1648     }
1649   return REG_NOERROR;
1650 }
1651 
1652 /* Calculate epsilon closure of NODE.  */
1653 
1654 static reg_errcode_t
1655 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
     /* [previous][next][first][last][top][bottom][index][help] */
1656 {
1657   reg_errcode_t err;
1658   Idx i;
1659   re_node_set eclosure;
1660   bool incomplete = false;
1661   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662   if (__glibc_unlikely (err != REG_NOERROR))
1663     return err;
1664 
1665   /* An epsilon closure includes itself.  */
1666   eclosure.elems[eclosure.nelem++] = node;
1667 
1668   /* This indicates that we are calculating this node now.
1669      We reference this value to avoid infinite loop.  */
1670   dfa->eclosures[node].nelem = -1;
1671 
1672   /* If the current node has constraints, duplicate all nodes
1673      since they must inherit the constraints.  */
1674   if (dfa->nodes[node].constraint
1675       && dfa->edests[node].nelem
1676       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1677     {
1678       err = duplicate_node_closure (dfa, node, node, node,
1679                                     dfa->nodes[node].constraint);
1680       if (__glibc_unlikely (err != REG_NOERROR))
1681         return err;
1682     }
1683 
1684   /* Expand each epsilon destination nodes.  */
1685   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1686     for (i = 0; i < dfa->edests[node].nelem; ++i)
1687       {
1688         re_node_set eclosure_elem;
1689         Idx edest = dfa->edests[node].elems[i];
1690         /* If calculating the epsilon closure of 'edest' is in progress,
1691            return intermediate result.  */
1692         if (dfa->eclosures[edest].nelem == -1)
1693           {
1694             incomplete = true;
1695             continue;
1696           }
1697         /* If we haven't calculated the epsilon closure of 'edest' yet,
1698            calculate now. Otherwise use calculated epsilon closure.  */
1699         if (dfa->eclosures[edest].nelem == 0)
1700           {
1701             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1702             if (__glibc_unlikely (err != REG_NOERROR))
1703               return err;
1704           }
1705         else
1706           eclosure_elem = dfa->eclosures[edest];
1707         /* Merge the epsilon closure of 'edest'.  */
1708         err = re_node_set_merge (&eclosure, &eclosure_elem);
1709         if (__glibc_unlikely (err != REG_NOERROR))
1710           return err;
1711         /* If the epsilon closure of 'edest' is incomplete,
1712            the epsilon closure of this node is also incomplete.  */
1713         if (dfa->eclosures[edest].nelem == 0)
1714           {
1715             incomplete = true;
1716             re_node_set_free (&eclosure_elem);
1717           }
1718       }
1719 
1720   if (incomplete && !root)
1721     dfa->eclosures[node].nelem = 0;
1722   else
1723     dfa->eclosures[node] = eclosure;
1724   *new_set = eclosure;
1725   return REG_NOERROR;
1726 }
1727 
1728 /* Functions for token which are used in the parser.  */
1729 
1730 /* Fetch a token from INPUT.
1731    We must not use this function inside bracket expressions.  */
1732 
1733 static void
1734 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
     /* [previous][next][first][last][top][bottom][index][help] */
1735 {
1736   re_string_skip_bytes (input, peek_token (result, input, syntax));
1737 }
1738 
1739 /* Peek a token from INPUT, and return the length of the token.
1740    We must not use this function inside bracket expressions.  */
1741 
1742 static int
1743 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
     /* [previous][next][first][last][top][bottom][index][help] */
1744 {
1745   unsigned char c;
1746 
1747   if (re_string_eoi (input))
1748     {
1749       token->type = END_OF_RE;
1750       return 0;
1751     }
1752 
1753   c = re_string_peek_byte (input, 0);
1754   token->opr.c = c;
1755 
1756   token->word_char = 0;
1757   token->mb_partial = 0;
1758   if (input->mb_cur_max > 1
1759       && !re_string_first_byte (input, re_string_cur_idx (input)))
1760     {
1761       token->type = CHARACTER;
1762       token->mb_partial = 1;
1763       return 1;
1764     }
1765   if (c == '\\')
1766     {
1767       unsigned char c2;
1768       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1769         {
1770           token->type = BACK_SLASH;
1771           return 1;
1772         }
1773 
1774       c2 = re_string_peek_byte_case (input, 1);
1775       token->opr.c = c2;
1776       token->type = CHARACTER;
1777       if (input->mb_cur_max > 1)
1778         {
1779           wint_t wc = re_string_wchar_at (input,
1780                                           re_string_cur_idx (input) + 1);
1781           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1782         }
1783       else
1784         token->word_char = IS_WORD_CHAR (c2) != 0;
1785 
1786       switch (c2)
1787         {
1788         case '|':
1789           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1790             token->type = OP_ALT;
1791           break;
1792         case '1': case '2': case '3': case '4': case '5':
1793         case '6': case '7': case '8': case '9':
1794           if (!(syntax & RE_NO_BK_REFS))
1795             {
1796               token->type = OP_BACK_REF;
1797               token->opr.idx = c2 - '1';
1798             }
1799           break;
1800         case '<':
1801           if (!(syntax & RE_NO_GNU_OPS))
1802             {
1803               token->type = ANCHOR;
1804               token->opr.ctx_type = WORD_FIRST;
1805             }
1806           break;
1807         case '>':
1808           if (!(syntax & RE_NO_GNU_OPS))
1809             {
1810               token->type = ANCHOR;
1811               token->opr.ctx_type = WORD_LAST;
1812             }
1813           break;
1814         case 'b':
1815           if (!(syntax & RE_NO_GNU_OPS))
1816             {
1817               token->type = ANCHOR;
1818               token->opr.ctx_type = WORD_DELIM;
1819             }
1820           break;
1821         case 'B':
1822           if (!(syntax & RE_NO_GNU_OPS))
1823             {
1824               token->type = ANCHOR;
1825               token->opr.ctx_type = NOT_WORD_DELIM;
1826             }
1827           break;
1828         case 'w':
1829           if (!(syntax & RE_NO_GNU_OPS))
1830             token->type = OP_WORD;
1831           break;
1832         case 'W':
1833           if (!(syntax & RE_NO_GNU_OPS))
1834             token->type = OP_NOTWORD;
1835           break;
1836         case 's':
1837           if (!(syntax & RE_NO_GNU_OPS))
1838             token->type = OP_SPACE;
1839           break;
1840         case 'S':
1841           if (!(syntax & RE_NO_GNU_OPS))
1842             token->type = OP_NOTSPACE;
1843           break;
1844         case '`':
1845           if (!(syntax & RE_NO_GNU_OPS))
1846             {
1847               token->type = ANCHOR;
1848               token->opr.ctx_type = BUF_FIRST;
1849             }
1850           break;
1851         case '\'':
1852           if (!(syntax & RE_NO_GNU_OPS))
1853             {
1854               token->type = ANCHOR;
1855               token->opr.ctx_type = BUF_LAST;
1856             }
1857           break;
1858         case '(':
1859           if (!(syntax & RE_NO_BK_PARENS))
1860             token->type = OP_OPEN_SUBEXP;
1861           break;
1862         case ')':
1863           if (!(syntax & RE_NO_BK_PARENS))
1864             token->type = OP_CLOSE_SUBEXP;
1865           break;
1866         case '+':
1867           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1868             token->type = OP_DUP_PLUS;
1869           break;
1870         case '?':
1871           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1872             token->type = OP_DUP_QUESTION;
1873           break;
1874         case '{':
1875           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1876             token->type = OP_OPEN_DUP_NUM;
1877           break;
1878         case '}':
1879           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1880             token->type = OP_CLOSE_DUP_NUM;
1881           break;
1882         default:
1883           break;
1884         }
1885       return 2;
1886     }
1887 
1888   token->type = CHARACTER;
1889   if (input->mb_cur_max > 1)
1890     {
1891       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1893     }
1894   else
1895     token->word_char = IS_WORD_CHAR (token->opr.c);
1896 
1897   switch (c)
1898     {
1899     case '\n':
1900       if (syntax & RE_NEWLINE_ALT)
1901         token->type = OP_ALT;
1902       break;
1903     case '|':
1904       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1905         token->type = OP_ALT;
1906       break;
1907     case '*':
1908       token->type = OP_DUP_ASTERISK;
1909       break;
1910     case '+':
1911       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1912         token->type = OP_DUP_PLUS;
1913       break;
1914     case '?':
1915       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1916         token->type = OP_DUP_QUESTION;
1917       break;
1918     case '{':
1919       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1920         token->type = OP_OPEN_DUP_NUM;
1921       break;
1922     case '}':
1923       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1924         token->type = OP_CLOSE_DUP_NUM;
1925       break;
1926     case '(':
1927       if (syntax & RE_NO_BK_PARENS)
1928         token->type = OP_OPEN_SUBEXP;
1929       break;
1930     case ')':
1931       if (syntax & RE_NO_BK_PARENS)
1932         token->type = OP_CLOSE_SUBEXP;
1933       break;
1934     case '[':
1935       token->type = OP_OPEN_BRACKET;
1936       break;
1937     case '.':
1938       token->type = OP_PERIOD;
1939       break;
1940     case '^':
1941       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1942           && re_string_cur_idx (input) != 0)
1943         {
1944           char prev = re_string_peek_byte (input, -1);
1945           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1946             break;
1947         }
1948       token->type = ANCHOR;
1949       token->opr.ctx_type = LINE_FIRST;
1950       break;
1951     case '$':
1952       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1953           && re_string_cur_idx (input) + 1 != re_string_length (input))
1954         {
1955           re_token_t next;
1956           re_string_skip_bytes (input, 1);
1957           peek_token (&next, input, syntax);
1958           re_string_skip_bytes (input, -1);
1959           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1960             break;
1961         }
1962       token->type = ANCHOR;
1963       token->opr.ctx_type = LINE_LAST;
1964       break;
1965     default:
1966       break;
1967     }
1968   return 1;
1969 }
1970 
1971 /* Peek a token from INPUT, and return the length of the token.
1972    We must not use this function out of bracket expressions.  */
1973 
1974 static int
1975 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
     /* [previous][next][first][last][top][bottom][index][help] */
1976 {
1977   unsigned char c;
1978   if (re_string_eoi (input))
1979     {
1980       token->type = END_OF_RE;
1981       return 0;
1982     }
1983   c = re_string_peek_byte (input, 0);
1984   token->opr.c = c;
1985 
1986   if (input->mb_cur_max > 1
1987       && !re_string_first_byte (input, re_string_cur_idx (input)))
1988     {
1989       token->type = CHARACTER;
1990       return 1;
1991     }
1992 
1993   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1994       && re_string_cur_idx (input) + 1 < re_string_length (input))
1995     {
1996       /* In this case, '\' escape a character.  */
1997       unsigned char c2;
1998       re_string_skip_bytes (input, 1);
1999       c2 = re_string_peek_byte (input, 0);
2000       token->opr.c = c2;
2001       token->type = CHARACTER;
2002       return 1;
2003     }
2004   if (c == '[') /* '[' is a special char in a bracket exps.  */
2005     {
2006       unsigned char c2;
2007       int token_len;
2008       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2009         c2 = re_string_peek_byte (input, 1);
2010       else
2011         c2 = 0;
2012       token->opr.c = c2;
2013       token_len = 2;
2014       switch (c2)
2015         {
2016         case '.':
2017           token->type = OP_OPEN_COLL_ELEM;
2018           break;
2019 
2020         case '=':
2021           token->type = OP_OPEN_EQUIV_CLASS;
2022           break;
2023 
2024         case ':':
2025           if (syntax & RE_CHAR_CLASSES)
2026             {
2027               token->type = OP_OPEN_CHAR_CLASS;
2028               break;
2029             }
2030           FALLTHROUGH;
2031         default:
2032           token->type = CHARACTER;
2033           token->opr.c = c;
2034           token_len = 1;
2035           break;
2036         }
2037       return token_len;
2038     }
2039   switch (c)
2040     {
2041     case '-':
2042       token->type = OP_CHARSET_RANGE;
2043       break;
2044     case ']':
2045       token->type = OP_CLOSE_BRACKET;
2046       break;
2047     case '^':
2048       token->type = OP_NON_MATCH_LIST;
2049       break;
2050     default:
2051       token->type = CHARACTER;
2052     }
2053   return 1;
2054 }
2055 
2056 /* Functions for parser.  */
2057 
2058 /* Entry point of the parser.
2059    Parse the regular expression REGEXP and return the structure tree.
2060    If an error occurs, ERR is set by error code, and return NULL.
2061    This function build the following tree, from regular expression <reg_exp>:
2062            CAT
2063            / \
2064           /   \
2065    <reg_exp>  EOR
2066 
2067    CAT means concatenation.
2068    EOR means end of regular expression.  */
2069 
2070 static bin_tree_t *
2071 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
     /* [previous][next][first][last][top][bottom][index][help] */
2072        reg_errcode_t *err)
2073 {
2074   re_dfa_t *dfa = preg->buffer;
2075   bin_tree_t *tree, *eor, *root;
2076   re_token_t current_token;
2077   dfa->syntax = syntax;
2078   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2079   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2080   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2081     return NULL;
2082   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2083   if (tree != NULL)
2084     root = create_tree (dfa, tree, eor, CONCAT);
2085   else
2086     root = eor;
2087   if (__glibc_unlikely (eor == NULL || root == NULL))
2088     {
2089       *err = REG_ESPACE;
2090       return NULL;
2091     }
2092   return root;
2093 }
2094 
2095 /* This function build the following tree, from regular expression
2096    <branch1>|<branch2>:
2097            ALT
2098            / \
2099           /   \
2100    <branch1> <branch2>
2101 
2102    ALT means alternative, which represents the operator '|'.  */
2103 
2104 static bin_tree_t *
2105 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
     /* [previous][next][first][last][top][bottom][index][help] */
2106                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2107 {
2108   re_dfa_t *dfa = preg->buffer;
2109   bin_tree_t *tree, *branch = NULL;
2110   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2111   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2112   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2113     return NULL;
2114 
2115   while (token->type == OP_ALT)
2116     {
2117       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2118       if (token->type != OP_ALT && token->type != END_OF_RE
2119           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2120         {
2121           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2122           dfa->completed_bkref_map = initial_bkref_map;
2123           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2124           if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2125             {
2126               if (tree != NULL)
2127                 postorder (tree, free_tree, NULL);
2128               return NULL;
2129             }
2130           dfa->completed_bkref_map |= accumulated_bkref_map;
2131         }
2132       else
2133         branch = NULL;
2134       tree = create_tree (dfa, tree, branch, OP_ALT);
2135       if (__glibc_unlikely (tree == NULL))
2136         {
2137           *err = REG_ESPACE;
2138           return NULL;
2139         }
2140     }
2141   return tree;
2142 }
2143 
2144 /* This function build the following tree, from regular expression
2145    <exp1><exp2>:
2146         CAT
2147         / \
2148        /   \
2149    <exp1> <exp2>
2150 
2151    CAT means concatenation.  */
2152 
2153 static bin_tree_t *
2154 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
     /* [previous][next][first][last][top][bottom][index][help] */
2155               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2156 {
2157   bin_tree_t *tree, *expr;
2158   re_dfa_t *dfa = preg->buffer;
2159   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2160   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2161     return NULL;
2162 
2163   while (token->type != OP_ALT && token->type != END_OF_RE
2164          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2165     {
2166       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2167       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2168         {
2169           if (tree != NULL)
2170             postorder (tree, free_tree, NULL);
2171           return NULL;
2172         }
2173       if (tree != NULL && expr != NULL)
2174         {
2175           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2176           if (newtree == NULL)
2177             {
2178               postorder (expr, free_tree, NULL);
2179               postorder (tree, free_tree, NULL);
2180               *err = REG_ESPACE;
2181               return NULL;
2182             }
2183           tree = newtree;
2184         }
2185       else if (tree == NULL)
2186         tree = expr;
2187       /* Otherwise expr == NULL, we don't need to create new tree.  */
2188     }
2189   return tree;
2190 }
2191 
2192 /* This function build the following tree, from regular expression a*:
2193          *
2194          |
2195          a
2196 */
2197 
2198 static bin_tree_t *
2199 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
     /* [previous][next][first][last][top][bottom][index][help] */
2200                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2201 {
2202   re_dfa_t *dfa = preg->buffer;
2203   bin_tree_t *tree;
2204   switch (token->type)
2205     {
2206     case CHARACTER:
2207       tree = create_token_tree (dfa, NULL, NULL, token);
2208       if (__glibc_unlikely (tree == NULL))
2209         {
2210           *err = REG_ESPACE;
2211           return NULL;
2212         }
2213       if (dfa->mb_cur_max > 1)
2214         {
2215           while (!re_string_eoi (regexp)
2216                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2217             {
2218               bin_tree_t *mbc_remain;
2219               fetch_token (token, regexp, syntax);
2220               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2221               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2222               if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2223                 {
2224                   *err = REG_ESPACE;
2225                   return NULL;
2226                 }
2227             }
2228         }
2229       break;
2230 
2231     case OP_OPEN_SUBEXP:
2232       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2233       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2234         return NULL;
2235       break;
2236 
2237     case OP_OPEN_BRACKET:
2238       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2239       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2240         return NULL;
2241       break;
2242 
2243     case OP_BACK_REF:
2244       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2245         {
2246           *err = REG_ESUBREG;
2247           return NULL;
2248         }
2249       dfa->used_bkref_map |= 1 << token->opr.idx;
2250       tree = create_token_tree (dfa, NULL, NULL, token);
2251       if (__glibc_unlikely (tree == NULL))
2252         {
2253           *err = REG_ESPACE;
2254           return NULL;
2255         }
2256       ++dfa->nbackref;
2257       dfa->has_mb_node = 1;
2258       break;
2259 
2260     case OP_OPEN_DUP_NUM:
2261       if (syntax & RE_CONTEXT_INVALID_DUP)
2262         {
2263           *err = REG_BADRPT;
2264           return NULL;
2265         }
2266       FALLTHROUGH;
2267     case OP_DUP_ASTERISK:
2268     case OP_DUP_PLUS:
2269     case OP_DUP_QUESTION:
2270       if (syntax & RE_CONTEXT_INVALID_OPS)
2271         {
2272           *err = REG_BADRPT;
2273           return NULL;
2274         }
2275       else if (syntax & RE_CONTEXT_INDEP_OPS)
2276         {
2277           fetch_token (token, regexp, syntax);
2278           return parse_expression (regexp, preg, token, syntax, nest, err);
2279         }
2280       FALLTHROUGH;
2281     case OP_CLOSE_SUBEXP:
2282       if ((token->type == OP_CLOSE_SUBEXP)
2283           && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2284         {
2285           *err = REG_ERPAREN;
2286           return NULL;
2287         }
2288       FALLTHROUGH;
2289     case OP_CLOSE_DUP_NUM:
2290       /* We treat it as a normal character.  */
2291 
2292       /* Then we can these characters as normal characters.  */
2293       token->type = CHARACTER;
2294       /* mb_partial and word_char bits should be initialized already
2295          by peek_token.  */
2296       tree = create_token_tree (dfa, NULL, NULL, token);
2297       if (__glibc_unlikely (tree == NULL))
2298         {
2299           *err = REG_ESPACE;
2300           return NULL;
2301         }
2302       break;
2303 
2304     case ANCHOR:
2305       if ((token->opr.ctx_type
2306            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2307           && dfa->word_ops_used == 0)
2308         init_word_char (dfa);
2309       if (token->opr.ctx_type == WORD_DELIM
2310           || token->opr.ctx_type == NOT_WORD_DELIM)
2311         {
2312           bin_tree_t *tree_first, *tree_last;
2313           if (token->opr.ctx_type == WORD_DELIM)
2314             {
2315               token->opr.ctx_type = WORD_FIRST;
2316               tree_first = create_token_tree (dfa, NULL, NULL, token);
2317               token->opr.ctx_type = WORD_LAST;
2318             }
2319           else
2320             {
2321               token->opr.ctx_type = INSIDE_WORD;
2322               tree_first = create_token_tree (dfa, NULL, NULL, token);
2323               token->opr.ctx_type = INSIDE_NOTWORD;
2324             }
2325           tree_last = create_token_tree (dfa, NULL, NULL, token);
2326           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2327           if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2328                                 || tree == NULL))
2329             {
2330               *err = REG_ESPACE;
2331               return NULL;
2332             }
2333         }
2334       else
2335         {
2336           tree = create_token_tree (dfa, NULL, NULL, token);
2337           if (__glibc_unlikely (tree == NULL))
2338             {
2339               *err = REG_ESPACE;
2340               return NULL;
2341             }
2342         }
2343       /* We must return here, since ANCHORs can't be followed
2344          by repetition operators.
2345          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2346              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2347       fetch_token (token, regexp, syntax);
2348       return tree;
2349 
2350     case OP_PERIOD:
2351       tree = create_token_tree (dfa, NULL, NULL, token);
2352       if (__glibc_unlikely (tree == NULL))
2353         {
2354           *err = REG_ESPACE;
2355           return NULL;
2356         }
2357       if (dfa->mb_cur_max > 1)
2358         dfa->has_mb_node = 1;
2359       break;
2360 
2361     case OP_WORD:
2362     case OP_NOTWORD:
2363       tree = build_charclass_op (dfa, regexp->trans,
2364                                  "alnum",
2365                                  "_",
2366                                  token->type == OP_NOTWORD, err);
2367       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2368         return NULL;
2369       break;
2370 
2371     case OP_SPACE:
2372     case OP_NOTSPACE:
2373       tree = build_charclass_op (dfa, regexp->trans,
2374                                  "space",
2375                                  "",
2376                                  token->type == OP_NOTSPACE, err);
2377       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2378         return NULL;
2379       break;
2380 
2381     case OP_ALT:
2382     case END_OF_RE:
2383       return NULL;
2384 
2385     case BACK_SLASH:
2386       *err = REG_EESCAPE;
2387       return NULL;
2388 
2389     default:
2390       /* Must not happen?  */
2391       DEBUG_ASSERT (false);
2392       return NULL;
2393     }
2394   fetch_token (token, regexp, syntax);
2395 
2396   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2397          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2398     {
2399       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2400                                            syntax, err);
2401       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2402         {
2403           if (tree != NULL)
2404             postorder (tree, free_tree, NULL);
2405           return NULL;
2406         }
2407       tree = dup_tree;
2408       /* In BRE consecutive duplications are not allowed.  */
2409       if ((syntax & RE_CONTEXT_INVALID_DUP)
2410           && (token->type == OP_DUP_ASTERISK
2411               || token->type == OP_OPEN_DUP_NUM))
2412         {
2413           if (tree != NULL)
2414             postorder (tree, free_tree, NULL);
2415           *err = REG_BADRPT;
2416           return NULL;
2417         }
2418     }
2419 
2420   return tree;
2421 }
2422 
2423 /* This function build the following tree, from regular expression
2424    (<reg_exp>):
2425          SUBEXP
2426             |
2427         <reg_exp>
2428 */
2429 
2430 static bin_tree_t *
2431 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
     /* [previous][next][first][last][top][bottom][index][help] */
2432                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2433 {
2434   re_dfa_t *dfa = preg->buffer;
2435   bin_tree_t *tree;
2436   size_t cur_nsub;
2437   cur_nsub = preg->re_nsub++;
2438 
2439   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2440 
2441   /* The subexpression may be a null string.  */
2442   if (token->type == OP_CLOSE_SUBEXP)
2443     tree = NULL;
2444   else
2445     {
2446       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2447       if (__glibc_unlikely (*err == REG_NOERROR
2448                             && token->type != OP_CLOSE_SUBEXP))
2449         {
2450           if (tree != NULL)
2451             postorder (tree, free_tree, NULL);
2452           *err = REG_EPAREN;
2453         }
2454       if (__glibc_unlikely (*err != REG_NOERROR))
2455         return NULL;
2456     }
2457 
2458   if (cur_nsub <= '9' - '1')
2459     dfa->completed_bkref_map |= 1 << cur_nsub;
2460 
2461   tree = create_tree (dfa, tree, NULL, SUBEXP);
2462   if (__glibc_unlikely (tree == NULL))
2463     {
2464       *err = REG_ESPACE;
2465       return NULL;
2466     }
2467   tree->token.opr.idx = cur_nsub;
2468   return tree;
2469 }
2470 
2471 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2472 
2473 static bin_tree_t *
2474 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
     /* [previous][next][first][last][top][bottom][index][help] */
2475               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2476 {
2477   bin_tree_t *tree = NULL, *old_tree = NULL;
2478   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2479   re_token_t start_token = *token;
2480 
2481   if (token->type == OP_OPEN_DUP_NUM)
2482     {
2483       end = 0;
2484       start = fetch_number (regexp, token, syntax);
2485       if (start == -1)
2486         {
2487           if (token->type == CHARACTER && token->opr.c == ',')
2488             start = 0; /* We treat "{,m}" as "{0,m}".  */
2489           else
2490             {
2491               *err = REG_BADBR; /* <re>{} is invalid.  */
2492               return NULL;
2493             }
2494         }
2495       if (__glibc_likely (start != -2))
2496         {
2497           /* We treat "{n}" as "{n,n}".  */
2498           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2499                  : ((token->type == CHARACTER && token->opr.c == ',')
2500                     ? fetch_number (regexp, token, syntax) : -2));
2501         }
2502       if (__glibc_unlikely (start == -2 || end == -2))
2503         {
2504           /* Invalid sequence.  */
2505           if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2506             {
2507               if (token->type == END_OF_RE)
2508                 *err = REG_EBRACE;
2509               else
2510                 *err = REG_BADBR;
2511 
2512               return NULL;
2513             }
2514 
2515           /* If the syntax bit is set, rollback.  */
2516           re_string_set_index (regexp, start_idx);
2517           *token = start_token;
2518           token->type = CHARACTER;
2519           /* mb_partial and word_char bits should be already initialized by
2520              peek_token.  */
2521           return elem;
2522         }
2523 
2524       if (__glibc_unlikely ((end != -1 && start > end)
2525                             || token->type != OP_CLOSE_DUP_NUM))
2526         {
2527           /* First number greater than second.  */
2528           *err = REG_BADBR;
2529           return NULL;
2530         }
2531 
2532       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2533         {
2534           *err = REG_ESIZE;
2535           return NULL;
2536         }
2537     }
2538   else
2539     {
2540       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2541       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2542     }
2543 
2544   fetch_token (token, regexp, syntax);
2545 
2546   if (__glibc_unlikely (elem == NULL))
2547     return NULL;
2548   if (__glibc_unlikely (start == 0 && end == 0))
2549     {
2550       postorder (elem, free_tree, NULL);
2551       return NULL;
2552     }
2553 
2554   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2555   if (__glibc_unlikely (start > 0))
2556     {
2557       tree = elem;
2558       for (i = 2; i <= start; ++i)
2559         {
2560           elem = duplicate_tree (elem, dfa);
2561           tree = create_tree (dfa, tree, elem, CONCAT);
2562           if (__glibc_unlikely (elem == NULL || tree == NULL))
2563             goto parse_dup_op_espace;
2564         }
2565 
2566       if (start == end)
2567         return tree;
2568 
2569       /* Duplicate ELEM before it is marked optional.  */
2570       elem = duplicate_tree (elem, dfa);
2571       if (__glibc_unlikely (elem == NULL))
2572         goto parse_dup_op_espace;
2573       old_tree = tree;
2574     }
2575   else
2576     old_tree = NULL;
2577 
2578   if (elem->token.type == SUBEXP)
2579     {
2580       uintptr_t subidx = elem->token.opr.idx;
2581       postorder (elem, mark_opt_subexp, (void *) subidx);
2582     }
2583 
2584   tree = create_tree (dfa, elem, NULL,
2585                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2586   if (__glibc_unlikely (tree == NULL))
2587     goto parse_dup_op_espace;
2588 
2589   /* This loop is actually executed only when end != -1,
2590      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2591      already created the start+1-th copy.  */
2592   if (TYPE_SIGNED (Idx) || end != -1)
2593     for (i = start + 2; i <= end; ++i)
2594       {
2595         elem = duplicate_tree (elem, dfa);
2596         tree = create_tree (dfa, tree, elem, CONCAT);
2597         if (__glibc_unlikely (elem == NULL || tree == NULL))
2598           goto parse_dup_op_espace;
2599 
2600         tree = create_tree (dfa, tree, NULL, OP_ALT);
2601         if (__glibc_unlikely (tree == NULL))
2602           goto parse_dup_op_espace;
2603       }
2604 
2605   if (old_tree)
2606     tree = create_tree (dfa, old_tree, tree, CONCAT);
2607 
2608   return tree;
2609 
2610  parse_dup_op_espace:
2611   *err = REG_ESPACE;
2612   return NULL;
2613 }
2614 
2615 /* Size of the names for collating symbol/equivalence_class/character_class.
2616    I'm not sure, but maybe enough.  */
2617 #define BRACKET_NAME_BUF_SIZE 32
2618 
2619 #ifndef _LIBC
2620 
2621 /* Convert the byte B to the corresponding wide character.  In a
2622    unibyte locale, treat B as itself.  In a multibyte locale, return
2623    WEOF if B is an encoding error.  */
2624 static wint_t
2625 parse_byte (unsigned char b, re_dfa_t const *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
2626 {
2627   return dfa->mb_cur_max > 1 ? __btowc (b) : b;
2628 }
2629 
2630 /* Local function for parse_bracket_exp used in _LIBC environment.
2631    Build the range expression which starts from START_ELEM, and ends
2632    at END_ELEM.  The result are written to MBCSET and SBCSET.
2633    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2634    mbcset->range_ends, is a pointer argument since we may
2635    update it.  */
2636 
2637 static reg_errcode_t
2638 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
     /* [previous][next][first][last][top][bottom][index][help] */
2639                  bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2640                  re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
2641                  const unsigned char *collseqmb, const char *collseqwc,
2642                  int_fast32_t table_size, const void *symb_table,
2643                  const unsigned char *extra)
2644 {
2645   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2646   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2647                         || start_elem->type == CHAR_CLASS
2648                         || end_elem->type == EQUIV_CLASS
2649                         || end_elem->type == CHAR_CLASS))
2650     return REG_ERANGE;
2651 
2652   /* We can handle no multi character collating elements without libc
2653      support.  */
2654   if (__glibc_unlikely ((start_elem->type == COLL_SYM
2655                          && strlen ((char *) start_elem->opr.name) > 1)
2656                         || (end_elem->type == COLL_SYM
2657                             && strlen ((char *) end_elem->opr.name) > 1)))
2658     return REG_ECOLLATE;
2659 
2660   unsigned int
2661     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2662                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663                    : 0)),
2664     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2665               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2666                  : 0));
2667   wint_t
2668     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2669                 ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
2670     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2671               ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2672 
2673   if (start_wc == WEOF || end_wc == WEOF)
2674     return REG_ECOLLATE;
2675   else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2676                              && start_wc > end_wc))
2677     return REG_ERANGE;
2678 
2679   /* Got valid collation sequence values, add them as a new entry.
2680      However, for !_LIBC we have no collation elements: if the
2681      character set is single byte, the single byte character set
2682      that we build below suffices.  parse_bracket_exp passes
2683      no MBCSET if dfa->mb_cur_max == 1.  */
2684   if (dfa->mb_cur_max > 1)
2685     {
2686       /* Check the space of the arrays.  */
2687       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2688         {
2689           /* There is not enough space, need realloc.  */
2690           wchar_t *new_array_start, *new_array_end;
2691           Idx new_nranges;
2692 
2693           /* +1 in case of mbcset->nranges is 0.  */
2694           new_nranges = 2 * mbcset->nranges + 1;
2695           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2696              are NULL if *range_alloc == 0.  */
2697           new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2698                                         new_nranges);
2699           new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2700                                       new_nranges);
2701 
2702           if (__glibc_unlikely (new_array_start == NULL
2703                                 || new_array_end == NULL))
2704             {
2705               re_free (new_array_start);
2706               re_free (new_array_end);
2707               return REG_ESPACE;
2708             }
2709 
2710           mbcset->range_starts = new_array_start;
2711           mbcset->range_ends = new_array_end;
2712           *range_alloc = new_nranges;
2713         }
2714 
2715       mbcset->range_starts[mbcset->nranges] = start_wc;
2716       mbcset->range_ends[mbcset->nranges++] = end_wc;
2717     }
2718 
2719   /* Build the table for single byte characters.  */
2720   for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2721     {
2722       if (start_wc <= wc && wc <= end_wc)
2723         bitset_set (sbcset, wc);
2724     }
2725 
2726   return REG_NOERROR;
2727 }
2728 #endif /* not _LIBC */
2729 
2730 #ifndef _LIBC
2731 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
2732    Build the collating element which is represented by NAME.
2733    The result are written to MBCSET and SBCSET.
2734    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2735    pointer argument since we may update it.  */
2736 
2737 static reg_errcode_t
2738 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
     /* [previous][next][first][last][top][bottom][index][help] */
2739                         Idx *coll_sym_alloc, const unsigned char *name,
2740                         uint_fast32_t nrules, int_fast32_t table_size,
2741                         const void *symb_table, const unsigned char *extra)
2742 {
2743   size_t name_len = strlen ((const char *) name);
2744   if (__glibc_unlikely (name_len != 1))
2745     return REG_ECOLLATE;
2746   else
2747     {
2748       bitset_set (sbcset, name[0]);
2749       return REG_NOERROR;
2750     }
2751 }
2752 #endif /* not _LIBC */
2753 
2754 #ifdef _LIBC
2755 /* Local function for parse_bracket_exp used in _LIBC environment.
2756    Seek the collating symbol entry corresponding to NAME.
2757    Return the index of the symbol in the SYMB_TABLE,
2758    or -1 if not found.  */
2759 
2760 static __always_inline int32_t
2761 seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
     /* [previous][next][first][last][top][bottom][index][help] */
2762                              const int32_t *symb_table,
2763                              int_fast32_t table_size,
2764                              const unsigned char *extra)
2765 {
2766   int_fast32_t elem;
2767 
2768   for (elem = 0; elem < table_size; elem++)
2769     if (symb_table[2 * elem] != 0)
2770       {
2771         int32_t idx = symb_table[2 * elem + 1];
2772         /* Skip the name of collating element name.  */
2773         idx += 1 + extra[idx];
2774         if (/* Compare the length of the name.  */
2775             name_len == extra[idx]
2776             /* Compare the name.  */
2777             && memcmp (name, &extra[idx + 1], name_len) == 0)
2778           /* Yep, this is the entry.  */
2779           return elem;
2780       }
2781   return -1;
2782 }
2783 
2784 /* Local function for parse_bracket_exp used in _LIBC environment.
2785    Look up the collation sequence value of BR_ELEM.
2786    Return the value if succeeded, UINT_MAX otherwise.  */
2787 
2788 static __always_inline unsigned int
2789 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
     /* [previous][next][first][last][top][bottom][index][help] */
2790                                  const unsigned char *collseqmb,
2791                                  const char *collseqwc,
2792                                  int_fast32_t table_size,
2793                                  const int32_t *symb_table,
2794                                  const unsigned char *extra)
2795 {
2796   if (br_elem->type == SB_CHAR)
2797     {
2798       /* if (MB_CUR_MAX == 1) */
2799       if (nrules == 0)
2800         return collseqmb[br_elem->opr.ch];
2801       else
2802         {
2803           wint_t wc = __btowc (br_elem->opr.ch);
2804           return __collseq_table_lookup (collseqwc, wc);
2805         }
2806     }
2807   else if (br_elem->type == MB_CHAR)
2808     {
2809       if (nrules != 0)
2810         return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2811     }
2812   else if (br_elem->type == COLL_SYM)
2813     {
2814       size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2815       if (nrules != 0)
2816         {
2817           int32_t elem, idx;
2818           elem = seek_collating_symbol_entry (br_elem->opr.name,
2819                                               sym_name_len,
2820                                               symb_table, table_size,
2821                                               extra);
2822           if (elem != -1)
2823             {
2824               /* We found the entry.  */
2825               idx = symb_table[2 * elem + 1];
2826               /* Skip the name of collating element name.  */
2827               idx += 1 + extra[idx];
2828               /* Skip the byte sequence of the collating element.  */
2829               idx += 1 + extra[idx];
2830               /* Adjust for the alignment.  */
2831               idx = (idx + 3) & ~3;
2832               /* Skip the multibyte collation sequence value.  */
2833               idx += sizeof (unsigned int);
2834               /* Skip the wide char sequence of the collating element.  */
2835               idx += sizeof (unsigned int) *
2836                 (1 + *(unsigned int *) (extra + idx));
2837               /* Return the collation sequence value.  */
2838               return *(unsigned int *) (extra + idx);
2839             }
2840           else if (sym_name_len == 1)
2841             {
2842               /* No valid character.  Match it as a single byte
2843                  character.  */
2844               return collseqmb[br_elem->opr.name[0]];
2845             }
2846         }
2847       else if (sym_name_len == 1)
2848         return collseqmb[br_elem->opr.name[0]];
2849     }
2850   return UINT_MAX;
2851 }
2852 
2853 /* Local function for parse_bracket_exp used in _LIBC environment.
2854    Build the range expression which starts from START_ELEM, and ends
2855    at END_ELEM.  The result are written to MBCSET and SBCSET.
2856    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2857    mbcset->range_ends, is a pointer argument since we may
2858    update it.  */
2859 
2860 static __always_inline reg_errcode_t
2861 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
     /* [previous][next][first][last][top][bottom][index][help] */
2862                  bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2863                  re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2864                  const unsigned char *collseqmb, const char *collseqwc,
2865                  int_fast32_t table_size, const int32_t *symb_table,
2866                  const unsigned char *extra)
2867 {
2868   unsigned int ch;
2869   uint32_t start_collseq;
2870   uint32_t end_collseq;
2871 
2872   /* Equivalence Classes and Character Classes can't be a range
2873      start/end.  */
2874   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2875                         || start_elem->type == CHAR_CLASS
2876                         || end_elem->type == EQUIV_CLASS
2877                         || end_elem->type == CHAR_CLASS))
2878     return REG_ERANGE;
2879 
2880   /* FIXME: Implement rational ranges here, too.  */
2881   start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2882                                                    table_size, symb_table, extra);
2883   end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2884                                                  table_size, symb_table, extra);
2885   /* Check start/end collation sequence values.  */
2886   if (__glibc_unlikely (start_collseq == UINT_MAX
2887                         || end_collseq == UINT_MAX))
2888     return REG_ECOLLATE;
2889   if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2890                         && start_collseq > end_collseq))
2891     return REG_ERANGE;
2892 
2893   /* Got valid collation sequence values, add them as a new entry.
2894      However, if we have no collation elements, and the character set
2895      is single byte, the single byte character set that we
2896      build below suffices. */
2897   if (nrules > 0 || dfa->mb_cur_max > 1)
2898     {
2899       /* Check the space of the arrays.  */
2900       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2901         {
2902           /* There is not enough space, need realloc.  */
2903           uint32_t *new_array_start;
2904           uint32_t *new_array_end;
2905           int new_nranges;
2906 
2907           /* +1 in case of mbcset->nranges is 0.  */
2908           new_nranges = 2 * mbcset->nranges + 1;
2909           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2910                                         new_nranges);
2911           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2912                                       new_nranges);
2913 
2914           if (__glibc_unlikely (new_array_start == NULL
2915                                 || new_array_end == NULL))
2916             return REG_ESPACE;
2917 
2918           mbcset->range_starts = new_array_start;
2919           mbcset->range_ends = new_array_end;
2920           *range_alloc = new_nranges;
2921         }
2922 
2923       mbcset->range_starts[mbcset->nranges] = start_collseq;
2924       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2925     }
2926 
2927   /* Build the table for single byte characters.  */
2928   for (ch = 0; ch < SBC_MAX; ch++)
2929     {
2930       uint32_t ch_collseq;
2931       /* if (MB_CUR_MAX == 1) */
2932       if (nrules == 0)
2933         ch_collseq = collseqmb[ch];
2934       else
2935         ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2936       if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2937         bitset_set (sbcset, ch);
2938     }
2939   return REG_NOERROR;
2940 }
2941 
2942 /* Local function for parse_bracket_exp used in _LIBC environment.
2943    Build the collating element which is represented by NAME.
2944    The result are written to MBCSET and SBCSET.
2945    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2946    pointer argument since we may update it.  */
2947 
2948 static __always_inline reg_errcode_t
2949 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
     /* [previous][next][first][last][top][bottom][index][help] */
2950                         Idx *coll_sym_alloc, const unsigned char *name,
2951                         uint_fast32_t nrules, int_fast32_t table_size,
2952                         const int32_t *symb_table, const unsigned char *extra)
2953 {
2954   int32_t elem, idx;
2955   size_t name_len = strlen ((const char *) name);
2956   if (nrules != 0)
2957     {
2958       elem = seek_collating_symbol_entry (name, name_len, symb_table,
2959                                           table_size, extra);
2960       if (elem != -1)
2961         {
2962           /* We found the entry.  */
2963           idx = symb_table[2 * elem + 1];
2964           /* Skip the name of collating element name.  */
2965           idx += 1 + extra[idx];
2966         }
2967       else if (name_len == 1)
2968         {
2969           /* No valid character, treat it as a normal
2970              character.  */
2971           bitset_set (sbcset, name[0]);
2972           return REG_NOERROR;
2973         }
2974       else
2975         return REG_ECOLLATE;
2976 
2977       /* Got valid collation sequence, add it as a new entry.  */
2978       /* Check the space of the arrays.  */
2979       if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
2980         {
2981           /* Not enough, realloc it.  */
2982           /* +1 in case of mbcset->ncoll_syms is 0.  */
2983           int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2984           /* Use realloc since mbcset->coll_syms is NULL
2985              if *alloc == 0.  */
2986           int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2987                                                new_coll_sym_alloc);
2988           if (__glibc_unlikely (new_coll_syms == NULL))
2989             return REG_ESPACE;
2990           mbcset->coll_syms = new_coll_syms;
2991           *coll_sym_alloc = new_coll_sym_alloc;
2992         }
2993       mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2994       return REG_NOERROR;
2995     }
2996   else
2997     {
2998       if (__glibc_unlikely (name_len != 1))
2999         return REG_ECOLLATE;
3000       else
3001         {
3002           bitset_set (sbcset, name[0]);
3003           return REG_NOERROR;
3004         }
3005     }
3006 }
3007 #endif /* _LIBC */
3008 
3009 /* This function parse bracket expression like "[abc]", "[a-c]",
3010    "[[.a-a.]]" etc.  */
3011 
3012 static bin_tree_t *
3013 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
     /* [previous][next][first][last][top][bottom][index][help] */
3014                    reg_syntax_t syntax, reg_errcode_t *err)
3015 {
3016   const unsigned char *collseqmb = NULL;
3017   const char *collseqwc = NULL;
3018   uint_fast32_t nrules = 0;
3019   int_fast32_t table_size = 0;
3020   const void *symb_table = NULL;
3021   const unsigned char *extra = NULL;
3022 
3023   re_token_t br_token;
3024   re_bitset_ptr_t sbcset;
3025   re_charset_t *mbcset;
3026   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3027   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3028   bool non_match = false;
3029   bin_tree_t *work_tree;
3030   int token_len;
3031   bool first_round = true;
3032 #ifdef _LIBC
3033   collseqmb = (const unsigned char *)
3034     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3035   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3036   if (nrules)
3037     {
3038       /*
3039       if (MB_CUR_MAX > 1)
3040       */
3041       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3042       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3043       symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3044       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3045                                                    _NL_COLLATE_SYMB_EXTRAMB);
3046     }
3047 #endif
3048   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3049   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3050   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3051     {
3052       re_free (sbcset);
3053       re_free (mbcset);
3054       *err = REG_ESPACE;
3055       return NULL;
3056     }
3057 
3058   token_len = peek_token_bracket (token, regexp, syntax);
3059   if (__glibc_unlikely (token->type == END_OF_RE))
3060     {
3061       *err = REG_BADPAT;
3062       goto parse_bracket_exp_free_return;
3063     }
3064   if (token->type == OP_NON_MATCH_LIST)
3065     {
3066       mbcset->non_match = 1;
3067       non_match = true;
3068       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3069         bitset_set (sbcset, '\n');
3070       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3071       token_len = peek_token_bracket (token, regexp, syntax);
3072       if (__glibc_unlikely (token->type == END_OF_RE))
3073         {
3074           *err = REG_BADPAT;
3075           goto parse_bracket_exp_free_return;
3076         }
3077     }
3078 
3079   /* We treat the first ']' as a normal character.  */
3080   if (token->type == OP_CLOSE_BRACKET)
3081     token->type = CHARACTER;
3082 
3083   while (1)
3084     {
3085       bracket_elem_t start_elem, end_elem;
3086       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3087       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3088       reg_errcode_t ret;
3089       int token_len2 = 0;
3090       bool is_range_exp = false;
3091       re_token_t token2;
3092 
3093       start_elem.opr.name = start_name_buf;
3094       start_elem.type = COLL_SYM;
3095       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3096                                    syntax, first_round);
3097       if (__glibc_unlikely (ret != REG_NOERROR))
3098         {
3099           *err = ret;
3100           goto parse_bracket_exp_free_return;
3101         }
3102       first_round = false;
3103 
3104       /* Get information about the next token.  We need it in any case.  */
3105       token_len = peek_token_bracket (token, regexp, syntax);
3106 
3107       /* Do not check for ranges if we know they are not allowed.  */
3108       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3109         {
3110           if (__glibc_unlikely (token->type == END_OF_RE))
3111             {
3112               *err = REG_EBRACK;
3113               goto parse_bracket_exp_free_return;
3114             }
3115           if (token->type == OP_CHARSET_RANGE)
3116             {
3117               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3118               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3119               if (__glibc_unlikely (token2.type == END_OF_RE))
3120                 {
3121                   *err = REG_EBRACK;
3122                   goto parse_bracket_exp_free_return;
3123                 }
3124               if (token2.type == OP_CLOSE_BRACKET)
3125                 {
3126                   /* We treat the last '-' as a normal character.  */
3127                   re_string_skip_bytes (regexp, -token_len);
3128                   token->type = CHARACTER;
3129                 }
3130               else
3131                 is_range_exp = true;
3132             }
3133         }
3134 
3135       if (is_range_exp == true)
3136         {
3137           end_elem.opr.name = end_name_buf;
3138           end_elem.type = COLL_SYM;
3139           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3140                                        dfa, syntax, true);
3141           if (__glibc_unlikely (ret != REG_NOERROR))
3142             {
3143               *err = ret;
3144               goto parse_bracket_exp_free_return;
3145             }
3146 
3147           token_len = peek_token_bracket (token, regexp, syntax);
3148 
3149           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3150                                   &start_elem, &end_elem,
3151                                   dfa, syntax, nrules, collseqmb, collseqwc,
3152                                   table_size, symb_table, extra);
3153           if (__glibc_unlikely (*err != REG_NOERROR))
3154             goto parse_bracket_exp_free_return;
3155         }
3156       else
3157         {
3158           switch (start_elem.type)
3159             {
3160             case SB_CHAR:
3161               bitset_set (sbcset, start_elem.opr.ch);
3162               break;
3163             case MB_CHAR:
3164               /* Check whether the array has enough space.  */
3165               if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3166                 {
3167                   wchar_t *new_mbchars;
3168                   /* Not enough, realloc it.  */
3169                   /* +1 in case of mbcset->nmbchars is 0.  */
3170                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3171                   /* Use realloc since array is NULL if *alloc == 0.  */
3172                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3173                                             mbchar_alloc);
3174                   if (__glibc_unlikely (new_mbchars == NULL))
3175                     goto parse_bracket_exp_espace;
3176                   mbcset->mbchars = new_mbchars;
3177                 }
3178               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3179               break;
3180             case EQUIV_CLASS:
3181               *err = build_equiv_class (sbcset,
3182                                         mbcset, &equiv_class_alloc,
3183                                         start_elem.opr.name);
3184               if (__glibc_unlikely (*err != REG_NOERROR))
3185                 goto parse_bracket_exp_free_return;
3186               break;
3187             case COLL_SYM:
3188               *err = build_collating_symbol (sbcset,
3189                                              mbcset, &coll_sym_alloc,
3190                                              start_elem.opr.name,
3191                                              nrules, table_size, symb_table, extra);
3192               if (__glibc_unlikely (*err != REG_NOERROR))
3193                 goto parse_bracket_exp_free_return;
3194               break;
3195             case CHAR_CLASS:
3196               *err = build_charclass (regexp->trans, sbcset,
3197                                       mbcset, &char_class_alloc,
3198                                       (const char *) start_elem.opr.name,
3199                                       syntax);
3200               if (__glibc_unlikely (*err != REG_NOERROR))
3201                goto parse_bracket_exp_free_return;
3202               break;
3203             default:
3204               DEBUG_ASSERT (false);
3205               break;
3206             }
3207         }
3208       if (__glibc_unlikely (token->type == END_OF_RE))
3209         {
3210           *err = REG_EBRACK;
3211           goto parse_bracket_exp_free_return;
3212         }
3213       if (token->type == OP_CLOSE_BRACKET)
3214         break;
3215     }
3216 
3217   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3218 
3219   /* If it is non-matching list.  */
3220   if (non_match)
3221     bitset_not (sbcset);
3222 
3223   /* Ensure only single byte characters are set.  */
3224   if (dfa->mb_cur_max > 1)
3225     bitset_mask (sbcset, dfa->sb_char);
3226 
3227   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3228       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3229                                                      || mbcset->non_match)))
3230     {
3231       bin_tree_t *mbc_tree;
3232       int sbc_idx;
3233       /* Build a tree for complex bracket.  */
3234       dfa->has_mb_node = 1;
3235       br_token.type = COMPLEX_BRACKET;
3236       br_token.opr.mbcset = mbcset;
3237       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3238       if (__glibc_unlikely (mbc_tree == NULL))
3239         goto parse_bracket_exp_espace;
3240       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3241         if (sbcset[sbc_idx])
3242           break;
3243       /* If there are no bits set in sbcset, there is no point
3244          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3245       if (sbc_idx < BITSET_WORDS)
3246         {
3247           /* Build a tree for simple bracket.  */
3248           br_token.type = SIMPLE_BRACKET;
3249           br_token.opr.sbcset = sbcset;
3250           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3251           if (__glibc_unlikely (work_tree == NULL))
3252             goto parse_bracket_exp_espace;
3253 
3254           /* Then join them by ALT node.  */
3255           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3256           if (__glibc_unlikely (work_tree == NULL))
3257             goto parse_bracket_exp_espace;
3258         }
3259       else
3260         {
3261           re_free (sbcset);
3262           work_tree = mbc_tree;
3263         }
3264     }
3265   else
3266     {
3267       free_charset (mbcset);
3268       /* Build a tree for simple bracket.  */
3269       br_token.type = SIMPLE_BRACKET;
3270       br_token.opr.sbcset = sbcset;
3271       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272       if (__glibc_unlikely (work_tree == NULL))
3273         goto parse_bracket_exp_espace;
3274     }
3275   return work_tree;
3276 
3277  parse_bracket_exp_espace:
3278   *err = REG_ESPACE;
3279  parse_bracket_exp_free_return:
3280   re_free (sbcset);
3281   free_charset (mbcset);
3282   return NULL;
3283 }
3284 
3285 /* Parse an element in the bracket expression.  */
3286 
3287 static reg_errcode_t
3288 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
     /* [previous][next][first][last][top][bottom][index][help] */
3289                        re_token_t *token, int token_len, re_dfa_t *dfa,
3290                        reg_syntax_t syntax, bool accept_hyphen)
3291 {
3292   int cur_char_size;
3293   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3294   if (cur_char_size > 1)
3295     {
3296       elem->type = MB_CHAR;
3297       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3298       re_string_skip_bytes (regexp, cur_char_size);
3299       return REG_NOERROR;
3300     }
3301   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3302   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3303       || token->type == OP_OPEN_EQUIV_CLASS)
3304     return parse_bracket_symbol (elem, regexp, token);
3305   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3306     {
3307       /* A '-' must only appear as anything but a range indicator before
3308          the closing bracket.  Everything else is an error.  */
3309       re_token_t token2;
3310       (void) peek_token_bracket (&token2, regexp, syntax);
3311       if (token2.type != OP_CLOSE_BRACKET)
3312         /* The actual error value is not standardized since this whole
3313            case is undefined.  But ERANGE makes good sense.  */
3314         return REG_ERANGE;
3315     }
3316   elem->type = SB_CHAR;
3317   elem->opr.ch = token->opr.c;
3318   return REG_NOERROR;
3319 }
3320 
3321 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3322    such as [:<character_class>:], [.<collating_element>.], and
3323    [=<equivalent_class>=].  */
3324 
3325 static reg_errcode_t
3326 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
     /* [previous][next][first][last][top][bottom][index][help] */
3327                       re_token_t *token)
3328 {
3329   unsigned char ch, delim = token->opr.c;
3330   int i = 0;
3331   if (re_string_eoi(regexp))
3332     return REG_EBRACK;
3333   for (;; ++i)
3334     {
3335       if (i >= BRACKET_NAME_BUF_SIZE)
3336         return REG_EBRACK;
3337       if (token->type == OP_OPEN_CHAR_CLASS)
3338         ch = re_string_fetch_byte_case (regexp);
3339       else
3340         ch = re_string_fetch_byte (regexp);
3341       if (re_string_eoi(regexp))
3342         return REG_EBRACK;
3343       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3344         break;
3345       elem->opr.name[i] = ch;
3346     }
3347   re_string_skip_bytes (regexp, 1);
3348   elem->opr.name[i] = '\0';
3349   switch (token->type)
3350     {
3351     case OP_OPEN_COLL_ELEM:
3352       elem->type = COLL_SYM;
3353       break;
3354     case OP_OPEN_EQUIV_CLASS:
3355       elem->type = EQUIV_CLASS;
3356       break;
3357     case OP_OPEN_CHAR_CLASS:
3358       elem->type = CHAR_CLASS;
3359       break;
3360     default:
3361       break;
3362     }
3363   return REG_NOERROR;
3364 }
3365 
3366   /* Helper function for parse_bracket_exp.
3367      Build the equivalence class which is represented by NAME.
3368      The result are written to MBCSET and SBCSET.
3369      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3370      is a pointer argument since we may update it.  */
3371 
3372 static reg_errcode_t
3373 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
     /* [previous][next][first][last][top][bottom][index][help] */
3374                    Idx *equiv_class_alloc, const unsigned char *name)
3375 {
3376 #ifdef _LIBC
3377   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3378   if (nrules != 0)
3379     {
3380       const int32_t *table, *indirect;
3381       const unsigned char *weights, *extra, *cp;
3382       unsigned char char_buf[2];
3383       int32_t idx1, idx2;
3384       unsigned int ch;
3385       size_t len;
3386       /* Calculate the index for equivalence class.  */
3387       cp = name;
3388       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3389       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3390                                                _NL_COLLATE_WEIGHTMB);
3391       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3392                                                    _NL_COLLATE_EXTRAMB);
3393       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3394                                                 _NL_COLLATE_INDIRECTMB);
3395       idx1 = findidx (table, indirect, extra, &cp, -1);
3396       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3397         /* This isn't a valid character.  */
3398         return REG_ECOLLATE;
3399 
3400       /* Build single byte matching table for this equivalence class.  */
3401       len = weights[idx1 & 0xffffff];
3402       for (ch = 0; ch < SBC_MAX; ++ch)
3403         {
3404           char_buf[0] = ch;
3405           cp = char_buf;
3406           idx2 = findidx (table, indirect, extra, &cp, 1);
3407 /*
3408           idx2 = table[ch];
3409 */
3410           if (idx2 == 0)
3411             /* This isn't a valid character.  */
3412             continue;
3413           /* Compare only if the length matches and the collation rule
3414              index is the same.  */
3415           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3416               && memcmp (weights + (idx1 & 0xffffff) + 1,
3417                          weights + (idx2 & 0xffffff) + 1, len) == 0)
3418             bitset_set (sbcset, ch);
3419         }
3420       /* Check whether the array has enough space.  */
3421       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3422         {
3423           /* Not enough, realloc it.  */
3424           /* +1 in case of mbcset->nequiv_classes is 0.  */
3425           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3426           /* Use realloc since the array is NULL if *alloc == 0.  */
3427           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3428                                                    int32_t,
3429                                                    new_equiv_class_alloc);
3430           if (__glibc_unlikely (new_equiv_classes == NULL))
3431             return REG_ESPACE;
3432           mbcset->equiv_classes = new_equiv_classes;
3433           *equiv_class_alloc = new_equiv_class_alloc;
3434         }
3435       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3436     }
3437   else
3438 #endif /* _LIBC */
3439     {
3440       if (__glibc_unlikely (strlen ((const char *) name) != 1))
3441         return REG_ECOLLATE;
3442       bitset_set (sbcset, *name);
3443     }
3444   return REG_NOERROR;
3445 }
3446 
3447   /* Helper function for parse_bracket_exp.
3448      Build the character class which is represented by NAME.
3449      The result are written to MBCSET and SBCSET.
3450      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3451      is a pointer argument since we may update it.  */
3452 
3453 static reg_errcode_t
3454 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
     /* [previous][next][first][last][top][bottom][index][help] */
3455                  re_charset_t *mbcset, Idx *char_class_alloc,
3456                  const char *class_name, reg_syntax_t syntax)
3457 {
3458   int i;
3459   const char *name = class_name;
3460 
3461   /* In case of REG_ICASE "upper" and "lower" match the both of
3462      upper and lower cases.  */
3463   if ((syntax & RE_ICASE)
3464       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3465     name = "alpha";
3466 
3467   /* Check the space of the arrays.  */
3468   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3469     {
3470       /* Not enough, realloc it.  */
3471       /* +1 in case of mbcset->nchar_classes is 0.  */
3472       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3473       /* Use realloc since array is NULL if *alloc == 0.  */
3474       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3475                                                new_char_class_alloc);
3476       if (__glibc_unlikely (new_char_classes == NULL))
3477         return REG_ESPACE;
3478       mbcset->char_classes = new_char_classes;
3479       *char_class_alloc = new_char_class_alloc;
3480     }
3481   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3482 
3483 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3484   do {                                          \
3485     if (__glibc_unlikely (trans != NULL))                       \
3486       {                                         \
3487         for (i = 0; i < SBC_MAX; ++i)           \
3488           if (ctype_func (i))                   \
3489             bitset_set (sbcset, trans[i]);      \
3490       }                                         \
3491     else                                        \
3492       {                                         \
3493         for (i = 0; i < SBC_MAX; ++i)           \
3494           if (ctype_func (i))                   \
3495             bitset_set (sbcset, i);             \
3496       }                                         \
3497   } while (0)
3498 
3499   if (strcmp (name, "alnum") == 0)
3500     BUILD_CHARCLASS_LOOP (isalnum);
3501   else if (strcmp (name, "cntrl") == 0)
3502     BUILD_CHARCLASS_LOOP (iscntrl);
3503   else if (strcmp (name, "lower") == 0)
3504     BUILD_CHARCLASS_LOOP (islower);
3505   else if (strcmp (name, "space") == 0)
3506     BUILD_CHARCLASS_LOOP (isspace);
3507   else if (strcmp (name, "alpha") == 0)
3508     BUILD_CHARCLASS_LOOP (isalpha);
3509   else if (strcmp (name, "digit") == 0)
3510     BUILD_CHARCLASS_LOOP (isdigit);
3511   else if (strcmp (name, "print") == 0)
3512     BUILD_CHARCLASS_LOOP (isprint);
3513   else if (strcmp (name, "upper") == 0)
3514     BUILD_CHARCLASS_LOOP (isupper);
3515   else if (strcmp (name, "blank") == 0)
3516     BUILD_CHARCLASS_LOOP (isblank);
3517   else if (strcmp (name, "graph") == 0)
3518     BUILD_CHARCLASS_LOOP (isgraph);
3519   else if (strcmp (name, "punct") == 0)
3520     BUILD_CHARCLASS_LOOP (ispunct);
3521   else if (strcmp (name, "xdigit") == 0)
3522     BUILD_CHARCLASS_LOOP (isxdigit);
3523   else
3524     return REG_ECTYPE;
3525 
3526   return REG_NOERROR;
3527 }
3528 
3529 static bin_tree_t *
3530 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
     /* [previous][next][first][last][top][bottom][index][help] */
3531                     const char *class_name,
3532                     const char *extra, bool non_match,
3533                     reg_errcode_t *err)
3534 {
3535   re_bitset_ptr_t sbcset;
3536   re_charset_t *mbcset;
3537   Idx alloc = 0;
3538   reg_errcode_t ret;
3539   bin_tree_t *tree;
3540 
3541   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3542   if (__glibc_unlikely (sbcset == NULL))
3543     {
3544       *err = REG_ESPACE;
3545       return NULL;
3546     }
3547   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3548   if (__glibc_unlikely (mbcset == NULL))
3549     {
3550       re_free (sbcset);
3551       *err = REG_ESPACE;
3552       return NULL;
3553     }
3554   mbcset->non_match = non_match;
3555 
3556   /* We don't care the syntax in this case.  */
3557   ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3558 
3559   if (__glibc_unlikely (ret != REG_NOERROR))
3560     {
3561       re_free (sbcset);
3562       free_charset (mbcset);
3563       *err = ret;
3564       return NULL;
3565     }
3566   /* \w match '_' also.  */
3567   for (; *extra; extra++)
3568     bitset_set (sbcset, *extra);
3569 
3570   /* If it is non-matching list.  */
3571   if (non_match)
3572     bitset_not (sbcset);
3573 
3574   /* Ensure only single byte characters are set.  */
3575   if (dfa->mb_cur_max > 1)
3576     bitset_mask (sbcset, dfa->sb_char);
3577 
3578   /* Build a tree for simple bracket.  */
3579   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3580   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3581   if (__glibc_unlikely (tree == NULL))
3582     goto build_word_op_espace;
3583 
3584   if (dfa->mb_cur_max > 1)
3585     {
3586       bin_tree_t *mbc_tree;
3587       /* Build a tree for complex bracket.  */
3588       br_token.type = COMPLEX_BRACKET;
3589       br_token.opr.mbcset = mbcset;
3590       dfa->has_mb_node = 1;
3591       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3592       if (__glibc_unlikely (mbc_tree == NULL))
3593         goto build_word_op_espace;
3594       /* Then join them by ALT node.  */
3595       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3596       if (__glibc_likely (mbc_tree != NULL))
3597         return tree;
3598     }
3599   else
3600     {
3601       free_charset (mbcset);
3602       return tree;
3603     }
3604 
3605  build_word_op_espace:
3606   re_free (sbcset);
3607   free_charset (mbcset);
3608   *err = REG_ESPACE;
3609   return NULL;
3610 }
3611 
3612 /* This is intended for the expressions like "a{1,3}".
3613    Fetch a number from 'input', and return the number.
3614    Return -1 if the number field is empty like "{,1}".
3615    Return RE_DUP_MAX + 1 if the number field is too large.
3616    Return -2 if an error occurred.  */
3617 
3618 static Idx
3619 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
     /* [previous][next][first][last][top][bottom][index][help] */
3620 {
3621   Idx num = -1;
3622   unsigned char c;
3623   while (1)
3624     {
3625       fetch_token (token, input, syntax);
3626       c = token->opr.c;
3627       if (__glibc_unlikely (token->type == END_OF_RE))
3628         return -2;
3629       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3630         break;
3631       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3632              ? -2
3633              : num == -1
3634              ? c - '0'
3635              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3636     }
3637   return num;
3638 }
3639 
3640 static void
3641 free_charset (re_charset_t *cset)
     /* [previous][next][first][last][top][bottom][index][help] */
3642 {
3643   re_free (cset->mbchars);
3644 #ifdef _LIBC
3645   re_free (cset->coll_syms);
3646   re_free (cset->equiv_classes);
3647 #endif
3648   re_free (cset->range_starts);
3649   re_free (cset->range_ends);
3650   re_free (cset->char_classes);
3651   re_free (cset);
3652 }
3653 
3654 /* Functions for binary tree operation.  */
3655 
3656 /* Create a tree node.  */
3657 
3658 static bin_tree_t *
3659 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
     /* [previous][next][first][last][top][bottom][index][help] */
3660              re_token_type_t type)
3661 {
3662   re_token_t t = { .type = type };
3663   return create_token_tree (dfa, left, right, &t);
3664 }
3665 
3666 static bin_tree_t *
3667 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
     /* [previous][next][first][last][top][bottom][index][help] */
3668                    const re_token_t *token)
3669 {
3670   bin_tree_t *tree;
3671   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3672     {
3673       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3674 
3675       if (storage == NULL)
3676         return NULL;
3677       storage->next = dfa->str_tree_storage;
3678       dfa->str_tree_storage = storage;
3679       dfa->str_tree_storage_idx = 0;
3680     }
3681   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3682 
3683   tree->parent = NULL;
3684   tree->left = left;
3685   tree->right = right;
3686   tree->token = *token;
3687   tree->token.duplicated = 0;
3688   tree->token.opt_subexp = 0;
3689   tree->first = NULL;
3690   tree->next = NULL;
3691   tree->node_idx = -1;
3692 
3693   if (left != NULL)
3694     left->parent = tree;
3695   if (right != NULL)
3696     right->parent = tree;
3697   return tree;
3698 }
3699 
3700 /* Mark the tree SRC as an optional subexpression.
3701    To be called from preorder or postorder.  */
3702 
3703 static reg_errcode_t
3704 mark_opt_subexp (void *extra, bin_tree_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
3705 {
3706   Idx idx = (uintptr_t) extra;
3707   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3708     node->token.opt_subexp = 1;
3709 
3710   return REG_NOERROR;
3711 }
3712 
3713 /* Free the allocated memory inside NODE. */
3714 
3715 static void
3716 free_token (re_token_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
3717 {
3718   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3719     free_charset (node->opr.mbcset);
3720   else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3721     re_free (node->opr.sbcset);
3722 }
3723 
3724 /* Worker function for tree walking.  Free the allocated memory inside NODE
3725    and its children. */
3726 
3727 static reg_errcode_t
3728 free_tree (void *extra, bin_tree_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
3729 {
3730   free_token (&node->token);
3731   return REG_NOERROR;
3732 }
3733 
3734 
3735 /* Duplicate the node SRC, and return new node.  This is a preorder
3736    visit similar to the one implemented by the generic visitor, but
3737    we need more infrastructure to maintain two parallel trees --- so,
3738    it's easier to duplicate.  */
3739 
3740 static bin_tree_t *
3741 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
3742 {
3743   const bin_tree_t *node;
3744   bin_tree_t *dup_root;
3745   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3746 
3747   for (node = root; ; )
3748     {
3749       /* Create a new tree and link it back to the current parent.  */
3750       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3751       if (*p_new == NULL)
3752         return NULL;
3753       (*p_new)->parent = dup_node;
3754       (*p_new)->token.duplicated = 1;
3755       dup_node = *p_new;
3756 
3757       /* Go to the left node, or up and to the right.  */
3758       if (node->left)
3759         {
3760           node = node->left;
3761           p_new = &dup_node->left;
3762         }
3763       else
3764         {
3765           const bin_tree_t *prev = NULL;
3766           while (node->right == prev || node->right == NULL)
3767             {
3768               prev = node;
3769               node = node->parent;
3770               dup_node = dup_node->parent;
3771               if (!node)
3772                 return dup_root;
3773             }
3774           node = node->right;
3775           p_new = &dup_node->right;
3776         }
3777     }
3778 }

/* [previous][next][first][last][top][bottom][index][help] */