root/maint/gnulib/lib/dfa.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. streq
  2. isasciidigit
  3. isblank
  4. to_uchar
  5. newline_constraint
  6. letter_constraint
  7. other_constraint
  8. succeeds_in_context
  9. prev_newline_dependent
  10. prev_letter_dependent
  11. accepting
  12. accepts_in_context
  13. mbs_to_wchar
  14. prtok
  15. tstbit
  16. setbit
  17. clrbit
  18. zeroset
  19. fillset
  20. notset
  21. equal
  22. emptyset
  23. maybe_realloc
  24. charclass_index
  25. unibyte_word_constituent
  26. char_context
  27. setbit_wc
  28. setbit_case_fold_c
  29. fetch_wc
  30. bracket_fetch_wc
  31. find_pred
  32. parse_bracket_exp
  33. push_lex_state
  34. pop_lex_state
  35. lex
  36. addtok_mb
  37. addtok
  38. addtok_wc
  39. add_utf8_anychar
  40. atom
  41. nsubtoks
  42. copytoks
  43. closure
  44. branch
  45. regexp
  46. dfaparse
  47. copy
  48. alloc_position_set
  49. insert
  50. append
  51. merge_constrained
  52. merge
  53. merge2
  54. delete
  55. replace
  56. state_index
  57. epsclosure
  58. charclass_context
  59. state_separate_contexts
  60. merge_nfa_state
  61. compare
  62. reorder_tokens
  63. dfaoptimize
  64. dfaanalyze
  65. realloc_trans_if_necessary
  66. build_state
  67. transit_state_singlebyte
  68. transit_state
  69. skip_remains_mb
  70. dfaexec_main
  71. dfaexec_mb
  72. dfaexec_sb
  73. dfaexec_noop
  74. dfaexec
  75. dfasuperset
  76. dfaisfast
  77. free_mbdata
  78. dfasupported
  79. maybe_disable_superset_dfa
  80. dfassbuild
  81. dfacomp
  82. dfafree
  83. icatalloc
  84. freelist
  85. enlistnew
  86. enlist
  87. comsubs
  88. addlists
  89. inboth
  90. allocmust
  91. resetmust
  92. freemust
  93. dfamust
  94. dfamustfree
  95. dfaalloc
  96. dfasyntax
  97. dfacopysyntax

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
   3    Foundation, Inc.
   4 
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9 
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14 
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19 
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22 
  23 #include <config.h>
  24 
  25 #include "dfa.h"
  26 
  27 #include "flexmember.h"
  28 #include "idx.h"
  29 #include "verify.h"
  30 
  31 #include <assert.h>
  32 #include <ctype.h>
  33 #include <stdint.h>
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <limits.h>
  37 #include <string.h>
  38 
  39 /* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
  40    understandably cannot deduce that the input comes from a
  41    well-formed regular expression.  There's little point to the
  42    runtime overhead of 'assert' instead of 'assume_nonnull' when the
  43    MMU will check anyway.  */
  44 #define assume_nonnull(x) assume ((x) != NULL)
  45 
  46 static bool
  47 streq (char const *a, char const *b)
     /* [previous][next][first][last][top][bottom][index][help] */
  48 {
  49   return strcmp (a, b) == 0;
  50 }
  51 
  52 static bool
  53 isasciidigit (char c)
     /* [previous][next][first][last][top][bottom][index][help] */
  54 {
  55   return '0' <= c && c <= '9';
  56 }
  57 
  58 #include "gettext.h"
  59 #define _(str) gettext (str)
  60 
  61 #include <wchar.h>
  62 
  63 #include "xalloc.h"
  64 #include "localeinfo.h"
  65 
  66 #ifndef FALLTHROUGH
  67 # if 201710L < __STDC_VERSION__
  68 #  define FALLTHROUGH [[__fallthrough__]]
  69 # elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
  70 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  71 # else
  72 #  define FALLTHROUGH ((void) 0)
  73 # endif
  74 #endif
  75 
  76 #ifndef MIN
  77 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  78 #endif
  79 
  80 /* HPUX defines these as macros in sys/param.h.  */
  81 #ifdef setbit
  82 # undef setbit
  83 #endif
  84 #ifdef clrbit
  85 # undef clrbit
  86 #endif
  87 
  88 /* For code that does not use Gnulib’s isblank module.  */
  89 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
  90 # define isblank dfa_isblank
  91 static int
  92 isblank (int c)
     /* [previous][next][first][last][top][bottom][index][help] */
  93 {
  94   return c == ' ' || c == '\t';
  95 }
  96 #endif
  97 
  98 /* First integer value that is greater than any character code.  */
  99 enum { NOTCHAR = 1 << CHAR_BIT };
 100 
 101 #ifdef UINT_LEAST64_MAX
 102 
 103 /* Number of bits used in a charclass word.  */
 104 enum { CHARCLASS_WORD_BITS = 64 };
 105 
 106 /* This represents part of a character class.  It must be unsigned and
 107    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
 108 typedef uint_least64_t charclass_word;
 109 
 110 /* Part of a charclass initializer that represents 64 bits' worth of a
 111    charclass, where LO and HI are the low and high-order 32 bits of
 112    the 64-bit quantity.  */
 113 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
 114 
 115 #else
 116 /* Fallbacks for pre-C99 hosts that lack 64-bit integers.  */
 117 enum { CHARCLASS_WORD_BITS = 32 };
 118 typedef unsigned long charclass_word;
 119 # define CHARCLASS_PAIR(lo, hi) lo, hi
 120 #endif
 121 
 122 /* An initializer for a charclass whose 32-bit words are A through H.  */
 123 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
 124    {{                                                   \
 125       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
 126       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
 127    }}
 128 
 129 /* The maximum useful value of a charclass_word; all used bits are 1.  */
 130 static charclass_word const CHARCLASS_WORD_MASK
 131   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 132 
 133 /* Number of words required to hold a bit for every character.  */
 134 enum
 135 {
 136   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 137 };
 138 
 139 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 140 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 141 
 142 /* Convert a possibly-signed character to an unsigned character.  This is
 143    a bit safer than casting to unsigned char, since it catches some type
 144    errors that the cast doesn't.  */
 145 static unsigned char
 146 to_uchar (char ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 147 {
 148   return ch;
 149 }
 150 
 151 /* Contexts tell us whether a character is a newline or a word constituent.
 152    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 153    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 154    a particular character class.
 155 
 156    A state also stores a context value, which is a bitmask of CTX_* values.
 157    A state's context represents a set of characters that the state's
 158    predecessors must match.  For example, a state whose context does not
 159    include CTX_LETTER will never have transitions where the previous
 160    character is a word constituent.  A state whose context is CTX_ANY
 161    might have transitions from any character.  */
 162 
 163 enum
 164   {
 165     CTX_NONE = 1,
 166     CTX_LETTER = 2,
 167     CTX_NEWLINE = 4,
 168     CTX_ANY = 7
 169   };
 170 
 171 /* Sometimes characters can only be matched depending on the surrounding
 172    context.  Such context decisions depend on what the previous character
 173    was, and the value of the current (lookahead) character.  Context
 174    dependent constraints are encoded as 9-bit integers.  Each bit that
 175    is set indicates that the constraint succeeds in the corresponding
 176    context.
 177 
 178    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 179    bit 3-5  - valid contexts when next character is CTX_LETTER
 180    bit 0-2  - valid contexts when next character is CTX_NONE
 181 
 182    succeeds_in_context determines whether a given constraint
 183    succeeds in a particular context.  Prev is a bitmask of possible
 184    context values for the previous character, curr is the (single-bit)
 185    context value for the lookahead character.  */
 186 static int
 187 newline_constraint (int constraint)
     /* [previous][next][first][last][top][bottom][index][help] */
 188 {
 189   return (constraint >> 6) & 7;
 190 }
 191 static int
 192 letter_constraint (int constraint)
     /* [previous][next][first][last][top][bottom][index][help] */
 193 {
 194   return (constraint >> 3) & 7;
 195 }
 196 static int
 197 other_constraint (int constraint)
     /* [previous][next][first][last][top][bottom][index][help] */
 198 {
 199   return constraint & 7;
 200 }
 201 
 202 static bool
 203 succeeds_in_context (int constraint, int prev, int curr)
     /* [previous][next][first][last][top][bottom][index][help] */
 204 {
 205   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 206               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 207               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 208              & prev);
 209 }
 210 
 211 /* The following describe what a constraint depends on.  */
 212 static bool
 213 prev_newline_dependent (int constraint)
     /* [previous][next][first][last][top][bottom][index][help] */
 214 {
 215   return ((constraint ^ constraint >> 2) & 0111) != 0;
 216 }
 217 static bool
 218 prev_letter_dependent (int constraint)
     /* [previous][next][first][last][top][bottom][index][help] */
 219 {
 220   return ((constraint ^ constraint >> 1) & 0111) != 0;
 221 }
 222 
 223 /* Tokens that match the empty string subject to some constraint actually
 224    work by applying that constraint to determine what may follow them,
 225    taking into account what has gone before.  The following values are
 226    the constraints corresponding to the special tokens previously defined.  */
 227 enum
 228   {
 229     NO_CONSTRAINT = 0777,
 230     BEGLINE_CONSTRAINT = 0444,
 231     ENDLINE_CONSTRAINT = 0700,
 232     BEGWORD_CONSTRAINT = 0050,
 233     ENDWORD_CONSTRAINT = 0202,
 234     LIMWORD_CONSTRAINT = 0252,
 235     NOTLIMWORD_CONSTRAINT = 0525
 236   };
 237 
 238 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 239    are operators and others are terminal symbols.  Most (but not all) of these
 240    codes are returned by the lexical analyzer.  */
 241 
 242 typedef ptrdiff_t token;
 243 static token const TOKEN_MAX = PTRDIFF_MAX;
 244 
 245 /* States are indexed by state_num values.  These are normally
 246    nonnegative but -1 is used as a special value.  */
 247 typedef ptrdiff_t state_num;
 248 
 249 /* Predefined token values.  */
 250 enum
 251 {
 252   END = -1,                     /* END is a terminal symbol that matches the
 253                                    end of input; any value of END or less in
 254                                    the parse tree is such a symbol.  Accepting
 255                                    states of the DFA are those that would have
 256                                    a transition on END.  This is -1, not some
 257                                    more-negative value, to tweak the speed of
 258                                    comparisons to END.  */
 259 
 260   /* Ordinary character values are terminal symbols that match themselves.  */
 261 
 262   /* CSET must come last in the following list of special tokens.  Otherwise,
 263      the list order matters only for performance.  Related special tokens
 264      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
 265      || CSET <= t) can be done with a single machine-level comparison.  */
 266 
 267   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 268                                    the empty string.  */
 269 
 270   QMARK,                        /* QMARK is an operator of one argument that
 271                                    matches zero or one occurrences of its
 272                                    argument.  */
 273 
 274   STAR,                         /* STAR is an operator of one argument that
 275                                    matches the Kleene closure (zero or more
 276                                    occurrences) of its argument.  */
 277 
 278   PLUS,                         /* PLUS is an operator of one argument that
 279                                    matches the positive closure (one or more
 280                                    occurrences) of its argument.  */
 281 
 282   REPMN,                        /* REPMN is a lexical token corresponding
 283                                    to the {m,n} construct.  REPMN never
 284                                    appears in the compiled token vector.  */
 285 
 286   CAT,                          /* CAT is an operator of two arguments that
 287                                    matches the concatenation of its
 288                                    arguments.  CAT is never returned by the
 289                                    lexical analyzer.  */
 290 
 291   OR,                           /* OR is an operator of two arguments that
 292                                    matches either of its arguments.  */
 293 
 294   LPAREN,                       /* LPAREN never appears in the parse tree,
 295                                    it is only a lexeme.  */
 296 
 297   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 298 
 299   WCHAR,                        /* Only returned by lex.  wctok contains
 300                                    the wide character representation.  */
 301 
 302   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 303                                    a valid multibyte (or single byte) character.
 304                                    It is used only if MB_CUR_MAX > 1.  */
 305 
 306   BEG,                          /* BEG is an initial symbol that matches the
 307                                    beginning of input.  */
 308 
 309   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 310                                    the empty string at the beginning of a
 311                                    line.  */
 312 
 313   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 314                                    the empty string at the end of a line.  */
 315 
 316   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 317                                    the empty string at the beginning of a
 318                                    word.  */
 319 
 320   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 321                                    the empty string at the end of a word.  */
 322 
 323   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 324                                    the empty string at the beginning or the
 325                                    end of a word.  */
 326 
 327   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 328                                    matches the empty string not at
 329                                    the beginning or end of a word.  */
 330 
 331   BACKREF,                      /* BACKREF is generated by \<digit>
 332                                    or by any other construct that
 333                                    is not completely handled.  If the scanner
 334                                    detects a transition on backref, it returns
 335                                    a kind of "semi-success" indicating that
 336                                    the match will have to be verified with
 337                                    a backtracking matcher.  */
 338 
 339   MBCSET,                       /* MBCSET is similar to CSET, but for
 340                                    multibyte characters.  */
 341 
 342   CSET                          /* CSET and (and any value greater) is a
 343                                    terminal symbol that matches any of a
 344                                    class of characters.  */
 345 };
 346 
 347 
 348 /* States of the recognizer correspond to sets of positions in the parse
 349    tree, together with the constraints under which they may be matched.
 350    So a position is encoded as an index into the parse tree together with
 351    a constraint.  */
 352 typedef struct
 353 {
 354   idx_t index;                  /* Index into the parse array.  */
 355   unsigned int constraint;      /* Constraint for matching this position.  */
 356 } position;
 357 
 358 /* Sets of positions are stored as arrays.  */
 359 typedef struct
 360 {
 361   position *elems;              /* Elements of this position set.  */
 362   idx_t nelem;                  /* Number of elements in this set.  */
 363   idx_t alloc;                  /* Number of elements allocated in ELEMS.  */
 364 } position_set;
 365 
 366 /* A state of the dfa consists of a set of positions, some flags,
 367    and the token value of the lowest-numbered position of the state that
 368    contains an END token.  */
 369 typedef struct
 370 {
 371   size_t hash;                  /* Hash of the positions of this state.  */
 372   position_set elems;           /* Positions this state could match.  */
 373   unsigned char context;        /* Context from previous state.  */
 374   unsigned short constraint;    /* Constraint for this state to accept.  */
 375   position_set mbps;            /* Positions which can match multibyte
 376                                    characters or the follows, e.g., period.
 377                                    Used only if MB_CUR_MAX > 1.  */
 378   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 379                                    negative if the state does not have
 380                                    ANYCHAR.  */
 381 } dfa_state;
 382 
 383 /* Maximum for any transition table count.  This should be at least 3,
 384    for the initial state setup.  */
 385 enum { MAX_TRCOUNT = 1024 };
 386 
 387 /* A bracket operator.
 388    e.g., [a-c], [[:alpha:]], etc.  */
 389 struct mb_char_classes
 390 {
 391   ptrdiff_t cset;
 392   bool invert;
 393   wchar_t *chars;               /* Normal characters.  */
 394   idx_t nchars;
 395   idx_t nchars_alloc;
 396 };
 397 
 398 struct regex_syntax
 399 {
 400   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 401   reg_syntax_t syntax_bits;
 402   bool syntax_bits_set;
 403 
 404   /* Flag for case-folding letters into sets.  */
 405   bool case_fold;
 406 
 407   /* True if ^ and $ match only the start and end of data, and do not match
 408      end-of-line within data.  */
 409   bool anchor;
 410 
 411   /* End-of-line byte in data.  */
 412   unsigned char eolbyte;
 413 
 414   /* Cache of char-context values.  */
 415   char sbit[NOTCHAR];
 416 
 417   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 418      multibyte character.  */
 419   bool never_trail[NOTCHAR];
 420 
 421   /* Set of characters considered letters.  */
 422   charclass letters;
 423 
 424   /* Set of characters that are newline.  */
 425   charclass newline;
 426 };
 427 
 428 /* Lexical analyzer.  All the dross that deals with the obnoxious
 429    GNU Regex syntax bits is located here.  The poor, suffering
 430    reader is referred to the GNU Regex documentation for the
 431    meaning of the @#%!@#%^!@ syntax bits.  */
 432 struct lexer_state
 433 {
 434   char const *ptr;      /* Pointer to next input character.  */
 435   idx_t left;           /* Number of characters remaining.  */
 436   token lasttok;        /* Previous token returned; initially END.  */
 437   idx_t parens;         /* Count of outstanding left parens.  */
 438   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 439 
 440   /* Wide character representation of the current multibyte character,
 441      or WEOF if there was an encoding error.  Used only if
 442      MB_CUR_MAX > 1.  */
 443   wint_t wctok;
 444 
 445   /* The most recently analyzed multibyte bracket expression.  */
 446   struct mb_char_classes brack;
 447 
 448   /* We're separated from beginning or (, | only by zero-width characters.  */
 449   bool laststart;
 450 };
 451 
 452 /* Recursive descent parser for regular expressions.  */
 453 
 454 struct parser_state
 455 {
 456   token tok;               /* Lookahead token.  */
 457   idx_t depth;             /* Current depth of a hypothetical stack
 458                               holding deferred productions.  This is
 459                               used to determine the depth that will be
 460                               required of the real stack later on in
 461                               dfaanalyze.  */
 462 };
 463 
 464 /* A compiled regular expression.  */
 465 struct dfa
 466 {
 467   /* Fields filled by the scanner.  */
 468   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 469   idx_t cindex;                 /* Index for adding new charclasses.  */
 470   idx_t calloc;                 /* Number of charclasses allocated.  */
 471   ptrdiff_t canychar;           /* Index of anychar class, or -1.  */
 472 
 473   /* Scanner state */
 474   struct lexer_state lex;
 475 
 476   /* Parser state */
 477   struct parser_state parse;
 478 
 479   /* Fields filled by the parser.  */
 480   token *tokens;                /* Postfix parse array.  */
 481   idx_t tindex;                 /* Index for adding new tokens.  */
 482   idx_t talloc;                 /* Number of tokens currently allocated.  */
 483   idx_t depth;                  /* Depth required of an evaluation stack
 484                                    used for depth-first traversal of the
 485                                    parse tree.  */
 486   idx_t nleaves;                /* Number of non-EMPTY leaves
 487                                    in the parse tree.  */
 488   idx_t nregexps;               /* Count of parallel regexps being built
 489                                    with dfaparse.  */
 490   bool fast;                    /* The DFA is fast.  */
 491   bool epsilon;                 /* Does a token match only the empty string?  */
 492   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
 493   mbstate_t mbs;                /* Multibyte conversion state.  */
 494 
 495   /* The following are valid only if MB_CUR_MAX > 1.  */
 496 
 497   /* The value of multibyte_prop[i] is defined by following rule.
 498      if tokens[i] < NOTCHAR
 499      bit 0 : tokens[i] is the first byte of a character, including
 500      single-byte characters.
 501      bit 1 : tokens[i] is the last byte of a character, including
 502      single-byte characters.
 503 
 504      e.g.
 505      tokens
 506      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 507      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 508      multibyte_prop
 509      = 3     , 1               ,  0              ,  2              , 3
 510    */
 511   char *multibyte_prop;
 512 
 513   /* Fields filled by the superset.  */
 514   struct dfa *superset;             /* Hint of the dfa.  */
 515 
 516   /* Fields filled by the state builder.  */
 517   dfa_state *states;            /* States of the dfa.  */
 518   state_num sindex;             /* Index for adding new states.  */
 519   idx_t salloc;                 /* Number of states currently allocated.  */
 520 
 521   /* Fields filled by the parse tree->NFA conversion.  */
 522   position_set *follows;        /* Array of follow sets, indexed by position
 523                                    index.  The follow of a position is the set
 524                                    of positions containing characters that
 525                                    could conceivably follow a character
 526                                    matching the given position in a string
 527                                    matching the regexp.  Allocated to the
 528                                    maximum possible position index.  */
 529   bool searchflag;              /* We are supposed to build a searching
 530                                    as opposed to an exact matcher.  A searching
 531                                    matcher finds the first and shortest string
 532                                    matching a regexp anywhere in the buffer,
 533                                    whereas an exact matcher finds the longest
 534                                    string matching, but anchored to the
 535                                    beginning of the buffer.  */
 536 
 537   /* Fields filled by dfaanalyze.  */
 538   int *constraints;             /* Array of union of accepting constraints
 539                                    in the follow of a position.  */
 540   int *separates;               /* Array of contexts on follow of a
 541                                    position.  */
 542 
 543   /* Fields filled by dfaexec.  */
 544   state_num tralloc;            /* Number of transition tables that have
 545                                    slots so far, not counting trans[-1] and
 546                                    trans[-2].  */
 547   int trcount;                  /* Number of transition tables that have
 548                                    been built, other than for initial
 549                                    states.  */
 550   int min_trcount;              /* Number of initial states.  Equivalently,
 551                                    the minimum state number for which trcount
 552                                    counts transitions.  */
 553   state_num **trans;            /* Transition tables for states that can
 554                                    never accept.  If the transitions for a
 555                                    state have not yet been computed, or the
 556                                    state could possibly accept, its entry in
 557                                    this table is NULL.  This points to two
 558                                    past the start of the allocated array,
 559                                    and trans[-1] and trans[-2] are always
 560                                    NULL.  */
 561   state_num **fails;            /* Transition tables after failing to accept
 562                                    on a state that potentially could do so.
 563                                    If trans[i] is non-null, fails[i] must
 564                                    be null.  */
 565   char *success;                /* Table of acceptance conditions used in
 566                                    dfaexec and computed in build_state.  */
 567   state_num *newlines;          /* Transitions on newlines.  The entry for a
 568                                    newline in any transition table is always
 569                                    -1 so we can count lines without wasting
 570                                    too many cycles.  The transition for a
 571                                    newline is stored separately and handled
 572                                    as a special case.  Newline is also used
 573                                    as a sentinel at the end of the buffer.  */
 574   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 575                                    context in multibyte locales, in which we
 576                                    do not distinguish between their contexts,
 577                                    as not supported word.  */
 578   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 579   state_num **mb_trans;         /* Transition tables for states with
 580                                    ANYCHAR.  */
 581   state_num mb_trcount;         /* Number of transition tables for states with
 582                                    ANYCHAR that have actually been built.  */
 583 
 584   /* Syntax configuration.  This is near the end so that dfacopysyntax
 585      can memset up to here.  */
 586   struct regex_syntax syntax;
 587 
 588   /* Information derived from the locale.  This is at the end so that
 589      a quick memset need not clear it specially.  */
 590 
 591   /* dfaexec implementation.  */
 592   char *(*dfaexec) (struct dfa *, char const *, char *,
 593                     bool, idx_t *, bool *);
 594 
 595   /* Other cached information derived from the locale.  */
 596   struct localeinfo localeinfo;
 597 };
 598 
 599 /* User access to dfa internals.  */
 600 
 601 /* S could possibly be an accepting state of R.  */
 602 static bool
 603 accepting (state_num s, struct dfa const *r)
     /* [previous][next][first][last][top][bottom][index][help] */
 604 {
 605   return r->states[s].constraint != 0;
 606 }
 607 
 608 /* STATE accepts in the specified context.  */
 609 static bool
 610 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
 611 {
 612   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 613 }
 614 
 615 static void regexp (struct dfa *dfa);
 616 
 617 /* Store into *PWC the result of converting the leading bytes of the
 618    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 619    and updating the conversion state in *D.  On conversion error,
 620    convert just a single byte, to WEOF.  Return the number of bytes
 621    converted.
 622 
 623    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
 624 
 625    * PWC points to wint_t, not to wchar_t.
 626    * The last arg is a dfa *D instead of merely a multibyte conversion
 627      state D->mbs.
 628    * N is idx_t not size_t, and must be at least 1.
 629    * S[N - 1] must be a sentinel byte.
 630    * Shift encodings are not supported.
 631    * The return value is always in the range 1..N.
 632    * D->mbs is always valid afterwards.
 633    * *PWC is always set to something.  */
 634 static int
 635 mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
     /* [previous][next][first][last][top][bottom][index][help] */
 636 {
 637   unsigned char uc = s[0];
 638   wint_t wc = d->localeinfo.sbctowc[uc];
 639 
 640   if (wc == WEOF)
 641     {
 642       wchar_t wch;
 643       size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
 644       if (0 < nbytes && nbytes < (size_t) -2)
 645         {
 646           *pwc = wch;
 647           return nbytes;
 648         }
 649       memset (&d->mbs, 0, sizeof d->mbs);
 650     }
 651 
 652   *pwc = wc;
 653   return 1;
 654 }
 655 
 656 #ifdef DEBUG
 657 
 658 static void
 659 prtok (token t)
     /* [previous][next][first][last][top][bottom][index][help] */
 660 {
 661   if (t <= END)
 662     fprintf (stderr, "END");
 663   else if (0 <= t && t < NOTCHAR)
 664     {
 665       unsigned int ch = t;
 666       fprintf (stderr, "0x%02x", ch);
 667     }
 668   else
 669     {
 670       char const *s;
 671       switch (t)
 672         {
 673         case BEG:
 674           s = "BEG";
 675           break;
 676         case EMPTY:
 677           s = "EMPTY";
 678           break;
 679         case BACKREF:
 680           s = "BACKREF";
 681           break;
 682         case BEGLINE:
 683           s = "BEGLINE";
 684           break;
 685         case ENDLINE:
 686           s = "ENDLINE";
 687           break;
 688         case BEGWORD:
 689           s = "BEGWORD";
 690           break;
 691         case ENDWORD:
 692           s = "ENDWORD";
 693           break;
 694         case LIMWORD:
 695           s = "LIMWORD";
 696           break;
 697         case NOTLIMWORD:
 698           s = "NOTLIMWORD";
 699           break;
 700         case QMARK:
 701           s = "QMARK";
 702           break;
 703         case STAR:
 704           s = "STAR";
 705           break;
 706         case PLUS:
 707           s = "PLUS";
 708           break;
 709         case CAT:
 710           s = "CAT";
 711           break;
 712         case OR:
 713           s = "OR";
 714           break;
 715         case LPAREN:
 716           s = "LPAREN";
 717           break;
 718         case RPAREN:
 719           s = "RPAREN";
 720           break;
 721         case ANYCHAR:
 722           s = "ANYCHAR";
 723           break;
 724         case MBCSET:
 725           s = "MBCSET";
 726           break;
 727         default:
 728           s = "CSET";
 729           break;
 730         }
 731       fprintf (stderr, "%s", s);
 732     }
 733 }
 734 #endif /* DEBUG */
 735 
 736 /* Stuff pertaining to charclasses.  */
 737 
 738 static bool
 739 tstbit (unsigned int b, charclass const *c)
     /* [previous][next][first][last][top][bottom][index][help] */
 740 {
 741   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 742 }
 743 
 744 static void
 745 setbit (unsigned int b, charclass *c)
     /* [previous][next][first][last][top][bottom][index][help] */
 746 {
 747   charclass_word one = 1;
 748   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 749 }
 750 
 751 static void
 752 clrbit (unsigned int b, charclass *c)
     /* [previous][next][first][last][top][bottom][index][help] */
 753 {
 754   charclass_word one = 1;
 755   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 756 }
 757 
 758 static void
 759 zeroset (charclass *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 760 {
 761   memset (s, 0, sizeof *s);
 762 }
 763 
 764 static void
 765 fillset (charclass *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 766 {
 767   for (int i = 0; i < CHARCLASS_WORDS; i++)
 768     s->w[i] = CHARCLASS_WORD_MASK;
 769 }
 770 
 771 static void
 772 notset (charclass *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 773 {
 774   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 775     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 776 }
 777 
 778 static bool
 779 equal (charclass const *s1, charclass const *s2)
     /* [previous][next][first][last][top][bottom][index][help] */
 780 {
 781   charclass_word w = 0;
 782   for (int i = 0; i < CHARCLASS_WORDS; i++)
 783     w |= s1->w[i] ^ s2->w[i];
 784   return w == 0;
 785 }
 786 
 787 static bool
 788 emptyset (charclass const *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 789 {
 790   charclass_word w = 0;
 791   for (int i = 0; i < CHARCLASS_WORDS; i++)
 792     w |= s->w[i];
 793   return w == 0;
 794 }
 795 
 796 /* Ensure that the array addressed by PA holds at least I + 1 items.
 797    Either return PA, or reallocate the array and return its new address.
 798    Although PA may be null, the returned value is never null.
 799 
 800    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 801    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 802    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 803    ITEM_SIZE is the size of one item; it must be positive.
 804    Avoid O(N**2) behavior on arrays growing linearly.  */
 805 static void *
 806 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
     /* [previous][next][first][last][top][bottom][index][help] */
 807                ptrdiff_t nitems_max, idx_t item_size)
 808 {
 809   if (i < *nitems)
 810     return pa;
 811   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 812 }
 813 
 814 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 815 static idx_t
 816 charclass_index (struct dfa *d, charclass const *s)
     /* [previous][next][first][last][top][bottom][index][help] */
 817 {
 818   idx_t i;
 819 
 820   for (i = 0; i < d->cindex; ++i)
 821     if (equal (s, &d->charclasses[i]))
 822       return i;
 823   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 824                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 825   ++d->cindex;
 826   d->charclasses[i] = *s;
 827   return i;
 828 }
 829 
 830 static bool
 831 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
     /* [previous][next][first][last][top][bottom][index][help] */
 832 {
 833   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 834 }
 835 
 836 static int
 837 char_context (struct dfa const *dfa, unsigned char c)
     /* [previous][next][first][last][top][bottom][index][help] */
 838 {
 839   if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
 840     return CTX_NEWLINE;
 841   if (unibyte_word_constituent (dfa, c))
 842     return CTX_LETTER;
 843   return CTX_NONE;
 844 }
 845 
 846 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
 847    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 848    this may happen when folding case in weird Turkish locales where
 849    dotless i/dotted I are not included in the chosen character set.
 850    Return whether a bit was set in the charclass.  */
 851 static bool
 852 setbit_wc (wint_t wc, charclass *c)
     /* [previous][next][first][last][top][bottom][index][help] */
 853 {
 854   int b = wctob (wc);
 855   if (b < 0)
 856     return false;
 857 
 858   setbit (b, c);
 859   return true;
 860 }
 861 
 862 /* Set a bit for B and its case variants in the charclass C.
 863    MB_CUR_MAX must be 1.  */
 864 static void
 865 setbit_case_fold_c (int b, charclass *c)
     /* [previous][next][first][last][top][bottom][index][help] */
 866 {
 867   int ub = toupper (b);
 868   for (int i = 0; i < NOTCHAR; i++)
 869     if (toupper (i) == ub)
 870       setbit (i, c);
 871 }
 872 
 873 /* Fetch the next lexical input character from the pattern.  There
 874    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 875    value of the character or to WEOF depending on whether the input is
 876    a valid multibyte character (possibly of length 1).  Then return
 877    the next input byte value, except return EOF if the input is a
 878    multibyte character of length greater than 1.  */
 879 static int
 880 fetch_wc (struct dfa *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
 881 {
 882   int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 883                              dfa);
 884   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 885   dfa->lex.ptr += nbytes;
 886   dfa->lex.left -= nbytes;
 887   return c;
 888 }
 889 
 890 /* If there is no more input, report an error about unbalanced brackets.
 891    Otherwise, behave as with fetch_wc (DFA).  */
 892 static int
 893 bracket_fetch_wc (struct dfa *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
 894 {
 895   if (! dfa->lex.left)
 896     dfaerror (_("unbalanced ["));
 897   return fetch_wc (dfa);
 898 }
 899 
 900 typedef int predicate (int);
 901 
 902 /* The following list maps the names of the Posix named character classes
 903    to predicate functions that determine whether a given character is in
 904    the class.  The leading [ has already been eaten by the lexical
 905    analyzer.  */
 906 struct dfa_ctype
 907 {
 908   const char *name;
 909   predicate *func;
 910   bool single_byte_only;
 911 };
 912 
 913 static const struct dfa_ctype prednames[] = {
 914   {"alpha", isalpha, false},
 915   {"upper", isupper, false},
 916   {"lower", islower, false},
 917   {"digit", isdigit, true},
 918   {"xdigit", isxdigit, false},
 919   {"space", isspace, false},
 920   {"punct", ispunct, false},
 921   {"alnum", isalnum, false},
 922   {"print", isprint, false},
 923   {"graph", isgraph, false},
 924   {"cntrl", iscntrl, false},
 925   {"blank", isblank, false},
 926   {NULL, NULL, false}
 927 };
 928 
 929 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 930 find_pred (const char *str)
     /* [previous][next][first][last][top][bottom][index][help] */
 931 {
 932   for (int i = 0; prednames[i].name; i++)
 933     if (streq (str, prednames[i].name))
 934       return &prednames[i];
 935   return NULL;
 936 }
 937 
 938 /* Parse a bracket expression, which possibly includes multibyte
 939    characters.  */
 940 static token
 941 parse_bracket_exp (struct dfa *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
 942 {
 943   /* This is a bracket expression that dfaexec is known to
 944      process correctly.  */
 945   bool known_bracket_exp = true;
 946 
 947   /* Used to warn about [:space:].
 948      Bit 0 = first character is a colon.
 949      Bit 1 = last character is a colon.
 950      Bit 2 = includes any other character but a colon.
 951      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
 952   int colon_warning_state;
 953 
 954   dfa->lex.brack.nchars = 0;
 955   charclass ccl;
 956   zeroset (&ccl);
 957   int c = bracket_fetch_wc (dfa);
 958   bool invert = c == '^';
 959   if (invert)
 960     {
 961       c = bracket_fetch_wc (dfa);
 962       known_bracket_exp = dfa->localeinfo.simple;
 963     }
 964   wint_t wc = dfa->lex.wctok;
 965   int c1;
 966   wint_t wc1;
 967   colon_warning_state = (c == ':');
 968   do
 969     {
 970       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
 971       colon_warning_state &= ~2;
 972 
 973       /* Note that if we're looking at some other [:...:] construct,
 974          we just treat it as a bunch of ordinary characters.  We can do
 975          this because we assume regex has checked for syntax errors before
 976          dfa is ever called.  */
 977       if (c == '[')
 978         {
 979           c1 = bracket_fetch_wc (dfa);
 980           wc1 = dfa->lex.wctok;
 981 
 982           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
 983               || c1 == '.' || c1 == '=')
 984             {
 985               enum { MAX_BRACKET_STRING_LEN = 32 };
 986               char str[MAX_BRACKET_STRING_LEN + 1];
 987               int len = 0;
 988               for (;;)
 989                 {
 990                   c = bracket_fetch_wc (dfa);
 991                   if (dfa->lex.left == 0
 992                       || (c == c1 && dfa->lex.ptr[0] == ']'))
 993                     break;
 994                   if (len < MAX_BRACKET_STRING_LEN)
 995                     str[len++] = c;
 996                   else
 997                     /* This is in any case an invalid class name.  */
 998                     str[0] = '\0';
 999                 }
1000               str[len] = '\0';
1001 
1002               /* Fetch bracket.  */
1003               c = bracket_fetch_wc (dfa);
1004               wc = dfa->lex.wctok;
1005               if (c1 == ':')
1006                 /* Build character class.  POSIX allows character
1007                    classes to match multicharacter collating elements,
1008                    but the regex code does not support that, so do not
1009                    worry about that possibility.  */
1010                 {
1011                   char const *class
1012                     = (dfa->syntax.case_fold && (streq (str, "upper")
1013                                                  || streq (str, "lower"))
1014                        ? "alpha" : str);
1015                   const struct dfa_ctype *pred = find_pred (class);
1016                   if (!pred)
1017                     dfaerror (_("invalid character class"));
1018 
1019                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1020                     known_bracket_exp = false;
1021                   else
1022                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1023                       if (pred->func (c2))
1024                         setbit (c2, &ccl);
1025                 }
1026               else
1027                 known_bracket_exp = false;
1028 
1029               colon_warning_state |= 8;
1030 
1031               /* Fetch new lookahead character.  */
1032               c1 = bracket_fetch_wc (dfa);
1033               wc1 = dfa->lex.wctok;
1034               continue;
1035             }
1036 
1037           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1038              are already set up.  */
1039         }
1040 
1041       if (c == '\\'
1042           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1043         {
1044           c = bracket_fetch_wc (dfa);
1045           wc = dfa->lex.wctok;
1046         }
1047 
1048       if (c1 == NOTCHAR)
1049         {
1050           c1 = bracket_fetch_wc (dfa);
1051           wc1 = dfa->lex.wctok;
1052         }
1053 
1054       if (c1 == '-')
1055         /* build range characters.  */
1056         {
1057           int c2 = bracket_fetch_wc (dfa);
1058           wint_t wc2 = dfa->lex.wctok;
1059 
1060           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1061              Treat it like [-a[.aa.]] while parsing it, and
1062              remember that the set is unknown.  */
1063           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1064             {
1065               known_bracket_exp = false;
1066               c2 = ']';
1067             }
1068 
1069           if (c2 == ']')
1070             {
1071               /* In the case [x-], the - is an ordinary hyphen,
1072                  which is left in c1, the lookahead character.  */
1073               dfa->lex.ptr--;
1074               dfa->lex.left++;
1075             }
1076           else
1077             {
1078               if (c2 == '\\' && (dfa->syntax.syntax_bits
1079                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1080                 {
1081                   c2 = bracket_fetch_wc (dfa);
1082                   wc2 = dfa->lex.wctok;
1083                 }
1084 
1085               colon_warning_state |= 8;
1086               c1 = bracket_fetch_wc (dfa);
1087               wc1 = dfa->lex.wctok;
1088 
1089               /* Treat [x-y] as a range if x != y.  */
1090               if (wc != wc2 || wc == WEOF)
1091                 {
1092                   if (dfa->localeinfo.simple
1093                       || (isasciidigit (c) & isasciidigit (c2)))
1094                     {
1095                       for (int ci = c; ci <= c2; ci++)
1096                         if (dfa->syntax.case_fold && isalpha (ci))
1097                           setbit_case_fold_c (ci, &ccl);
1098                         else
1099                           setbit (ci, &ccl);
1100                     }
1101                   else
1102                     known_bracket_exp = false;
1103 
1104                   continue;
1105                 }
1106             }
1107         }
1108 
1109       colon_warning_state |= (c == ':') ? 2 : 4;
1110 
1111       if (!dfa->localeinfo.multibyte)
1112         {
1113           if (dfa->syntax.case_fold && isalpha (c))
1114             setbit_case_fold_c (c, &ccl);
1115           else
1116             setbit (c, &ccl);
1117           continue;
1118         }
1119 
1120       if (wc == WEOF)
1121         known_bracket_exp = false;
1122       else
1123         {
1124           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1125           int n = (dfa->syntax.case_fold
1126                    ? case_folded_counterparts (wc, folded + 1) + 1
1127                    : 1);
1128           folded[0] = wc;
1129           for (int i = 0; i < n; i++)
1130             if (!setbit_wc (folded[i], &ccl))
1131               {
1132                 dfa->lex.brack.chars
1133                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1134                                    &dfa->lex.brack.nchars_alloc, -1,
1135                                    sizeof *dfa->lex.brack.chars);
1136                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1137               }
1138         }
1139     }
1140   while ((wc = wc1, (c = c1) != ']'));
1141 
1142   if (colon_warning_state == 7)
1143     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1144 
1145   if (! known_bracket_exp)
1146     return BACKREF;
1147 
1148   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1149     {
1150       dfa->lex.brack.invert = invert;
1151       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1152       return MBCSET;
1153     }
1154 
1155   if (invert)
1156     {
1157       notset (&ccl);
1158       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1159         clrbit ('\n', &ccl);
1160     }
1161 
1162   return CSET + charclass_index (dfa, &ccl);
1163 }
1164 
1165 struct lexptr
1166 {
1167   char const *ptr;
1168   idx_t left;
1169 };
1170 
1171 static void
1172 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
     /* [previous][next][first][last][top][bottom][index][help] */
1173 {
1174   ls->ptr = dfa->lex.ptr;
1175   ls->left = dfa->lex.left;
1176   dfa->lex.ptr = s;
1177   dfa->lex.left = strlen (s);
1178 }
1179 
1180 static void
1181 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
     /* [previous][next][first][last][top][bottom][index][help] */
1182 {
1183   dfa->lex.ptr = ls->ptr;
1184   dfa->lex.left = ls->left;
1185 }
1186 
1187 static token
1188 lex (struct dfa *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
1189 {
1190   bool backslash = false;
1191 
1192   /* Basic plan: We fetch a character.  If it's a backslash,
1193      we set the backslash flag and go through the loop again.
1194      On the plus side, this avoids having a duplicate of the
1195      main switch inside the backslash case.  On the minus side,
1196      it means that just about every case begins with
1197      "if (backslash) ...".  */
1198   for (int i = 0; i < 2; ++i)
1199     {
1200       if (! dfa->lex.left)
1201         return dfa->lex.lasttok = END;
1202       int c = fetch_wc (dfa);
1203 
1204       switch (c)
1205         {
1206         case '\\':
1207           if (backslash)
1208             goto normal_char;
1209           if (dfa->lex.left == 0)
1210             dfaerror (_("unfinished \\ escape"));
1211           backslash = true;
1212           break;
1213 
1214         case '^':
1215           if (backslash)
1216             goto normal_char;
1217           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1218               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1219               || dfa->lex.lasttok == OR)
1220             return dfa->lex.lasttok = BEGLINE;
1221           goto normal_char;
1222 
1223         case '$':
1224           if (backslash)
1225             goto normal_char;
1226           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1227               || dfa->lex.left == 0
1228               || ((dfa->lex.left
1229                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1230                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1231                                    & (dfa->lex.ptr[0] == '\\')]
1232                       == ')'))
1233               || ((dfa->lex.left
1234                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1235                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1236                                    & (dfa->lex.ptr[0] == '\\')]
1237                       == '|'))
1238               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1239                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1240             return dfa->lex.lasttok = ENDLINE;
1241           goto normal_char;
1242 
1243         case '1':
1244         case '2':
1245         case '3':
1246         case '4':
1247         case '5':
1248         case '6':
1249         case '7':
1250         case '8':
1251         case '9':
1252           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1253             {
1254               dfa->lex.laststart = false;
1255               return dfa->lex.lasttok = BACKREF;
1256             }
1257           goto normal_char;
1258 
1259         case '`':
1260           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1261             {
1262               /* FIXME: should be beginning of string */
1263               return dfa->lex.lasttok = BEGLINE;
1264             }
1265           goto normal_char;
1266 
1267         case '\'':
1268           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1269             {
1270               /* FIXME: should be end of string */
1271               return dfa->lex.lasttok = ENDLINE;
1272             }
1273           goto normal_char;
1274 
1275         case '<':
1276           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1277             return dfa->lex.lasttok = BEGWORD;
1278           goto normal_char;
1279 
1280         case '>':
1281           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1282             return dfa->lex.lasttok = ENDWORD;
1283           goto normal_char;
1284 
1285         case 'b':
1286           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1287             return dfa->lex.lasttok = LIMWORD;
1288           goto normal_char;
1289 
1290         case 'B':
1291           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1292             return dfa->lex.lasttok = NOTLIMWORD;
1293           goto normal_char;
1294 
1295         case '?':
1296           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1297             goto normal_char;
1298           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1299             goto normal_char;
1300           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1301               && dfa->lex.laststart)
1302             goto normal_char;
1303           return dfa->lex.lasttok = QMARK;
1304 
1305         case '*':
1306           if (backslash)
1307             goto normal_char;
1308           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1309               && dfa->lex.laststart)
1310             goto normal_char;
1311           return dfa->lex.lasttok = STAR;
1312 
1313         case '+':
1314           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1315             goto normal_char;
1316           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1317             goto normal_char;
1318           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1319               && dfa->lex.laststart)
1320             goto normal_char;
1321           return dfa->lex.lasttok = PLUS;
1322 
1323         case '{':
1324           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1325             goto normal_char;
1326           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1327             goto normal_char;
1328           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1329               && dfa->lex.laststart)
1330             goto normal_char;
1331 
1332           /* Cases:
1333              {M} - exact count
1334              {M,} - minimum count, maximum is infinity
1335              {,N} - 0 through N
1336              {,} - 0 to infinity (same as '*')
1337              {M,N} - M through N */
1338           {
1339             char const *p = dfa->lex.ptr;
1340             char const *lim = p + dfa->lex.left;
1341             dfa->lex.minrep = dfa->lex.maxrep = -1;
1342             for (; p != lim && isasciidigit (*p); p++)
1343               dfa->lex.minrep = (dfa->lex.minrep < 0
1344                                  ? *p - '0'
1345                                  : MIN (RE_DUP_MAX + 1,
1346                                         dfa->lex.minrep * 10 + *p - '0'));
1347             if (p != lim)
1348               {
1349                 if (*p != ',')
1350                   dfa->lex.maxrep = dfa->lex.minrep;
1351                 else
1352                   {
1353                     if (dfa->lex.minrep < 0)
1354                       dfa->lex.minrep = 0;
1355                     while (++p != lim && isasciidigit (*p))
1356                       dfa->lex.maxrep
1357                         = (dfa->lex.maxrep < 0
1358                            ? *p - '0'
1359                            : MIN (RE_DUP_MAX + 1,
1360                                   dfa->lex.maxrep * 10 + *p - '0'));
1361                   }
1362               }
1363             if (! ((! backslash || (p != lim && *p++ == '\\'))
1364                    && p != lim && *p++ == '}'
1365                    && 0 <= dfa->lex.minrep
1366                    && (dfa->lex.maxrep < 0
1367                        || dfa->lex.minrep <= dfa->lex.maxrep)))
1368               {
1369                 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1370                   goto normal_char;
1371                 dfaerror (_("invalid content of \\{\\}"));
1372               }
1373             if (RE_DUP_MAX < dfa->lex.maxrep)
1374               dfaerror (_("regular expression too big"));
1375             dfa->lex.ptr = p;
1376             dfa->lex.left = lim - p;
1377           }
1378           dfa->lex.laststart = false;
1379           return dfa->lex.lasttok = REPMN;
1380 
1381         case '|':
1382           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1383             goto normal_char;
1384           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1385             goto normal_char;
1386           dfa->lex.laststart = true;
1387           return dfa->lex.lasttok = OR;
1388 
1389         case '\n':
1390           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1391               || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1392             goto normal_char;
1393           dfa->lex.laststart = true;
1394           return dfa->lex.lasttok = OR;
1395 
1396         case '(':
1397           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1398             goto normal_char;
1399           dfa->lex.parens++;
1400           dfa->lex.laststart = true;
1401           return dfa->lex.lasttok = LPAREN;
1402 
1403         case ')':
1404           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1405             goto normal_char;
1406           if (dfa->lex.parens == 0
1407               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1408             goto normal_char;
1409           dfa->lex.parens--;
1410           dfa->lex.laststart = false;
1411           return dfa->lex.lasttok = RPAREN;
1412 
1413         case '.':
1414           if (backslash)
1415             goto normal_char;
1416           if (dfa->canychar < 0)
1417             {
1418               charclass ccl;
1419               fillset (&ccl);
1420               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1421                 clrbit ('\n', &ccl);
1422               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1423                 clrbit ('\0', &ccl);
1424               if (dfa->localeinfo.multibyte)
1425                 for (int c2 = 0; c2 < NOTCHAR; c2++)
1426                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1427                     clrbit (c2, &ccl);
1428               dfa->canychar = charclass_index (dfa, &ccl);
1429             }
1430           dfa->lex.laststart = false;
1431           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1432                                      ? ANYCHAR
1433                                      : CSET + dfa->canychar);
1434 
1435         case 's':
1436         case 'S':
1437           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1438             goto normal_char;
1439           if (!dfa->localeinfo.multibyte)
1440             {
1441               charclass ccl;
1442               zeroset (&ccl);
1443               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1444                 if (isspace (c2))
1445                   setbit (c2, &ccl);
1446               if (c == 'S')
1447                 notset (&ccl);
1448               dfa->lex.laststart = false;
1449               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1450             }
1451 
1452           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1453              add_utf8_anychar, makes sense.  */
1454 
1455           /* \s and \S are documented to be equivalent to [[:space:]] and
1456              [^[:space:]] respectively, so tell the lexer to process those
1457              strings, each minus its "already processed" '['.  */
1458           {
1459             struct lexptr ls;
1460             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1461             dfa->lex.lasttok = parse_bracket_exp (dfa);
1462             pop_lex_state (dfa, &ls);
1463           }
1464 
1465           dfa->lex.laststart = false;
1466           return dfa->lex.lasttok;
1467 
1468         case 'w':
1469         case 'W':
1470           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1471             goto normal_char;
1472 
1473           if (!dfa->localeinfo.multibyte)
1474             {
1475               charclass ccl;
1476               zeroset (&ccl);
1477               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1478                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1479                   setbit (c2, &ccl);
1480               if (c == 'W')
1481                 notset (&ccl);
1482               dfa->lex.laststart = false;
1483               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1484             }
1485 
1486           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1487              add_utf8_anychar, makes sense.  */
1488 
1489           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1490              [^_[:alnum:]] respectively, so tell the lexer to process those
1491              strings, each minus its "already processed" '['.  */
1492           {
1493             struct lexptr ls;
1494             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1495             dfa->lex.lasttok = parse_bracket_exp (dfa);
1496             pop_lex_state (dfa, &ls);
1497           }
1498 
1499           dfa->lex.laststart = false;
1500           return dfa->lex.lasttok;
1501 
1502         case '[':
1503           if (backslash)
1504             goto normal_char;
1505           dfa->lex.laststart = false;
1506           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1507 
1508         default:
1509         normal_char:
1510           dfa->lex.laststart = false;
1511           /* For multibyte character sets, folding is done in atom.  Always
1512              return WCHAR.  */
1513           if (dfa->localeinfo.multibyte)
1514             return dfa->lex.lasttok = WCHAR;
1515 
1516           if (dfa->syntax.case_fold && isalpha (c))
1517             {
1518               charclass ccl;
1519               zeroset (&ccl);
1520               setbit_case_fold_c (c, &ccl);
1521               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1522             }
1523 
1524           return dfa->lex.lasttok = c;
1525         }
1526     }
1527 
1528   /* The above loop should consume at most a backslash
1529      and some other character.  */
1530   abort ();
1531   return END;                   /* keeps pedantic compilers happy.  */
1532 }
1533 
1534 static void
1535 addtok_mb (struct dfa *dfa, token t, char mbprop)
     /* [previous][next][first][last][top][bottom][index][help] */
1536 {
1537   if (dfa->talloc == dfa->tindex)
1538     {
1539       dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1540                              sizeof *dfa->tokens);
1541       if (dfa->localeinfo.multibyte)
1542         dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1543                                              sizeof *dfa->multibyte_prop);
1544     }
1545   if (dfa->localeinfo.multibyte)
1546     dfa->multibyte_prop[dfa->tindex] = mbprop;
1547   dfa->tokens[dfa->tindex++] = t;
1548 
1549   switch (t)
1550     {
1551     case QMARK:
1552     case STAR:
1553     case PLUS:
1554       break;
1555 
1556     case CAT:
1557     case OR:
1558       dfa->parse.depth--;
1559       break;
1560 
1561     case EMPTY:
1562       dfa->epsilon = true;
1563       goto increment_depth;
1564 
1565     case BACKREF:
1566       dfa->fast = false;
1567       goto increment_nleaves;
1568 
1569     case BEGLINE:
1570     case ENDLINE:
1571     case BEGWORD:
1572     case ENDWORD:
1573     case LIMWORD:
1574     case NOTLIMWORD:
1575       dfa->epsilon = true;
1576       FALLTHROUGH;
1577     default:
1578     increment_nleaves:
1579       dfa->nleaves++;
1580     increment_depth:
1581       dfa->parse.depth++;
1582       if (dfa->depth < dfa->parse.depth)
1583         dfa->depth = dfa->parse.depth;
1584       break;
1585     }
1586 }
1587 
1588 static void addtok_wc (struct dfa *dfa, wint_t wc);
1589 
1590 /* Add the given token to the parse tree, maintaining the depth count and
1591    updating the maximum depth if necessary.  */
1592 static void
1593 addtok (struct dfa *dfa, token t)
     /* [previous][next][first][last][top][bottom][index][help] */
1594 {
1595   if (dfa->localeinfo.multibyte && t == MBCSET)
1596     {
1597       bool need_or = false;
1598 
1599       /* Extract wide characters into alternations for better performance.
1600          This does not require UTF-8.  */
1601       for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1602         {
1603           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1604           if (need_or)
1605             addtok (dfa, OR);
1606           need_or = true;
1607         }
1608       dfa->lex.brack.nchars = 0;
1609 
1610       /* Wide characters have been handled above, so it is possible
1611          that the set is empty now.  Do nothing in that case.  */
1612       if (dfa->lex.brack.cset != -1)
1613         {
1614           addtok (dfa, CSET + dfa->lex.brack.cset);
1615           if (need_or)
1616             addtok (dfa, OR);
1617         }
1618     }
1619   else
1620     {
1621       addtok_mb (dfa, t, 3);
1622     }
1623 }
1624 
1625 /* We treat a multibyte character as a single atom, so that DFA
1626    can treat a multibyte character as a single expression.
1627 
1628    e.g., we construct the following tree from "<mb1><mb2>".
1629    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1630    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1631 static void
1632 addtok_wc (struct dfa *dfa, wint_t wc)
     /* [previous][next][first][last][top][bottom][index][help] */
1633 {
1634   unsigned char buf[MB_LEN_MAX];
1635   mbstate_t s = { 0 };
1636   size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1637   int buflen;
1638 
1639   if (stored_bytes != (size_t) -1)
1640     buflen = stored_bytes;
1641   else
1642     {
1643       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1644          the addtok_mb call altogether can corrupt the heap.  */
1645       buflen = 1;
1646       buf[0] = 0;
1647     }
1648 
1649   addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1650   for (int i = 1; i < buflen; i++)
1651     {
1652       addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1653       addtok (dfa, CAT);
1654     }
1655 }
1656 
1657 static void
1658 add_utf8_anychar (struct dfa *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
1659 {
1660   /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1661      UTF-8 byte sequence has been defined as follows:
1662 
1663      ([\x00-\x7f]
1664      |[\xc2-\xdf][\x80-\xbf]
1665      |[\xe0][\xa0-\xbf][\x80-\xbf]
1666      |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1667      |[\xed][\x80-\x9f][\x80-\xbf]
1668      |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1669      |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1670      |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1671 
1672      which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1673      where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1674      D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1675      H = [\x80-\x9f], I = [\xf0],
1676      J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1677 
1678      This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
1679 
1680   /* Mnemonics for classes containing two or more bytes.  */
1681   enum { A, B, C, E, F, H, J, K, M };
1682 
1683   /* Mnemonics for single-byte tokens.  */
1684   enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1685 
1686   static charclass const utf8_classes[] = {
1687     /* A. 00-7f: 1-byte sequence.  */
1688     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1689 
1690     /* B. c2-df: 1st byte of a 2-byte sequence.  */
1691     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1692 
1693     /* C. 80-bf: non-leading bytes.  */
1694     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1695 
1696     /* D. e0 (just a token).  */
1697 
1698     /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
1699     CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1700 
1701     /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
1702     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1703 
1704     /* G. ed (just a token).  */
1705 
1706     /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
1707     CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1708 
1709     /* I. f0 (just a token).  */
1710 
1711     /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
1712     CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1713 
1714     /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
1715     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1716 
1717     /* L. f4 (just a token).  */
1718 
1719     /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
1720     CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1721   };
1722 
1723   /* Define the character classes that are needed below.  */
1724   if (dfa->utf8_anychar_classes[0] == 0)
1725     {
1726       charclass c = utf8_classes[0];
1727       if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1728         clrbit ('\n', &c);
1729       if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1730         clrbit ('\0', &c);
1731       dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1732 
1733       for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1734         dfa->utf8_anychar_classes[i]
1735           = CSET + charclass_index (dfa, &utf8_classes[i]);
1736     }
1737 
1738   /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1739      The token buffer is in reverse Polish order, so we get
1740      "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1741       C CAT OR C CAT OR C CAT OR".  */
1742   addtok (dfa, dfa->utf8_anychar_classes[A]);
1743   addtok (dfa, dfa->utf8_anychar_classes[B]);
1744   addtok (dfa, D_token);
1745   addtok (dfa, dfa->utf8_anychar_classes[E]);
1746   addtok (dfa, CAT);
1747   addtok (dfa, OR);
1748   addtok (dfa, G_token);
1749   addtok (dfa, dfa->utf8_anychar_classes[H]);
1750   addtok (dfa, CAT);
1751   addtok (dfa, OR);
1752   addtok (dfa, dfa->utf8_anychar_classes[F]);
1753   addtok (dfa, I_token);
1754   addtok (dfa, dfa->utf8_anychar_classes[J]);
1755   addtok (dfa, CAT);
1756   addtok (dfa, OR);
1757   addtok (dfa, L_token);
1758   addtok (dfa, dfa->utf8_anychar_classes[M]);
1759   addtok (dfa, CAT);
1760   addtok (dfa, OR);
1761   addtok (dfa, dfa->utf8_anychar_classes[K]);
1762   for (int i = 0; i < 3; i++)
1763     {
1764       addtok (dfa, dfa->utf8_anychar_classes[C]);
1765       addtok (dfa, CAT);
1766       addtok (dfa, OR);
1767     }
1768 }
1769 
1770 /* The grammar understood by the parser is as follows.
1771 
1772    regexp:
1773      regexp OR branch
1774      branch
1775 
1776    branch:
1777      branch closure
1778      closure
1779 
1780    closure:
1781      closure QMARK
1782      closure STAR
1783      closure PLUS
1784      closure REPMN
1785      atom
1786 
1787    atom:
1788      <normal character>
1789      <multibyte character>
1790      ANYCHAR
1791      MBCSET
1792      CSET
1793      BACKREF
1794      BEGLINE
1795      ENDLINE
1796      BEGWORD
1797      ENDWORD
1798      LIMWORD
1799      NOTLIMWORD
1800      LPAREN regexp RPAREN
1801      <empty>
1802 
1803    The parser builds a parse tree in postfix form in an array of tokens.  */
1804 
1805 static void
1806 atom (struct dfa *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
1807 {
1808   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1809       || dfa->parse.tok >= CSET
1810       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1811       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1812       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1813       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1814       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1815     {
1816       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1817         {
1818           /* For UTF-8 expand the period to a series of CSETs that define a
1819              valid UTF-8 character.  This avoids using the slow multibyte
1820              path.  I'm pretty sure it would be both profitable and correct to
1821              do it for any encoding; however, the optimization must be done
1822              manually as it is done above in add_utf8_anychar.  So, let's
1823              start with UTF-8: it is the most used, and the structure of the
1824              encoding makes the correctness more obvious.  */
1825           add_utf8_anychar (dfa);
1826         }
1827       else
1828         addtok (dfa, dfa->parse.tok);
1829       dfa->parse.tok = lex (dfa);
1830     }
1831   else if (dfa->parse.tok == WCHAR)
1832     {
1833       if (dfa->lex.wctok == WEOF)
1834         addtok (dfa, BACKREF);
1835       else
1836         {
1837           addtok_wc (dfa, dfa->lex.wctok);
1838 
1839           if (dfa->syntax.case_fold)
1840             {
1841               wchar_t folded[CASE_FOLDED_BUFSIZE];
1842               int n = case_folded_counterparts (dfa->lex.wctok, folded);
1843               for (int i = 0; i < n; i++)
1844                 {
1845                   addtok_wc (dfa, folded[i]);
1846                   addtok (dfa, OR);
1847                 }
1848             }
1849         }
1850 
1851       dfa->parse.tok = lex (dfa);
1852     }
1853   else if (dfa->parse.tok == LPAREN)
1854     {
1855       dfa->parse.tok = lex (dfa);
1856       regexp (dfa);
1857       if (dfa->parse.tok != RPAREN)
1858         dfaerror (_("unbalanced ("));
1859       dfa->parse.tok = lex (dfa);
1860     }
1861   else
1862     addtok (dfa, EMPTY);
1863 }
1864 
1865 /* Return the number of tokens in the given subexpression.  */
1866 static idx_t _GL_ATTRIBUTE_PURE
1867 nsubtoks (struct dfa const *dfa, idx_t tindex)
     /* [previous][next][first][last][top][bottom][index][help] */
1868 {
1869   switch (dfa->tokens[tindex - 1])
1870     {
1871     default:
1872       return 1;
1873     case QMARK:
1874     case STAR:
1875     case PLUS:
1876       return 1 + nsubtoks (dfa, tindex - 1);
1877     case CAT:
1878     case OR:
1879       {
1880         idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1881         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1882       }
1883     }
1884 }
1885 
1886 /* Copy the given subexpression to the top of the tree.  */
1887 static void
1888 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
     /* [previous][next][first][last][top][bottom][index][help] */
1889 {
1890   if (dfa->localeinfo.multibyte)
1891     for (idx_t i = 0; i < ntokens; i++)
1892       addtok_mb (dfa, dfa->tokens[tindex + i],
1893                  dfa->multibyte_prop[tindex + i]);
1894   else
1895     for (idx_t i = 0; i < ntokens; i++)
1896       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1897 }
1898 
1899 static void
1900 closure (struct dfa *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
1901 {
1902   atom (dfa);
1903   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1904          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1905     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1906       {
1907         idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1908         idx_t tindex = dfa->tindex - ntokens;
1909         if (dfa->lex.maxrep < 0)
1910           addtok (dfa, PLUS);
1911         if (dfa->lex.minrep == 0)
1912           addtok (dfa, QMARK);
1913         int i;
1914         for (i = 1; i < dfa->lex.minrep; i++)
1915           {
1916             copytoks (dfa, tindex, ntokens);
1917             addtok (dfa, CAT);
1918           }
1919         for (; i < dfa->lex.maxrep; i++)
1920           {
1921             copytoks (dfa, tindex, ntokens);
1922             addtok (dfa, QMARK);
1923             addtok (dfa, CAT);
1924           }
1925         dfa->parse.tok = lex (dfa);
1926       }
1927     else if (dfa->parse.tok == REPMN)
1928       {
1929         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1930         dfa->parse.tok = lex (dfa);
1931         closure (dfa);
1932       }
1933     else
1934       {
1935         addtok (dfa, dfa->parse.tok);
1936         dfa->parse.tok = lex (dfa);
1937       }
1938 }
1939 
1940 static void
1941 branch (struct dfa* dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
1942 {
1943   closure (dfa);
1944   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1945          && dfa->parse.tok >= 0)
1946     {
1947       closure (dfa);
1948       addtok (dfa, CAT);
1949     }
1950 }
1951 
1952 static void
1953 regexp (struct dfa *dfa)
     /* [previous][next][first][last][top][bottom][index][help] */
1954 {
1955   branch (dfa);
1956   while (dfa->parse.tok == OR)
1957     {
1958       dfa->parse.tok = lex (dfa);
1959       branch (dfa);
1960       addtok (dfa, OR);
1961     }
1962 }
1963 
1964 /* Parse a string S of length LEN into D.  S can include NUL characters.
1965    This is the main entry point for the parser.  */
1966 void
1967 dfaparse (char const *s, idx_t len, struct dfa *d)
     /* [previous][next][first][last][top][bottom][index][help] */
1968 {
1969   d->lex.ptr = s;
1970   d->lex.left = len;
1971   d->lex.lasttok = END;
1972   d->lex.laststart = true;
1973 
1974   if (!d->syntax.syntax_bits_set)
1975     dfaerror (_("no syntax specified"));
1976 
1977   if (!d->nregexps)
1978     addtok (d, BEG);
1979 
1980   d->parse.tok = lex (d);
1981   d->parse.depth = d->depth;
1982 
1983   regexp (d);
1984 
1985   if (d->parse.tok != END)
1986     dfaerror (_("unbalanced )"));
1987 
1988   addtok (d, END - d->nregexps);
1989   addtok (d, CAT);
1990 
1991   if (d->nregexps)
1992     addtok (d, OR);
1993 
1994   ++d->nregexps;
1995 }
1996 
1997 /* Some primitives for operating on sets of positions.  */
1998 
1999 /* Copy one set to another.  */
2000 static void
2001 copy (position_set const *src, position_set *dst)
     /* [previous][next][first][last][top][bottom][index][help] */
2002 {
2003   if (dst->alloc < src->nelem)
2004     {
2005       free (dst->elems);
2006       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2007                             sizeof *dst->elems);
2008     }
2009   dst->nelem = src->nelem;
2010   if (src->nelem != 0)
2011     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2012 }
2013 
2014 static void
2015 alloc_position_set (position_set *s, idx_t size)
     /* [previous][next][first][last][top][bottom][index][help] */
2016 {
2017   s->elems = xnmalloc (size, sizeof *s->elems);
2018   s->alloc = size;
2019   s->nelem = 0;
2020 }
2021 
2022 /* Insert position P in set S.  S is maintained in sorted order on
2023    decreasing index.  If there is already an entry in S with P.index
2024    then merge (logically-OR) P's constraints into the one in S.
2025    S->elems must point to an array large enough to hold the resulting set.  */
2026 static void
2027 insert (position p, position_set *s)
     /* [previous][next][first][last][top][bottom][index][help] */
2028 {
2029   idx_t count = s->nelem;
2030   idx_t lo = 0, hi = count;
2031   while (lo < hi)
2032     {
2033       idx_t mid = (lo + hi) >> 1;
2034       if (s->elems[mid].index < p.index)
2035         lo = mid + 1;
2036       else if (s->elems[mid].index == p.index)
2037         {
2038           s->elems[mid].constraint |= p.constraint;
2039           return;
2040         }
2041       else
2042         hi = mid;
2043     }
2044 
2045   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2046   for (idx_t i = count; i > lo; i--)
2047     s->elems[i] = s->elems[i - 1];
2048   s->elems[lo] = p;
2049   ++s->nelem;
2050 }
2051 
2052 static void
2053 append (position p, position_set *s)
     /* [previous][next][first][last][top][bottom][index][help] */
2054 {
2055   idx_t count = s->nelem;
2056   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2057   s->elems[s->nelem++] = p;
2058 }
2059 
2060 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2061    result is as if the positions of S1, and of S2 with the additional
2062    constraint C2, were inserted into an initially empty set.  */
2063 static void
2064 merge_constrained (position_set const *s1, position_set const *s2,
     /* [previous][next][first][last][top][bottom][index][help] */
2065                    unsigned int c2, position_set *m)
2066 {
2067   idx_t i = 0, j = 0;
2068 
2069   if (m->alloc - s1->nelem < s2->nelem)
2070     {
2071       free (m->elems);
2072       m->alloc = s1->nelem;
2073       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2074     }
2075   m->nelem = 0;
2076   while (i < s1->nelem || j < s2->nelem)
2077     if (! (j < s2->nelem)
2078         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2079       {
2080         unsigned int c = ((i < s1->nelem && j < s2->nelem
2081                            && s1->elems[i].index == s2->elems[j].index)
2082                           ? s2->elems[j++].constraint & c2
2083                           : 0);
2084         m->elems[m->nelem].index = s1->elems[i].index;
2085         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2086       }
2087     else
2088       {
2089         if (s2->elems[j].constraint & c2)
2090           {
2091             m->elems[m->nelem].index = s2->elems[j].index;
2092             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2093           }
2094         j++;
2095       }
2096 }
2097 
2098 /* Merge two sets of positions into a third.  The result is exactly as if
2099    the positions of both sets were inserted into an initially empty set.  */
2100 static void
2101 merge (position_set const *s1, position_set const *s2, position_set *m)
     /* [previous][next][first][last][top][bottom][index][help] */
2102 {
2103   merge_constrained (s1, s2, -1, m);
2104 }
2105 
2106 /* Merge into DST all the elements of SRC, possibly destroying
2107    the contents of the temporary M.  */
2108 static void
2109 merge2 (position_set *dst, position_set const *src, position_set *m)
     /* [previous][next][first][last][top][bottom][index][help] */
2110 {
2111   if (src->nelem < 4)
2112     {
2113       for (idx_t i = 0; i < src->nelem; i++)
2114         insert (src->elems[i], dst);
2115     }
2116    else
2117     {
2118       merge (src, dst, m);
2119       copy (m, dst);
2120     }
2121 }
2122 
2123 /* Delete a position from a set.  Return the nonzero constraint of the
2124    deleted position, or zero if there was no such position.  */
2125 static unsigned int
2126 delete (idx_t del, position_set *s)
     /* [previous][next][first][last][top][bottom][index][help] */
2127 {
2128   idx_t count = s->nelem;
2129   idx_t lo = 0, hi = count;
2130   while (lo < hi)
2131     {
2132       idx_t mid = (lo + hi) >> 1;
2133       if (s->elems[mid].index < del)
2134         lo = mid + 1;
2135       else if (s->elems[mid].index == del)
2136         {
2137           unsigned int c = s->elems[mid].constraint;
2138           idx_t i;
2139           for (i = mid; i + 1 < count; i++)
2140             s->elems[i] = s->elems[i + 1];
2141           s->nelem = i;
2142           return c;
2143         }
2144       else
2145         hi = mid;
2146     }
2147   return 0;
2148 }
2149 
2150 /* Replace a position with the followed set.  */
2151 static void
2152 replace (position_set *dst, idx_t del, position_set *add,
     /* [previous][next][first][last][top][bottom][index][help] */
2153          unsigned int constraint, position_set *tmp)
2154 {
2155   unsigned int c = delete (del, dst) & constraint;
2156 
2157   if (c)
2158     {
2159       copy (dst, tmp);
2160       merge_constrained (tmp, add, c, dst);
2161     }
2162 }
2163 
2164 /* Find the index of the state corresponding to the given position set with
2165    the given preceding context, or create a new state if there is no such
2166    state.  Context tells whether we got here on a newline or letter.  */
2167 static state_num
2168 state_index (struct dfa *d, position_set const *s, int context)
     /* [previous][next][first][last][top][bottom][index][help] */
2169 {
2170   size_t hash = 0;
2171   int constraint = 0;
2172   state_num i;
2173 
2174   for (i = 0; i < s->nelem; ++i)
2175     {
2176       idx_t ind = s->elems[i].index;
2177       hash ^= ind + s->elems[i].constraint;
2178     }
2179 
2180   /* Try to find a state that exactly matches the proposed one.  */
2181   for (i = 0; i < d->sindex; ++i)
2182     {
2183       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2184           || context != d->states[i].context)
2185         continue;
2186       state_num j;
2187       for (j = 0; j < s->nelem; ++j)
2188         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2189             || s->elems[j].index != d->states[i].elems.elems[j].index)
2190           break;
2191       if (j == s->nelem)
2192         return i;
2193     }
2194 
2195 #ifdef DEBUG
2196   fprintf (stderr, "new state %td\n nextpos:", i);
2197   for (state_num j = 0; j < s->nelem; j++)
2198     {
2199       fprintf (stderr, " %td:", s->elems[j].index);
2200       prtok (d->tokens[s->elems[j].index]);
2201     }
2202   fprintf (stderr, "\n context:");
2203   if (context ^ CTX_ANY)
2204     {
2205       if (context & CTX_NONE)
2206         fprintf (stderr, " CTX_NONE");
2207       if (context & CTX_LETTER)
2208         fprintf (stderr, " CTX_LETTER");
2209       if (context & CTX_NEWLINE)
2210         fprintf (stderr, " CTX_NEWLINE");
2211     }
2212   else
2213     fprintf (stderr, " CTX_ANY");
2214   fprintf (stderr, "\n");
2215 #endif
2216 
2217   for (state_num j = 0; j < s->nelem; j++)
2218     {
2219       int c = d->constraints[s->elems[j].index];
2220 
2221       if (c != 0)
2222         {
2223           if (succeeds_in_context (c, context, CTX_ANY))
2224             constraint |= c;
2225         }
2226       else if (d->tokens[s->elems[j].index] == BACKREF)
2227         constraint = NO_CONSTRAINT;
2228     }
2229 
2230 
2231   /* Create a new state.  */
2232   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2233                              sizeof *d->states);
2234   d->states[i].hash = hash;
2235   alloc_position_set (&d->states[i].elems, s->nelem);
2236   copy (s, &d->states[i].elems);
2237   d->states[i].context = context;
2238   d->states[i].constraint = constraint;
2239   d->states[i].mbps.nelem = 0;
2240   d->states[i].mbps.elems = NULL;
2241   d->states[i].mb_trindex = -1;
2242 
2243   ++d->sindex;
2244 
2245   return i;
2246 }
2247 
2248 /* Find the epsilon closure of D's set of positions.  If any position of the set
2249    contains a symbol that matches the empty string in some context, replace
2250    that position with the elements of its follow labeled with an appropriate
2251    constraint.  Repeat exhaustively until no funny positions are left.
2252    S->elems must be large enough to hold the result.  BACKWARD is D's
2253    backward set; use and update it too.  */
2254 static void
2255 epsclosure (struct dfa const *d, position_set *backward)
     /* [previous][next][first][last][top][bottom][index][help] */
2256 {
2257   position_set tmp;
2258   alloc_position_set (&tmp, d->nleaves);
2259   for (idx_t i = 0; i < d->tindex; i++)
2260     if (0 < d->follows[i].nelem)
2261       {
2262         unsigned int constraint;
2263         switch (d->tokens[i])
2264           {
2265           default:
2266             continue;
2267 
2268           case BEGLINE:
2269             constraint = BEGLINE_CONSTRAINT;
2270             break;
2271           case ENDLINE:
2272             constraint = ENDLINE_CONSTRAINT;
2273             break;
2274           case BEGWORD:
2275             constraint = BEGWORD_CONSTRAINT;
2276             break;
2277           case ENDWORD:
2278             constraint = ENDWORD_CONSTRAINT;
2279             break;
2280           case LIMWORD:
2281             constraint = LIMWORD_CONSTRAINT;
2282             break;
2283           case NOTLIMWORD:
2284             constraint = NOTLIMWORD_CONSTRAINT;
2285             break;
2286           case EMPTY:
2287             constraint = NO_CONSTRAINT;
2288             break;
2289           }
2290 
2291         delete (i, &d->follows[i]);
2292 
2293         for (idx_t j = 0; j < backward[i].nelem; j++)
2294           replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2295                    constraint, &tmp);
2296         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2297           replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2298                    NO_CONSTRAINT, &tmp);
2299       }
2300   free (tmp.elems);
2301 }
2302 
2303 /* Returns the set of contexts for which there is at least one
2304    character included in C.  */
2305 
2306 static int
2307 charclass_context (struct dfa const *dfa, charclass const *c)
     /* [previous][next][first][last][top][bottom][index][help] */
2308 {
2309   int context = 0;
2310 
2311   for (int j = 0; j < CHARCLASS_WORDS; j++)
2312     {
2313       if (c->w[j] & dfa->syntax.newline.w[j])
2314         context |= CTX_NEWLINE;
2315       if (c->w[j] & dfa->syntax.letters.w[j])
2316         context |= CTX_LETTER;
2317       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2318         context |= CTX_NONE;
2319     }
2320 
2321   return context;
2322 }
2323 
2324 /* Returns the contexts on which the position set S depends.  Each context
2325    in the set of returned contexts (let's call it SC) may have a different
2326    follow set than other contexts in SC, and also different from the
2327    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2328    in the complement set will have the same follow set.  */
2329 
2330 static int _GL_ATTRIBUTE_PURE
2331 state_separate_contexts (struct dfa *d, position_set const *s)
     /* [previous][next][first][last][top][bottom][index][help] */
2332 {
2333   int separate_contexts = 0;
2334 
2335   for (idx_t j = 0; j < s->nelem; j++)
2336     separate_contexts |= d->separates[s->elems[j].index];
2337 
2338   return separate_contexts;
2339 }
2340 
2341 enum
2342 {
2343   /* Single token is repeated.  It is distinguished from non-repeated.  */
2344   OPT_REPEAT = (1 << 0),
2345 
2346   /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
2347      node is not merged.  */
2348   OPT_LPAREN = (1 << 1),
2349 
2350   /* Multiple branches are joined.  The node is not merged.  */
2351   OPT_RPAREN = (1 << 2),
2352 
2353   /* The node is walked.  If the node is found in walking again, OPT_RPAREN
2354      flag is turned on. */
2355   OPT_WALKED = (1 << 3),
2356 
2357   /* The node is queued.  The node is not queued again.  */
2358   OPT_QUEUED = (1 << 4)
2359 };
2360 
2361 static void
2362 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
     /* [previous][next][first][last][top][bottom][index][help] */
2363                  position_set *merged)
2364 {
2365   position_set *follows = d->follows;
2366   idx_t nelem = 0;
2367 
2368   for (idx_t i = 0; i < follows[tindex].nelem; i++)
2369     {
2370       idx_t sindex = follows[tindex].elems[i].index;
2371 
2372       /* Skip the node as pruned in future.  */
2373       unsigned int iconstraint = follows[tindex].elems[i].constraint;
2374       if (iconstraint == 0)
2375         continue;
2376 
2377       if (d->tokens[follows[tindex].elems[i].index] <= END)
2378         {
2379           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2380           continue;
2381         }
2382 
2383       if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2384         {
2385           idx_t j;
2386 
2387           for (j = 0; j < nelem; j++)
2388             {
2389               idx_t dindex = follows[tindex].elems[j].index;
2390 
2391               if (dindex == tindex)
2392                 continue;
2393 
2394               if (follows[tindex].elems[j].constraint != iconstraint)
2395                 continue;
2396 
2397               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2398                 continue;
2399 
2400               if (d->tokens[sindex] != d->tokens[dindex])
2401                 continue;
2402 
2403               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2404                 continue;
2405 
2406               if (flags[sindex] & OPT_REPEAT)
2407                 delete (sindex, &follows[sindex]);
2408 
2409               merge2 (&follows[dindex], &follows[sindex], merged);
2410 
2411               break;
2412             }
2413 
2414           if (j < nelem)
2415             continue;
2416         }
2417 
2418       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2419       flags[sindex] |= OPT_QUEUED;
2420     }
2421 
2422   follows[tindex].nelem = nelem;
2423 }
2424 
2425 static int
2426 compare (const void *a, const void *b)
     /* [previous][next][first][last][top][bottom][index][help] */
2427 {
2428   position const *p = a, *q = b;
2429   return (p->index > q->index) - (p->index < q->index);
2430 }
2431 
2432 static void
2433 reorder_tokens (struct dfa *d)
     /* [previous][next][first][last][top][bottom][index][help] */
2434 {
2435   idx_t nleaves = 0;
2436   ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2437   map[0] = nleaves++;
2438   for (idx_t i = 1; i < d->tindex; i++)
2439     map[i] = -1;
2440 
2441   token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2442   position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2443   int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2444   char *multibyte_prop = (d->localeinfo.multibyte
2445                           ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2446                           : NULL);
2447 
2448   for (idx_t i = 0; i < d->tindex; i++)
2449     {
2450       if (map[i] < 0)
2451         {
2452           free (d->follows[i].elems);
2453           d->follows[i].elems = NULL;
2454           d->follows[i].nelem = 0;
2455           continue;
2456         }
2457 
2458       tokens[map[i]] = d->tokens[i];
2459       follows[map[i]] = d->follows[i];
2460       constraints[map[i]] = d->constraints[i];
2461 
2462       if (multibyte_prop != NULL)
2463         multibyte_prop[map[i]] = d->multibyte_prop[i];
2464 
2465       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2466         {
2467           if (map[d->follows[i].elems[j].index] == -1)
2468             map[d->follows[i].elems[j].index] = nleaves++;
2469 
2470           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2471         }
2472 
2473       qsort (d->follows[i].elems, d->follows[i].nelem,
2474              sizeof *d->follows[i].elems, compare);
2475     }
2476 
2477   for (idx_t i = 0; i < nleaves; i++)
2478     {
2479       d->tokens[i] = tokens[i];
2480       d->follows[i] = follows[i];
2481       d->constraints[i] = constraints[i];
2482 
2483       if (multibyte_prop != NULL)
2484         d->multibyte_prop[i] = multibyte_prop[i];
2485     }
2486 
2487   d->tindex = d->nleaves = nleaves;
2488 
2489   free (tokens);
2490   free (follows);
2491   free (constraints);
2492   free (multibyte_prop);
2493   free (map);
2494 }
2495 
2496 static void
2497 dfaoptimize (struct dfa *d)
     /* [previous][next][first][last][top][bottom][index][help] */
2498 {
2499   char *flags = xizalloc (d->tindex);
2500 
2501   for (idx_t i = 0; i < d->tindex; i++)
2502     {
2503       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2504         {
2505           if (d->follows[i].elems[j].index == i)
2506             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2507           else if (d->follows[i].elems[j].index < i)
2508             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2509           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2510             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2511           else
2512             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2513         }
2514     }
2515 
2516   flags[0] |= OPT_QUEUED;
2517 
2518   position_set merged0;
2519   position_set *merged = &merged0;
2520   alloc_position_set (merged, d->nleaves);
2521 
2522   d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2523 
2524   for (idx_t i = 0; i < d->tindex; i++)
2525     if (flags[i] & OPT_QUEUED)
2526       merge_nfa_state (d, i, flags, merged);
2527 
2528   reorder_tokens (d);
2529 
2530   free (merged->elems);
2531   free (flags);
2532 }
2533 
2534 /* Perform bottom-up analysis on the parse tree, computing various functions.
2535    Note that at this point, we're pretending constructs like \< are real
2536    characters rather than constraints on what can follow them.
2537 
2538    Nullable:  A node is nullable if it is at the root of a regexp that can
2539    match the empty string.
2540    *  EMPTY leaves are nullable.
2541    * No other leaf is nullable.
2542    * A QMARK or STAR node is nullable.
2543    * A PLUS node is nullable if its argument is nullable.
2544    * A CAT node is nullable if both its arguments are nullable.
2545    * An OR node is nullable if either argument is nullable.
2546 
2547    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2548    that could correspond to the first character of a string matching the
2549    regexp rooted at the given node.
2550    * EMPTY leaves have empty firstpos.
2551    * The firstpos of a nonempty leaf is that leaf itself.
2552    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2553      argument.
2554    * The firstpos of a CAT node is the firstpos of the left argument, union
2555      the firstpos of the right if the left argument is nullable.
2556    * The firstpos of an OR node is the union of firstpos of each argument.
2557 
2558    Lastpos:  The lastpos of a node is the set of positions that could
2559    correspond to the last character of a string matching the regexp at
2560    the given node.
2561    * EMPTY leaves have empty lastpos.
2562    * The lastpos of a nonempty leaf is that leaf itself.
2563    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2564      argument.
2565    * The lastpos of a CAT node is the lastpos of its right argument, union
2566      the lastpos of the left if the right argument is nullable.
2567    * The lastpos of an OR node is the union of the lastpos of each argument.
2568 
2569    Follow:  The follow of a position is the set of positions that could
2570    correspond to the character following a character matching the node in
2571    a string matching the regexp.  At this point we consider special symbols
2572    that match the empty string in some context to be just normal characters.
2573    Later, if we find that a special symbol is in a follow set, we will
2574    replace it with the elements of its follow, labeled with an appropriate
2575    constraint.
2576    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2577      the follow of every node in the lastpos.
2578    * Every node in the firstpos of the second argument of a CAT node is in
2579      the follow of every node in the lastpos of the first argument.
2580 
2581    Because of the postfix representation of the parse tree, the depth-first
2582    analysis is conveniently done by a linear scan with the aid of a stack.
2583    Sets are stored as arrays of the elements, obeying a stack-like allocation
2584    scheme; the number of elements in each set deeper in the stack can be
2585    used to determine the address of a particular set's array.  */
2586 static void
2587 dfaanalyze (struct dfa *d, bool searchflag)
     /* [previous][next][first][last][top][bottom][index][help] */
2588 {
2589   /* Array allocated to hold position sets.  */
2590   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2591   /* Firstpos and lastpos elements.  */
2592   position *firstpos = posalloc;
2593   position *lastpos = firstpos + d->nleaves;
2594   position pos;
2595   position_set tmp;
2596 
2597   /* Stack for element counts and nullable flags.  */
2598   struct
2599   {
2600     /* Whether the entry is nullable.  */
2601     bool nullable;
2602 
2603     /* Counts of firstpos and lastpos sets.  */
2604     idx_t nfirstpos;
2605     idx_t nlastpos;
2606   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2607 
2608   position_set merged;          /* Result of merging sets.  */
2609 
2610   addtok (d, CAT);
2611   idx_t tindex = d->tindex;
2612 
2613 #ifdef DEBUG
2614   fprintf (stderr, "dfaanalyze:\n");
2615   for (idx_t i = 0; i < tindex; i++)
2616     {
2617       fprintf (stderr, " %td:", i);
2618       prtok (d->tokens[i]);
2619     }
2620   putc ('\n', stderr);
2621 #endif
2622 
2623   d->searchflag = searchflag;
2624   alloc_position_set (&merged, d->nleaves);
2625   d->follows = xicalloc (tindex, sizeof *d->follows);
2626   position_set *backward
2627     = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2628 
2629   for (idx_t i = 0; i < tindex; i++)
2630     {
2631       switch (d->tokens[i])
2632         {
2633         case EMPTY:
2634           /* The empty set is nullable.  */
2635           stk->nullable = true;
2636 
2637           /* The firstpos and lastpos of the empty leaf are both empty.  */
2638           stk->nfirstpos = stk->nlastpos = 0;
2639           stk++;
2640           break;
2641 
2642         case STAR:
2643         case PLUS:
2644           /* Every element in the lastpos of the argument is in the backward
2645              set of every element in the firstpos.  */
2646           if (d->epsilon)
2647             {
2648               tmp.elems = lastpos - stk[-1].nlastpos;
2649               tmp.nelem = stk[-1].nlastpos;
2650               for (position *p = firstpos - stk[-1].nfirstpos;
2651                    p < firstpos; p++)
2652                 merge2 (&backward[p->index], &tmp, &merged);
2653             }
2654 
2655           /* Every element in the firstpos of the argument is in the follow
2656              of every element in the lastpos.  */
2657           {
2658             tmp.elems = firstpos - stk[-1].nfirstpos;
2659             tmp.nelem = stk[-1].nfirstpos;
2660             for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2661               merge2 (&d->follows[p->index], &tmp, &merged);
2662           }
2663           FALLTHROUGH;
2664         case QMARK:
2665           /* A QMARK or STAR node is automatically nullable.  */
2666           if (d->tokens[i] != PLUS)
2667             stk[-1].nullable = true;
2668           break;
2669 
2670         case CAT:
2671           /* Every element in the lastpos of the first argument is in
2672              the backward set of every element in the firstpos of the
2673              second argument.  */
2674           if (backward)
2675             {
2676               tmp.nelem = stk[-2].nlastpos;
2677               tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2678               for (position *p = firstpos - stk[-1].nfirstpos;
2679                    p < firstpos; p++)
2680                 merge2 (&backward[p->index], &tmp, &merged);
2681             }
2682 
2683           /* Every element in the firstpos of the second argument is in the
2684              follow of every element in the lastpos of the first argument.  */
2685           {
2686             tmp.nelem = stk[-1].nfirstpos;
2687             tmp.elems = firstpos - stk[-1].nfirstpos;
2688             for (position *plim = lastpos - stk[-1].nlastpos,
2689                    *p = plim - stk[-2].nlastpos;
2690                  p < plim; p++)
2691               merge2 (&d->follows[p->index], &tmp, &merged);
2692           }
2693 
2694           /* The firstpos of a CAT node is the firstpos of the first argument,
2695              union that of the second argument if the first is nullable.  */
2696           if (stk[-2].nullable)
2697             stk[-2].nfirstpos += stk[-1].nfirstpos;
2698           else
2699             firstpos -= stk[-1].nfirstpos;
2700 
2701           /* The lastpos of a CAT node is the lastpos of the second argument,
2702              union that of the first argument if the second is nullable.  */
2703           if (stk[-1].nullable)
2704             stk[-2].nlastpos += stk[-1].nlastpos;
2705           else
2706             {
2707               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2708               for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2709                 p[j] = p[j + stk[-2].nlastpos];
2710               lastpos -= stk[-2].nlastpos;
2711               stk[-2].nlastpos = stk[-1].nlastpos;
2712             }
2713 
2714           /* A CAT node is nullable if both arguments are nullable.  */
2715           stk[-2].nullable &= stk[-1].nullable;
2716           stk--;
2717           break;
2718 
2719         case OR:
2720           /* The firstpos is the union of the firstpos of each argument.  */
2721           stk[-2].nfirstpos += stk[-1].nfirstpos;
2722 
2723           /* The lastpos is the union of the lastpos of each argument.  */
2724           stk[-2].nlastpos += stk[-1].nlastpos;
2725 
2726           /* An OR node is nullable if either argument is nullable.  */
2727           stk[-2].nullable |= stk[-1].nullable;
2728           stk--;
2729           break;
2730 
2731         default:
2732           /* Anything else is a nonempty position.  (Note that special
2733              constructs like \< are treated as nonempty strings here;
2734              an "epsilon closure" effectively makes them nullable later.
2735              Backreferences have to get a real position so we can detect
2736              transitions on them later.  But they are nullable.  */
2737           stk->nullable = d->tokens[i] == BACKREF;
2738 
2739           /* This position is in its own firstpos and lastpos.  */
2740           stk->nfirstpos = stk->nlastpos = 1;
2741           stk++;
2742 
2743           firstpos->index = lastpos->index = i;
2744           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2745           firstpos++, lastpos++;
2746 
2747           break;
2748         }
2749 #ifdef DEBUG
2750       /* ... balance the above nonsyntactic #ifdef goo...  */
2751       fprintf (stderr, "node %td:", i);
2752       prtok (d->tokens[i]);
2753       putc ('\n', stderr);
2754       fprintf (stderr,
2755                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2756       fprintf (stderr, " firstpos:");
2757       for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2758         {
2759           fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2760           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2761         }
2762       fprintf (stderr, "\n lastpos:");
2763       for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2764         {
2765           fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2766           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2767         }
2768       putc ('\n', stderr);
2769 #endif
2770     }
2771 
2772   if (backward)
2773     {
2774       /* For each follow set that is the follow set of a real position,
2775          replace it with its epsilon closure.  */
2776       epsclosure (d, backward);
2777 
2778       for (idx_t i = 0; i < tindex; i++)
2779         free (backward[i].elems);
2780       free (backward);
2781     }
2782 
2783   dfaoptimize (d);
2784 
2785 #ifdef DEBUG
2786   for (idx_t i = 0; i < tindex; i++)
2787     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2788         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2789         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2790       {
2791         fprintf (stderr, "follows(%td:", i);
2792         prtok (d->tokens[i]);
2793         fprintf (stderr, "):");
2794         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2795           {
2796             fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2797             prtok (d->tokens[d->follows[i].elems[j].index]);
2798           }
2799         putc ('\n', stderr);
2800       }
2801 #endif
2802 
2803   pos.index = 0;
2804   pos.constraint = NO_CONSTRAINT;
2805 
2806   alloc_position_set (&tmp, 1);
2807 
2808   append (pos, &tmp);
2809 
2810   d->separates = xicalloc (tindex, sizeof *d->separates);
2811 
2812   for (idx_t i = 0; i < tindex; i++)
2813     {
2814       if (prev_newline_dependent (d->constraints[i]))
2815         d->separates[i] |= CTX_NEWLINE;
2816       if (prev_letter_dependent (d->constraints[i]))
2817         d->separates[i] |= CTX_LETTER;
2818 
2819       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2820         {
2821           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2822             d->separates[i] |= CTX_NEWLINE;
2823           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2824             d->separates[i] |= CTX_LETTER;
2825         }
2826     }
2827 
2828   /* Context wanted by some position.  */
2829   int separate_contexts = state_separate_contexts (d, &tmp);
2830 
2831   /* Build the initial state.  */
2832   if (separate_contexts & CTX_NEWLINE)
2833     state_index (d, &tmp, CTX_NEWLINE);
2834   d->initstate_notbol = d->min_trcount
2835     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2836   if (separate_contexts & CTX_LETTER)
2837     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2838   d->min_trcount++;
2839   d->trcount = 0;
2840 
2841   free (posalloc);
2842   free (stkalloc);
2843   free (merged.elems);
2844   free (tmp.elems);
2845 }
2846 
2847 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2848 static void
2849 realloc_trans_if_necessary (struct dfa *d)
     /* [previous][next][first][last][top][bottom][index][help] */
2850 {
2851   state_num oldalloc = d->tralloc;
2852   if (oldalloc < d->sindex)
2853     {
2854       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2855       idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2856       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2857                            -1, sizeof *realtrans);
2858       realtrans[0] = realtrans[1] = NULL;
2859       d->trans = realtrans + 2;
2860       idx_t newalloc = d->tralloc = newalloc1 - 2;
2861       d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2862       d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2863       d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2864       if (d->localeinfo.multibyte)
2865         {
2866           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2867           realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2868           if (oldalloc == 0)
2869             realtrans[0] = realtrans[1] = NULL;
2870           d->mb_trans = realtrans + 2;
2871         }
2872       for (; oldalloc < newalloc; oldalloc++)
2873         {
2874           d->trans[oldalloc] = NULL;
2875           d->fails[oldalloc] = NULL;
2876           if (d->localeinfo.multibyte)
2877             d->mb_trans[oldalloc] = NULL;
2878         }
2879     }
2880 }
2881 
2882 /*
2883    Calculate the transition table for a new state derived from state s
2884    for a compiled dfa d after input character uc, and return the new
2885    state number.
2886 
2887    Do not worry about all possible input characters; calculate just the group
2888    of positions that match uc.  Label it with the set of characters that
2889    every position in the group matches (taking into account, if necessary,
2890    preceding context information of s).  Then find the union
2891    of these positions' follows, i.e., the set of positions of the
2892    new state.  For each character in the group's label, set the transition
2893    on this character to be to a state corresponding to the set's positions,
2894    and its associated backward context information, if necessary.
2895 
2896    When building a searching matcher, include the positions of state
2897    0 in every state.
2898 
2899    The group is constructed by building an equivalence-class
2900    partition of the positions of s.
2901 
2902    For each position, find the set of characters C that it matches.  Eliminate
2903    any characters from C that fail on grounds of backward context.
2904 
2905    Check whether the group's label L has nonempty
2906    intersection with C.  If L - C is nonempty, create a new group labeled
2907    L - C and having the same positions as the current group, and set L to
2908    the intersection of L and C.  Insert the position in the group, set
2909    C = C - L, and resume scanning.
2910 
2911    If after comparing with every group there are characters remaining in C,
2912    create a new group labeled with the characters of C and insert this
2913    position in that group.  */
2914 
2915 static state_num
2916 build_state (state_num s, struct dfa *d, unsigned char uc)
     /* [previous][next][first][last][top][bottom][index][help] */
2917 {
2918   position_set follows;         /* Union of the follows for each
2919                                    position of the current state.  */
2920   position_set group;           /* Positions that match the input char.  */
2921   position_set tmp;             /* Temporary space for merging sets.  */
2922   state_num state;              /* New state.  */
2923   state_num state_newline;      /* New state on a newline transition.  */
2924   state_num state_letter;       /* New state on a letter transition.  */
2925 
2926 #ifdef DEBUG
2927   fprintf (stderr, "build state %td\n", s);
2928 #endif
2929 
2930   /* A pointer to the new transition table, and the table itself.  */
2931   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2932   state_num *trans = *ptrans;
2933 
2934   if (!trans)
2935     {
2936       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2937          transition tables that can exist at once, other than for
2938          initial states.  Often-used transition tables are quickly
2939          rebuilt, whereas rarely-used ones are cleared away.  */
2940       if (MAX_TRCOUNT <= d->trcount)
2941         {
2942           for (state_num i = d->min_trcount; i < d->tralloc; i++)
2943             {
2944               free (d->trans[i]);
2945               free (d->fails[i]);
2946               d->trans[i] = d->fails[i] = NULL;
2947             }
2948           d->trcount = 0;
2949         }
2950 
2951       d->trcount++;
2952       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2953 
2954       /* Fill transition table with a default value which means that the
2955          transited state has not been calculated yet.  */
2956       for (int i = 0; i < NOTCHAR; i++)
2957         trans[i] = -2;
2958     }
2959 
2960   /* Set up the success bits for this state.  */
2961   d->success[s] = 0;
2962   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2963     d->success[s] |= CTX_NEWLINE;
2964   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2965     d->success[s] |= CTX_LETTER;
2966   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2967     d->success[s] |= CTX_NONE;
2968 
2969   alloc_position_set (&follows, d->nleaves);
2970 
2971   /* Find the union of the follows of the positions of the group.
2972      This is a hideously inefficient loop.  Fix it someday.  */
2973   for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
2974     for (idx_t k = 0;
2975          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2976       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2977               &follows);
2978 
2979   /* Positions that match the input char.  */
2980   alloc_position_set (&group, d->nleaves);
2981 
2982   /* The group's label.  */
2983   charclass label;
2984   fillset (&label);
2985 
2986   for (idx_t i = 0; i < follows.nelem; i++)
2987     {
2988       charclass matches;            /* Set of matching characters.  */
2989       position pos = follows.elems[i];
2990       bool matched = false;
2991       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2992         {
2993           zeroset (&matches);
2994           setbit (d->tokens[pos.index], &matches);
2995           if (d->tokens[pos.index] == uc)
2996             matched = true;
2997         }
2998       else if (d->tokens[pos.index] >= CSET)
2999         {
3000           matches = d->charclasses[d->tokens[pos.index] - CSET];
3001           if (tstbit (uc, &matches))
3002             matched = true;
3003         }
3004       else if (d->tokens[pos.index] == ANYCHAR)
3005         {
3006           matches = d->charclasses[d->canychar];
3007           if (tstbit (uc, &matches))
3008             matched = true;
3009 
3010           /* ANYCHAR must match with a single character, so we must put
3011              it to D->states[s].mbps which contains the positions which
3012              can match with a single character not a byte.  If all
3013              positions which has ANYCHAR does not depend on context of
3014              next character, we put the follows instead of it to
3015              D->states[s].mbps to optimize.  */
3016           if (succeeds_in_context (pos.constraint, d->states[s].context,
3017                                    CTX_NONE))
3018             {
3019               if (d->states[s].mbps.nelem == 0)
3020                 alloc_position_set (&d->states[s].mbps, 1);
3021               insert (pos, &d->states[s].mbps);
3022             }
3023         }
3024       else
3025         continue;
3026 
3027       /* Some characters may need to be eliminated from matches because
3028          they fail in the current context.  */
3029       if (pos.constraint != NO_CONSTRAINT)
3030         {
3031           if (!succeeds_in_context (pos.constraint,
3032                                     d->states[s].context, CTX_NEWLINE))
3033             for (int j = 0; j < CHARCLASS_WORDS; j++)
3034               matches.w[j] &= ~d->syntax.newline.w[j];
3035           if (!succeeds_in_context (pos.constraint,
3036                                     d->states[s].context, CTX_LETTER))
3037             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3038               matches.w[j] &= ~d->syntax.letters.w[j];
3039           if (!succeeds_in_context (pos.constraint,
3040                                     d->states[s].context, CTX_NONE))
3041             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3042               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3043 
3044           /* If there are no characters left, there's no point in going on.  */
3045           if (emptyset (&matches))
3046             continue;
3047 
3048           /* If we have reset the bit that made us declare "matched", reset
3049              that indicator, too.  This is required to avoid an infinite loop
3050              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
3051           if (!tstbit (uc, &matches))
3052             matched = false;
3053         }
3054 
3055 #ifdef DEBUG
3056       fprintf (stderr, " nextpos %td:", pos.index);
3057       prtok (d->tokens[pos.index]);
3058       fprintf (stderr, " of");
3059       for (unsigned j = 0; j < NOTCHAR; j++)
3060         if (tstbit (j, &matches))
3061           fprintf (stderr, " 0x%02x", j);
3062       fprintf (stderr, "\n");
3063 #endif
3064 
3065       if (matched)
3066         {
3067           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3068             label.w[k] &= matches.w[k];
3069           append (pos, &group);
3070         }
3071       else
3072         {
3073           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3074             label.w[k] &= ~matches.w[k];
3075         }
3076     }
3077 
3078   alloc_position_set (&tmp, d->nleaves);
3079 
3080   if (group.nelem > 0)
3081     {
3082       /* If we are building a searching matcher, throw in the positions
3083          of state 0 as well, if possible.  */
3084       if (d->searchflag)
3085         {
3086           /* If a token in follows.elems is not 1st byte of a multibyte
3087              character, or the states of follows must accept the bytes
3088              which are not 1st byte of the multibyte character.
3089              Then, if a state of follows encounters a byte, it must not be
3090              a 1st byte of a multibyte character nor a single byte character.
3091              In this case, do not add state[0].follows to next state, because
3092              state[0] must accept 1st-byte.
3093 
3094              For example, suppose <sb a> is a certain single byte character,
3095              <mb A> is a certain multibyte character, and the codepoint of
3096              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
3097              state[0] accepts <sb a>, state[i] transits to state[i+1] by
3098              accepting the 1st byte of <mb A>, and state[i+1] accepts the
3099              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3100              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3101              not add state[0].  */
3102 
3103           bool mergeit = !d->localeinfo.multibyte;
3104           if (!mergeit)
3105             {
3106               mergeit = true;
3107               for (idx_t j = 0; mergeit && j < group.nelem; j++)
3108                 mergeit &= d->multibyte_prop[group.elems[j].index];
3109             }
3110           if (mergeit)
3111             merge2 (&group, &d->states[0].elems, &tmp);
3112         }
3113 
3114       /* Find out if the new state will want any context information,
3115          by calculating possible contexts that the group can match,
3116          and separate contexts that the new state wants to know.  */
3117       int possible_contexts = charclass_context (d, &label);
3118       int separate_contexts = state_separate_contexts (d, &group);
3119 
3120       /* Find the state(s) corresponding to the union of the follows.  */
3121       if (possible_contexts & ~separate_contexts)
3122         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3123       else
3124         state = -1;
3125       if (separate_contexts & possible_contexts & CTX_NEWLINE)
3126         state_newline = state_index (d, &group, CTX_NEWLINE);
3127       else
3128         state_newline = state;
3129       if (separate_contexts & possible_contexts & CTX_LETTER)
3130         state_letter = state_index (d, &group, CTX_LETTER);
3131       else
3132         state_letter = state;
3133 
3134       /* Reallocate now, to reallocate any newline transition properly.  */
3135       realloc_trans_if_necessary (d);
3136     }
3137 
3138   /* If we are a searching matcher, the default transition is to a state
3139      containing the positions of state 0, otherwise the default transition
3140      is to fail miserably.  */
3141   else if (d->searchflag)
3142     {
3143       state_newline = 0;
3144       state_letter = d->min_trcount - 1;
3145       state = d->initstate_notbol;
3146     }
3147   else
3148     {
3149       state_newline = -1;
3150       state_letter = -1;
3151       state = -1;
3152     }
3153 
3154   /* Set the transitions for each character in the label.  */
3155   for (int i = 0; i < NOTCHAR; i++)
3156     if (tstbit (i, &label))
3157       switch (d->syntax.sbit[i])
3158         {
3159         case CTX_NEWLINE:
3160           trans[i] = state_newline;
3161           break;
3162         case CTX_LETTER:
3163           trans[i] = state_letter;
3164           break;
3165         default:
3166           trans[i] = state;
3167           break;
3168         }
3169 
3170 #ifdef DEBUG
3171   fprintf (stderr, "trans table %td", s);
3172   for (int i = 0; i < NOTCHAR; ++i)
3173     {
3174       if (!(i & 0xf))
3175         fprintf (stderr, "\n");
3176       fprintf (stderr, " %2td", trans[i]);
3177     }
3178   fprintf (stderr, "\n");
3179 #endif
3180 
3181   free (group.elems);
3182   free (follows.elems);
3183   free (tmp.elems);
3184 
3185   /* Keep the newline transition in a special place so we can use it as
3186      a sentinel.  */
3187   if (tstbit (d->syntax.eolbyte, &label))
3188     {
3189       d->newlines[s] = trans[d->syntax.eolbyte];
3190       trans[d->syntax.eolbyte] = -1;
3191     }
3192 
3193   return trans[uc];
3194 }
3195 
3196 /* Multibyte character handling sub-routines for dfaexec.  */
3197 
3198 /* Consume a single byte and transit state from 's' to '*next_state'.
3199    This function is almost same as the state transition routin in dfaexec.
3200    But state transition is done just once, otherwise matching succeed or
3201    reach the end of the buffer.  */
3202 static state_num
3203 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
     /* [previous][next][first][last][top][bottom][index][help] */
3204 {
3205   state_num *t;
3206 
3207   if (d->trans[s])
3208     t = d->trans[s];
3209   else if (d->fails[s])
3210     t = d->fails[s];
3211   else
3212     {
3213       build_state (s, d, **pp);
3214       if (d->trans[s])
3215         t = d->trans[s];
3216       else
3217         {
3218           t = d->fails[s];
3219           assert (t);
3220         }
3221     }
3222 
3223   if (t[**pp] == -2)
3224     build_state (s, d, **pp);
3225 
3226   return t[*(*pp)++];
3227 }
3228 
3229 /* Transit state from s, then return new state and update the pointer of
3230    the buffer.  This function is for a period operator which can match a
3231    multi-byte character.  */
3232 static state_num
3233 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
     /* [previous][next][first][last][top][bottom][index][help] */
3234                unsigned char const *end)
3235 {
3236   wint_t wc;
3237 
3238   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3239 
3240   /* This state has some operators which can match a multibyte character.  */
3241   d->mb_follows.nelem = 0;
3242 
3243   /* Calculate the state which can be reached from the state 's' by
3244      consuming 'mbclen' single bytes from the buffer.  */
3245   state_num s1 = s;
3246   int mbci;
3247   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3248     s = transit_state_singlebyte (d, s, pp);
3249   *pp += mbclen - mbci;
3250 
3251   if (wc == WEOF)
3252     {
3253       /* It is an invalid character, so ANYCHAR is not accepted.  */
3254       return s;
3255     }
3256 
3257   /* If all positions which have ANYCHAR do not depend on the context
3258      of the next character, calculate the next state with
3259      pre-calculated follows and cache the result.  */
3260   if (d->states[s1].mb_trindex < 0)
3261     {
3262       if (MAX_TRCOUNT <= d->mb_trcount)
3263         {
3264           state_num s3;
3265           for (s3 = -1; s3 < d->tralloc; s3++)
3266             {
3267               free (d->mb_trans[s3]);
3268               d->mb_trans[s3] = NULL;
3269             }
3270 
3271           for (state_num i = 0; i < d->sindex; i++)
3272             d->states[i].mb_trindex = -1;
3273           d->mb_trcount = 0;
3274         }
3275       d->states[s1].mb_trindex = d->mb_trcount++;
3276     }
3277 
3278   if (! d->mb_trans[s])
3279     {
3280       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3281       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3282       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3283       for (int i = 0; i < MAX_TRCOUNT; i++)
3284         d->mb_trans[s][i] = -1;
3285     }
3286   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3287     return d->mb_trans[s][d->states[s1].mb_trindex];
3288 
3289   if (s == -1)
3290     copy (&d->states[s1].mbps, &d->mb_follows);
3291   else
3292     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3293 
3294   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3295   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3296   realloc_trans_if_necessary (d);
3297 
3298   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3299 
3300   return s2;
3301 }
3302 
3303 /* The initial state may encounter a byte which is not a single byte character
3304    nor the first byte of a multibyte character.  But it is incorrect for the
3305    initial state to accept such a byte.  For example, in Shift JIS the regular
3306    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3307    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3308    that are not a single byte character nor the first byte of a multibyte
3309    character.
3310 
3311    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3312    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3313    the result is greater than P, set *WCP to the final wide character
3314    processed, or to WEOF if no wide character is processed.  Otherwise,
3315    if WCP is non-NULL, *WCP may or may not be updated.
3316 
3317    Both P and MBP must be no larger than END.  */
3318 static unsigned char const *
3319 skip_remains_mb (struct dfa *d, unsigned char const *p,
     /* [previous][next][first][last][top][bottom][index][help] */
3320                  unsigned char const *mbp, char const *end)
3321 {
3322   if (d->syntax.never_trail[*p])
3323     return p;
3324   while (mbp < p)
3325     {
3326       wint_t wc;
3327       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3328                            end - (char const *) mbp, d);
3329     }
3330   return mbp;
3331 }
3332 
3333 /* Search through a buffer looking for a match to the struct dfa *D.
3334    Find the first occurrence of a string matching the regexp in the
3335    buffer, and the shortest possible version thereof.  Return a pointer to
3336    the first character after the match, or NULL if none is found.  BEGIN
3337    points to the beginning of the buffer, and END points to the first byte
3338    after its end.  Note however that we store a sentinel byte (usually
3339    newline) in *END, so the actual buffer must be one byte longer.
3340    When ALLOW_NL, newlines may appear in the matching string.
3341    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3342    If MULTIBYTE, the input consists of multibyte characters and/or
3343    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3344    Here is the list of features that make this DFA matcher punt:
3345     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3346     - [^...] in non-simple locale
3347     - [[=foo=]] or [[.foo.]]
3348     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3349     - back-reference: (.)\1
3350     - word-delimiter in multibyte locale: \<, \>, \b, \B
3351    See struct localeinfo.simple for the definition of "simple locale".  */
3352 
3353 static inline char *
3354 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
     /* [previous][next][first][last][top][bottom][index][help] */
3355               idx_t *count, bool multibyte)
3356 {
3357   if (MAX_TRCOUNT <= d->sindex)
3358     {
3359       for (state_num s = d->min_trcount; s < d->sindex; s++)
3360         {
3361           free (d->states[s].elems.elems);
3362           free (d->states[s].mbps.elems);
3363         }
3364       d->sindex = d->min_trcount;
3365 
3366       if (d->trans)
3367         {
3368           for (state_num s = 0; s < d->tralloc; s++)
3369             {
3370               free (d->trans[s]);
3371               free (d->fails[s]);
3372               d->trans[s] = d->fails[s] = NULL;
3373             }
3374           d->trcount = 0;
3375         }
3376 
3377       if (d->localeinfo.multibyte && d->mb_trans)
3378         {
3379           for (state_num s = -1; s < d->tralloc; s++)
3380             {
3381               free (d->mb_trans[s]);
3382               d->mb_trans[s] = NULL;
3383             }
3384           for (state_num s = 0; s < d->min_trcount; s++)
3385             d->states[s].mb_trindex = -1;
3386           d->mb_trcount = 0;
3387         }
3388     }
3389 
3390   if (!d->tralloc)
3391     realloc_trans_if_necessary (d);
3392 
3393   /* Current state.  */
3394   state_num s = 0, s1 = 0;
3395 
3396   /* Current input character.  */
3397   unsigned char const *p = (unsigned char const *) begin;
3398   unsigned char const *mbp = p;
3399 
3400   /* Copy of d->trans so it can be optimized into a register.  */
3401   state_num **trans = d->trans;
3402   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3403   unsigned char saved_end = *(unsigned char *) end;
3404   *end = eol;
3405 
3406   if (multibyte)
3407     {
3408       memset (&d->mbs, 0, sizeof d->mbs);
3409       if (d->mb_follows.alloc == 0)
3410         alloc_position_set (&d->mb_follows, d->nleaves);
3411     }
3412 
3413   idx_t nlcount = 0;
3414   for (;;)
3415     {
3416       state_num *t;
3417       while ((t = trans[s]) != NULL)
3418         {
3419           if (s < d->min_trcount)
3420             {
3421               if (!multibyte || d->states[s].mbps.nelem == 0)
3422                 {
3423                   while (t[*p] == s)
3424                     p++;
3425                 }
3426               if (multibyte)
3427                 p = mbp = skip_remains_mb (d, p, mbp, end);
3428             }
3429 
3430           if (multibyte)
3431             {
3432               s1 = s;
3433 
3434               if (d->states[s].mbps.nelem == 0
3435                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3436                 {
3437                   /* If an input character does not match ANYCHAR, do it
3438                      like a single-byte character.  */
3439                   s = t[*p++];
3440                 }
3441               else
3442                 {
3443                   s = transit_state (d, s, &p, (unsigned char *) end);
3444                   mbp = p;
3445                   trans = d->trans;
3446                 }
3447             }
3448           else
3449             {
3450               s1 = t[*p++];
3451               t = trans[s1];
3452               if (! t)
3453                 {
3454                   state_num tmp = s;
3455                   s = s1;
3456                   s1 = tmp;     /* swap */
3457                   break;
3458                 }
3459               if (s < d->min_trcount)
3460                 {
3461                   while (t[*p] == s1)
3462                     p++;
3463                 }
3464               s = t[*p++];
3465             }
3466         }
3467 
3468       if (s < 0)
3469         {
3470           if (s == -2)
3471             {
3472               s = build_state (s1, d, p[-1]);
3473               trans = d->trans;
3474             }
3475           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3476             {
3477               /* The previous character was a newline.  Count it, and skip
3478                  checking of multibyte character boundary until here.  */
3479               nlcount++;
3480               mbp = p;
3481 
3482               s = (allow_nl ? d->newlines[s1]
3483                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3484                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3485                    : d->initstate_notbol);
3486             }
3487           else
3488             {
3489               p = NULL;
3490               goto done;
3491             }
3492         }
3493       else if (d->fails[s])
3494         {
3495           if ((d->success[s] & d->syntax.sbit[*p])
3496               || ((char *) p == end
3497                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3498                                          d)))
3499             goto done;
3500 
3501           if (multibyte && s < d->min_trcount)
3502             p = mbp = skip_remains_mb (d, p, mbp, end);
3503 
3504           s1 = s;
3505           if (!multibyte || d->states[s].mbps.nelem == 0
3506               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3507             {
3508               /* If a input character does not match ANYCHAR, do it
3509                  like a single-byte character.  */
3510               s = d->fails[s][*p++];
3511             }
3512           else
3513             {
3514               s = transit_state (d, s, &p, (unsigned char *) end);
3515               mbp = p;
3516               trans = d->trans;
3517             }
3518         }
3519       else
3520         {
3521           build_state (s, d, p[0]);
3522           trans = d->trans;
3523         }
3524     }
3525 
3526  done:
3527   if (count)
3528     *count += nlcount;
3529   *end = saved_end;
3530   return (char *) p;
3531 }
3532 
3533 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3534    This is for performance, as dfaexec_main is an inline function.  */
3535 
3536 static char *
3537 dfaexec_mb (struct dfa *d, char const *begin, char *end,
     /* [previous][next][first][last][top][bottom][index][help] */
3538             bool allow_nl, idx_t *count, bool *backref)
3539 {
3540   return dfaexec_main (d, begin, end, allow_nl, count, true);
3541 }
3542 
3543 static char *
3544 dfaexec_sb (struct dfa *d, char const *begin, char *end,
     /* [previous][next][first][last][top][bottom][index][help] */
3545             bool allow_nl, idx_t *count, bool *backref)
3546 {
3547   return dfaexec_main (d, begin, end, allow_nl, count, false);
3548 }
3549 
3550 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3551    any regexp that uses a construct not supported by this code.  */
3552 static char *
3553 dfaexec_noop (struct dfa *d, char const *begin, char *end,
     /* [previous][next][first][last][top][bottom][index][help] */
3554               bool allow_nl, idx_t *count, bool *backref)
3555 {
3556   *backref = true;
3557   return (char *) begin;
3558 }
3559 
3560 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3561    but faster and set *BACKREF if the DFA code does not support this
3562    regexp usage.  */
3563 
3564 char *
3565 dfaexec (struct dfa *d, char const *begin, char *end,
     /* [previous][next][first][last][top][bottom][index][help] */
3566          bool allow_nl, idx_t *count, bool *backref)
3567 {
3568   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3569 }
3570 
3571 struct dfa *
3572 dfasuperset (struct dfa const *d)
     /* [previous][next][first][last][top][bottom][index][help] */
3573 {
3574   return d->superset;
3575 }
3576 
3577 bool
3578 dfaisfast (struct dfa const *d)
     /* [previous][next][first][last][top][bottom][index][help] */
3579 {
3580   return d->fast;
3581 }
3582 
3583 static void
3584 free_mbdata (struct dfa *d)
     /* [previous][next][first][last][top][bottom][index][help] */
3585 {
3586   free (d->multibyte_prop);
3587   free (d->lex.brack.chars);
3588   free (d->mb_follows.elems);
3589 
3590   if (d->mb_trans)
3591     {
3592       state_num s;
3593       for (s = -1; s < d->tralloc; s++)
3594         free (d->mb_trans[s]);
3595       free (d->mb_trans - 2);
3596     }
3597 }
3598 
3599 /* Return true if every construct in D is supported by this DFA matcher.  */
3600 bool
3601 dfasupported (struct dfa const *d)
     /* [previous][next][first][last][top][bottom][index][help] */
3602 {
3603   for (idx_t i = 0; i < d->tindex; i++)
3604     {
3605       switch (d->tokens[i])
3606         {
3607         case BEGWORD:
3608         case ENDWORD:
3609         case LIMWORD:
3610         case NOTLIMWORD:
3611           if (!d->localeinfo.multibyte)
3612             continue;
3613           FALLTHROUGH;
3614         case BACKREF:
3615         case MBCSET:
3616           return false;
3617         }
3618     }
3619   return true;
3620 }
3621 
3622 /* Disable use of the superset DFA if it is not likely to help
3623    performance.  */
3624 static void
3625 maybe_disable_superset_dfa (struct dfa *d)
     /* [previous][next][first][last][top][bottom][index][help] */
3626 {
3627   if (!d->localeinfo.using_utf8)
3628     return;
3629 
3630   bool have_backref = false;
3631   for (idx_t i = 0; i < d->tindex; i++)
3632     {
3633       switch (d->tokens[i])
3634         {
3635         case ANYCHAR:
3636           /* Lowered.  */
3637           abort ();
3638         case BACKREF:
3639           have_backref = true;
3640           break;
3641         case MBCSET:
3642           /* Requires multi-byte algorithm.  */
3643           return;
3644         default:
3645           break;
3646         }
3647     }
3648 
3649   if (!have_backref && d->superset)
3650     {
3651       /* The superset DFA is not likely to be much faster, so remove it.  */
3652       dfafree (d->superset);
3653       free (d->superset);
3654       d->superset = NULL;
3655     }
3656 
3657   free_mbdata (d);
3658   d->localeinfo.multibyte = false;
3659   d->dfaexec = dfaexec_sb;
3660   d->fast = true;
3661 }
3662 
3663 static void
3664 dfassbuild (struct dfa *d)
     /* [previous][next][first][last][top][bottom][index][help] */
3665 {
3666   struct dfa *sup = dfaalloc ();
3667 
3668   *sup = *d;
3669   sup->localeinfo.multibyte = false;
3670   sup->dfaexec = dfaexec_sb;
3671   sup->multibyte_prop = NULL;
3672   sup->superset = NULL;
3673   sup->states = NULL;
3674   sup->sindex = 0;
3675   sup->constraints = NULL;
3676   sup->separates = NULL;
3677   sup->follows = NULL;
3678   sup->tralloc = 0;
3679   sup->trans = NULL;
3680   sup->fails = NULL;
3681   sup->success = NULL;
3682   sup->newlines = NULL;
3683 
3684   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3685   if (d->cindex)
3686     {
3687       memcpy (sup->charclasses, d->charclasses,
3688               d->cindex * sizeof *sup->charclasses);
3689     }
3690 
3691   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3692   sup->talloc = d->tindex * 2;
3693 
3694   bool have_achar = false;
3695   bool have_nchar = false;
3696   idx_t j;
3697   for (idx_t i = j = 0; i < d->tindex; i++)
3698     {
3699       switch (d->tokens[i])
3700         {
3701         case ANYCHAR:
3702         case MBCSET:
3703         case BACKREF:
3704           {
3705             charclass ccl;
3706             fillset (&ccl);
3707             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3708             sup->tokens[j++] = STAR;
3709             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3710                 || d->tokens[i + 1] == PLUS)
3711               i++;
3712             have_achar = true;
3713           }
3714           break;
3715         case BEGWORD:
3716         case ENDWORD:
3717         case LIMWORD:
3718         case NOTLIMWORD:
3719           if (d->localeinfo.multibyte)
3720             {
3721               /* These constraints aren't supported in a multibyte locale.
3722                  Ignore them in the superset DFA.  */
3723               sup->tokens[j++] = EMPTY;
3724               break;
3725             }
3726           FALLTHROUGH;
3727         default:
3728           sup->tokens[j++] = d->tokens[i];
3729           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3730               || d->tokens[i] >= CSET)
3731             have_nchar = true;
3732           break;
3733         }
3734     }
3735   sup->tindex = j;
3736 
3737   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3738     d->superset = sup;
3739   else
3740     {
3741       dfafree (sup);
3742       free (sup);
3743     }
3744 }
3745 
3746 /* Parse a string S of length LEN into D (but skip this step if S is null).
3747    Then analyze D and build a matcher for it.
3748    SEARCHFLAG says whether to build a searching or an exact matcher.  */
3749 void
3750 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
     /* [previous][next][first][last][top][bottom][index][help] */
3751 {
3752   if (s != NULL)
3753     dfaparse (s, len, d);
3754 
3755   dfassbuild (d);
3756 
3757   if (dfasupported (d))
3758     {
3759       maybe_disable_superset_dfa (d);
3760       dfaanalyze (d, searchflag);
3761     }
3762   else
3763     {
3764       d->dfaexec = dfaexec_noop;
3765     }
3766 
3767   if (d->superset)
3768     {
3769       d->fast = true;
3770       dfaanalyze (d->superset, searchflag);
3771     }
3772 }
3773 
3774 /* Free the storage held by the components of a dfa.  */
3775 void
3776 dfafree (struct dfa *d)
     /* [previous][next][first][last][top][bottom][index][help] */
3777 {
3778   free (d->charclasses);
3779   free (d->tokens);
3780 
3781   if (d->localeinfo.multibyte)
3782     free_mbdata (d);
3783 
3784   free (d->constraints);
3785   free (d->separates);
3786 
3787   for (idx_t i = 0; i < d->sindex; i++)
3788     {
3789       free (d->states[i].elems.elems);
3790       free (d->states[i].mbps.elems);
3791     }
3792   free (d->states);
3793 
3794   if (d->follows)
3795     {
3796       for (idx_t i = 0; i < d->tindex; i++)
3797         free (d->follows[i].elems);
3798       free (d->follows);
3799     }
3800 
3801   if (d->trans)
3802     {
3803       for (idx_t i = 0; i < d->tralloc; i++)
3804         {
3805           free (d->trans[i]);
3806           free (d->fails[i]);
3807         }
3808 
3809       free (d->trans - 2);
3810       free (d->fails);
3811       free (d->newlines);
3812       free (d->success);
3813     }
3814 
3815   if (d->superset)
3816     {
3817       dfafree (d->superset);
3818       free (d->superset);
3819     }
3820 }
3821 
3822 /* Having found the postfix representation of the regular expression,
3823    try to find a long sequence of characters that must appear in any line
3824    containing the r.e.
3825    Finding a "longest" sequence is beyond the scope here;
3826    we take an easy way out and hope for the best.
3827    (Take "(ab|a)b"--please.)
3828 
3829    We do a bottom-up calculation of sequences of characters that must appear
3830    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3831    representation:
3832         sequences that must appear at the left of the match ("left")
3833         sequences that must appear at the right of the match ("right")
3834         lists of sequences that must appear somewhere in the match ("in")
3835         sequences that must constitute the match ("is")
3836 
3837    When we get to the root of the tree, we use one of the longest of its
3838    calculated "in" sequences as our answer.
3839 
3840    The sequences calculated for the various types of node (in pseudo ANSI c)
3841    are shown below.  "p" is the operand of unary operators (and the left-hand
3842    operand of binary operators); "q" is the right-hand operand of binary
3843    operators.
3844 
3845    "ZERO" means "a zero-length sequence" below.
3846 
3847         Type    left            right           is              in
3848         ----    ----            -----           --              --
3849         char c  # c             # c             # c             # c
3850 
3851         ANYCHAR ZERO            ZERO            ZERO            ZERO
3852 
3853         MBCSET  ZERO            ZERO            ZERO            ZERO
3854 
3855         CSET    ZERO            ZERO            ZERO            ZERO
3856 
3857         STAR    ZERO            ZERO            ZERO            ZERO
3858 
3859         QMARK   ZERO            ZERO            ZERO            ZERO
3860 
3861         PLUS    p->left         p->right        ZERO            p->in
3862 
3863         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3864                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3865                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3866                                                 ZERO
3867 
3868         OR      longest common  longest common  (do p->is and substrings common
3869                 leading         trailing        to q->is have same p->in and
3870                 (sub)sequence   (sub)sequence   q->in length and content) ?
3871                 of p->left      of p->right
3872                 and q->left     and q->right    p->is : NULL
3873 
3874    If there's anything else we recognize in the tree, all four sequences get set
3875    to zero-length sequences.  If there's something we don't recognize in the
3876    tree, we just return a zero-length sequence.
3877 
3878    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3879    'aaa')?
3880 
3881    And ... is it here or someplace that we might ponder "optimizations" such as
3882         egrep 'psi|epsilon'     ->      egrep 'psi'
3883         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3884                                         (Yes, we now find "epsi" as a "string
3885                                         that must occur", but we might also
3886                                         simplify the *entire* r.e. being sought)
3887         grep '[c]'              ->      grep 'c'
3888         grep '(ab|a)b'          ->      grep 'ab'
3889         grep 'ab*'              ->      grep 'a'
3890         grep 'a*b'              ->      grep 'b'
3891 
3892    There are several issues:
3893 
3894    Is optimization easy (enough)?
3895 
3896    Does optimization actually accomplish anything,
3897    or is the automaton you get from "psi|epsilon" (for example)
3898    the same as the one you get from "psi" (for example)?
3899 
3900    Are optimizable r.e.'s likely to be used in real-life situations
3901    (something like 'ab*' is probably unlikely; something like is
3902    'psi|epsilon' is likelier)?  */
3903 
3904 static char *
3905 icatalloc (char *old, char const *new)
     /* [previous][next][first][last][top][bottom][index][help] */
3906 {
3907   idx_t newsize = strlen (new);
3908   if (newsize == 0)
3909     return old;
3910   idx_t oldsize = strlen (old);
3911   char *result = xirealloc (old, oldsize + newsize + 1);
3912   memcpy (result + oldsize, new, newsize + 1);
3913   return result;
3914 }
3915 
3916 static void
3917 freelist (char **cpp)
     /* [previous][next][first][last][top][bottom][index][help] */
3918 {
3919   while (*cpp)
3920     free (*cpp++);
3921 }
3922 
3923 static char **
3924 enlistnew (char **cpp, char *new)
     /* [previous][next][first][last][top][bottom][index][help] */
3925 {
3926   /* Is there already something in the list that's new (or longer)?  */
3927   idx_t i;
3928   for (i = 0; cpp[i] != NULL; i++)
3929     if (strstr (cpp[i], new) != NULL)
3930       {
3931         free (new);
3932         return cpp;
3933       }
3934   /* Eliminate any obsoleted strings.  */
3935   for (idx_t j = 0; cpp[j] != NULL; )
3936     if (strstr (new, cpp[j]) == NULL)
3937       ++j;
3938     else
3939       {
3940         free (cpp[j]);
3941         if (--i == j)
3942           break;
3943         cpp[j] = cpp[i];
3944         cpp[i] = NULL;
3945       }
3946   /* Add the new string.  */
3947   cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
3948   cpp[i] = new;
3949   cpp[i + 1] = NULL;
3950   return cpp;
3951 }
3952 
3953 static char **
3954 enlist (char **cpp, char const *str, idx_t len)
     /* [previous][next][first][last][top][bottom][index][help] */
3955 {
3956   return enlistnew (cpp, ximemdup0 (str, len));
3957 }
3958 
3959 /* Given pointers to two strings, return a pointer to an allocated
3960    list of their distinct common substrings.  */
3961 static char **
3962 comsubs (char *left, char const *right)
     /* [previous][next][first][last][top][bottom][index][help] */
3963 {
3964   char **cpp = xzalloc (sizeof *cpp);
3965 
3966   for (char *lcp = left; *lcp != '\0'; lcp++)
3967     {
3968       idx_t len = 0;
3969       char *rcp = strchr (right, *lcp);
3970       while (rcp != NULL)
3971         {
3972           idx_t i;
3973           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3974             continue;
3975           if (i > len)
3976             len = i;
3977           rcp = strchr (rcp + 1, *lcp);
3978         }
3979       if (len != 0)
3980         cpp = enlist (cpp, lcp, len);
3981     }
3982   return cpp;
3983 }
3984 
3985 static char **
3986 addlists (char **old, char **new)
     /* [previous][next][first][last][top][bottom][index][help] */
3987 {
3988   for (; *new; new++)
3989     old = enlistnew (old, xstrdup (*new));
3990   return old;
3991 }
3992 
3993 /* Given two lists of substrings, return a new list giving substrings
3994    common to both.  */
3995 static char **
3996 inboth (char **left, char **right)
     /* [previous][next][first][last][top][bottom][index][help] */
3997 {
3998   char **both = xzalloc (sizeof *both);
3999 
4000   for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4001     {
4002       for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4003         {
4004           char **temp = comsubs (left[lnum], right[rnum]);
4005           both = addlists (both, temp);
4006           freelist (temp);
4007           free (temp);
4008         }
4009     }
4010   return both;
4011 }
4012 
4013 typedef struct must must;
4014 
4015 struct must
4016 {
4017   char **in;
4018   char *left;
4019   char *right;
4020   char *is;
4021   bool begline;
4022   bool endline;
4023   must *prev;
4024 };
4025 
4026 static must *
4027 allocmust (must *mp, idx_t size)
     /* [previous][next][first][last][top][bottom][index][help] */
4028 {
4029   must *new_mp = xmalloc (sizeof *new_mp);
4030   new_mp->in = xzalloc (sizeof *new_mp->in);
4031   new_mp->left = xizalloc (size);
4032   new_mp->right = xizalloc (size);
4033   new_mp->is = xizalloc (size);
4034   new_mp->begline = false;
4035   new_mp->endline = false;
4036   new_mp->prev = mp;
4037   return new_mp;
4038 }
4039 
4040 static void
4041 resetmust (must *mp)
     /* [previous][next][first][last][top][bottom][index][help] */
4042 {
4043   freelist (mp->in);
4044   mp->in[0] = NULL;
4045   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4046   mp->begline = false;
4047   mp->endline = false;
4048 }
4049 
4050 static void
4051 freemust (must *mp)
     /* [previous][next][first][last][top][bottom][index][help] */
4052 {
4053   freelist (mp->in);
4054   free (mp->in);
4055   free (mp->left);
4056   free (mp->right);
4057   free (mp->is);
4058   free (mp);
4059 }
4060 
4061 struct dfamust *
4062 dfamust (struct dfa const *d)
     /* [previous][next][first][last][top][bottom][index][help] */
4063 {
4064   must *mp = NULL;
4065   char const *result = "";
4066   bool exact = false;
4067   bool begline = false;
4068   bool endline = false;
4069   bool need_begline = false;
4070   bool need_endline = false;
4071   bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4072 
4073   for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4074     {
4075       token t = d->tokens[ri];
4076       switch (t)
4077         {
4078         case BEGLINE:
4079           mp = allocmust (mp, 2);
4080           mp->begline = true;
4081           need_begline = true;
4082           break;
4083         case ENDLINE:
4084           mp = allocmust (mp, 2);
4085           mp->endline = true;
4086           need_endline = true;
4087           break;
4088         case LPAREN:
4089         case RPAREN:
4090           assert (!"neither LPAREN nor RPAREN may appear here");
4091 
4092         case EMPTY:
4093         case BEGWORD:
4094         case ENDWORD:
4095         case LIMWORD:
4096         case NOTLIMWORD:
4097         case BACKREF:
4098         case ANYCHAR:
4099         case MBCSET:
4100           mp = allocmust (mp, 2);
4101           break;
4102 
4103         case STAR:
4104         case QMARK:
4105           assume_nonnull (mp);
4106           resetmust (mp);
4107           break;
4108 
4109         case OR:
4110           {
4111             char **new;
4112             must *rmp = mp;
4113             assume_nonnull (rmp);
4114             must *lmp = mp = mp->prev;
4115             assume_nonnull (lmp);
4116             idx_t j, ln, rn, n;
4117 
4118             /* Guaranteed to be.  Unlikely, but ...  */
4119             if (streq (lmp->is, rmp->is))
4120               {
4121                 lmp->begline &= rmp->begline;
4122                 lmp->endline &= rmp->endline;
4123               }
4124             else
4125               {
4126                 lmp->is[0] = '\0';
4127                 lmp->begline = false;
4128                 lmp->endline = false;
4129               }
4130             /* Left side--easy */
4131             idx_t i = 0;
4132             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4133               ++i;
4134             lmp->left[i] = '\0';
4135             /* Right side */
4136             ln = strlen (lmp->right);
4137             rn = strlen (rmp->right);
4138             n = ln;
4139             if (n > rn)
4140               n = rn;
4141             for (i = 0; i < n; ++i)
4142               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4143                 break;
4144             for (j = 0; j < i; ++j)
4145               lmp->right[j] = lmp->right[(ln - i) + j];
4146             lmp->right[j] = '\0';
4147             new = inboth (lmp->in, rmp->in);
4148             freelist (lmp->in);
4149             free (lmp->in);
4150             lmp->in = new;
4151             freemust (rmp);
4152           }
4153           break;
4154 
4155         case PLUS:
4156           assume_nonnull (mp);
4157           mp->is[0] = '\0';
4158           break;
4159 
4160         case END:
4161           assume_nonnull (mp);
4162           assert (!mp->prev);
4163           for (idx_t i = 0; mp->in[i] != NULL; i++)
4164             if (strlen (mp->in[i]) > strlen (result))
4165               result = mp->in[i];
4166           if (streq (result, mp->is))
4167             {
4168               if ((!need_begline || mp->begline) && (!need_endline
4169                                                      || mp->endline))
4170                 exact = true;
4171               begline = mp->begline;
4172               endline = mp->endline;
4173             }
4174           goto done;
4175 
4176         case CAT:
4177           {
4178             must *rmp = mp;
4179             assume_nonnull (rmp);
4180             must *lmp = mp = mp->prev;
4181             assume_nonnull (lmp);
4182 
4183             /* In.  Everything in left, plus everything in
4184                right, plus concatenation of
4185                left's right and right's left.  */
4186             lmp->in = addlists (lmp->in, rmp->in);
4187             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4188               {
4189                 idx_t lrlen = strlen (lmp->right);
4190                 idx_t rllen = strlen (rmp->left);
4191                 char *tp = ximalloc (lrlen + rllen + 1);
4192                 memcpy (tp + lrlen, rmp->left, rllen + 1);
4193                 memcpy (tp, lmp->right, lrlen);
4194                 lmp->in = enlistnew (lmp->in, tp);
4195               }
4196             /* Left-hand */
4197             if (lmp->is[0] != '\0')
4198               lmp->left = icatalloc (lmp->left, rmp->left);
4199             /* Right-hand */
4200             if (rmp->is[0] == '\0')
4201               lmp->right[0] = '\0';
4202             lmp->right = icatalloc (lmp->right, rmp->right);
4203             /* Guaranteed to be */
4204             if ((lmp->is[0] != '\0' || lmp->begline)
4205                 && (rmp->is[0] != '\0' || rmp->endline))
4206               {
4207                 lmp->is = icatalloc (lmp->is, rmp->is);
4208                 lmp->endline = rmp->endline;
4209               }
4210             else
4211               {
4212                 lmp->is[0] = '\0';
4213                 lmp->begline = false;
4214                 lmp->endline = false;
4215               }
4216             freemust (rmp);
4217           }
4218           break;
4219 
4220         case '\0':
4221           /* Not on *my* shift.  */
4222           goto done;
4223 
4224         default:
4225           if (CSET <= t)
4226             {
4227               /* If T is a singleton, or if case-folding in a unibyte
4228                  locale and T's members all case-fold to the same char,
4229                  convert T to one of its members.  Otherwise, do
4230                  nothing further with T.  */
4231               charclass *ccl = &d->charclasses[t - CSET];
4232               int j;
4233               for (j = 0; j < NOTCHAR; j++)
4234                 if (tstbit (j, ccl))
4235                   break;
4236               if (! (j < NOTCHAR))
4237                 {
4238                   mp = allocmust (mp, 2);
4239                   break;
4240                 }
4241               t = j;
4242               while (++j < NOTCHAR)
4243                 if (tstbit (j, ccl)
4244                     && ! (case_fold_unibyte
4245                           && toupper (j) == toupper (t)))
4246                   break;
4247               if (j < NOTCHAR)
4248                 {
4249                   mp = allocmust (mp, 2);
4250                   break;
4251                 }
4252             }
4253 
4254           idx_t rj = ri + 2;
4255           if (d->tokens[ri + 1] == CAT)
4256             {
4257               for (; rj < d->tindex - 1; rj += 2)
4258                 {
4259                   if ((rj != ri && (d->tokens[rj] <= 0
4260                                     || NOTCHAR <= d->tokens[rj]))
4261                       || d->tokens[rj + 1] != CAT)
4262                     break;
4263                 }
4264             }
4265           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4266           mp->is[0] = mp->left[0] = mp->right[0]
4267             = case_fold_unibyte ? toupper (t) : t;
4268 
4269           idx_t i;
4270           for (i = 1; ri + 2 < rj; i++)
4271             {
4272               ri += 2;
4273               t = d->tokens[ri];
4274               mp->is[i] = mp->left[i] = mp->right[i]
4275                 = case_fold_unibyte ? toupper (t) : t;
4276             }
4277           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4278           mp->in = enlist (mp->in, mp->is, i);
4279           break;
4280         }
4281     }
4282  done:;
4283 
4284   struct dfamust *dm = NULL;
4285   if (*result)
4286     {
4287       dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4288       dm->exact = exact;
4289       dm->begline = begline;
4290       dm->endline = endline;
4291       strcpy (dm->must, result);
4292     }
4293 
4294   while (mp)
4295     {
4296       must *prev = mp->prev;
4297       freemust (mp);
4298       mp = prev;
4299     }
4300 
4301   return dm;
4302 }
4303 
4304 void
4305 dfamustfree (struct dfamust *dm)
     /* [previous][next][first][last][top][bottom][index][help] */
4306 {
4307   free (dm);
4308 }
4309 
4310 struct dfa *
4311 dfaalloc (void)
     /* [previous][next][first][last][top][bottom][index][help] */
4312 {
4313   return xmalloc (sizeof (struct dfa));
4314 }
4315 
4316 /* Initialize DFA.  */
4317 void
4318 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
     /* [previous][next][first][last][top][bottom][index][help] */
4319            reg_syntax_t bits, int dfaopts)
4320 {
4321   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4322   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4323   dfa->localeinfo = *linfo;
4324 
4325   dfa->fast = !dfa->localeinfo.multibyte;
4326 
4327   dfa->canychar = -1;
4328   dfa->syntax.syntax_bits_set = true;
4329   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4330   dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4331   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4332   dfa->syntax.syntax_bits = bits;
4333 
4334   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4335     {
4336       unsigned char uc = i;
4337 
4338       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4339       switch (dfa->syntax.sbit[uc])
4340         {
4341         case CTX_LETTER:
4342           setbit (uc, &dfa->syntax.letters);
4343           break;
4344         case CTX_NEWLINE:
4345           setbit (uc, &dfa->syntax.newline);
4346           break;
4347         }
4348 
4349       /* POSIX requires that the five bytes in "\n\r./" (including the
4350          terminating NUL) cannot occur inside a multibyte character.  */
4351       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4352                                      ? (uc & 0xc0) != 0x80
4353                                      : strchr ("\n\r./", uc) != NULL);
4354     }
4355 }
4356 
4357 /* Initialize TO by copying FROM's syntax settings.  */
4358 void
4359 dfacopysyntax (struct dfa *to, struct dfa const *from)
     /* [previous][next][first][last][top][bottom][index][help] */
4360 {
4361   memset (to, 0, offsetof (struct dfa, syntax));
4362   to->canychar = -1;
4363   to->fast = from->fast;
4364   to->syntax = from->syntax;
4365   to->dfaexec = from->dfaexec;
4366   to->localeinfo = from->localeinfo;
4367 }
4368 
4369 /* vim:set shiftwidth=2: */

/* [previous][next][first][last][top][bottom][index][help] */