This source file includes following definitions.
- streq
- isasciidigit
- isblank
- to_uchar
- newline_constraint
- letter_constraint
- other_constraint
- succeeds_in_context
- prev_newline_dependent
- prev_letter_dependent
- accepting
- accepts_in_context
- mbs_to_wchar
- prtok
- tstbit
- setbit
- clrbit
- zeroset
- fillset
- notset
- equal
- emptyset
- maybe_realloc
- charclass_index
- unibyte_word_constituent
- char_context
- setbit_wc
- setbit_case_fold_c
- fetch_wc
- bracket_fetch_wc
- find_pred
- parse_bracket_exp
- push_lex_state
- pop_lex_state
- lex
- addtok_mb
- addtok
- addtok_wc
- add_utf8_anychar
- atom
- nsubtoks
- copytoks
- closure
- branch
- regexp
- dfaparse
- copy
- alloc_position_set
- insert
- append
- merge_constrained
- merge
- merge2
- delete
- replace
- state_index
- epsclosure
- charclass_context
- state_separate_contexts
- merge_nfa_state
- compare
- reorder_tokens
- dfaoptimize
- dfaanalyze
- realloc_trans_if_necessary
- build_state
- transit_state_singlebyte
- transit_state
- skip_remains_mb
- dfaexec_main
- dfaexec_mb
- dfaexec_sb
- dfaexec_noop
- dfaexec
- dfasuperset
- dfaisfast
- free_mbdata
- dfasupported
- maybe_disable_superset_dfa
- dfassbuild
- dfacomp
- dfafree
- icatalloc
- freelist
- enlistnew
- enlist
- comsubs
- addlists
- inboth
- allocmust
- resetmust
- freemust
- dfamust
- dfamustfree
- dfaalloc
- dfasyntax
- dfacopysyntax
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 #include <config.h>
24
25 #include "dfa.h"
26
27 #include "flexmember.h"
28 #include "idx.h"
29 #include "verify.h"
30
31 #include <assert.h>
32 #include <ctype.h>
33 #include <stdint.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <limits.h>
37 #include <string.h>
38
39
40
41
42
43
44 #define assume_nonnull(x) assume ((x) != NULL)
45
46 static bool
47 streq (char const *a, char const *b)
48 {
49 return strcmp (a, b) == 0;
50 }
51
52 static bool
53 isasciidigit (char c)
54 {
55 return '0' <= c && c <= '9';
56 }
57
58 #include "gettext.h"
59 #define _(str) gettext (str)
60
61 #include <wchar.h>
62
63 #include "xalloc.h"
64 #include "localeinfo.h"
65
66 #ifndef FALLTHROUGH
67 # if 201710L < __STDC_VERSION__
68 # define FALLTHROUGH [[__fallthrough__]]
69 # elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
70 # define FALLTHROUGH __attribute__ ((__fallthrough__))
71 # else
72 # define FALLTHROUGH ((void) 0)
73 # endif
74 #endif
75
76 #ifndef MIN
77 # define MIN(a,b) ((a) < (b) ? (a) : (b))
78 #endif
79
80
81 #ifdef setbit
82 # undef setbit
83 #endif
84 #ifdef clrbit
85 # undef clrbit
86 #endif
87
88
89 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
90 # define isblank dfa_isblank
91 static int
92 isblank (int c)
93 {
94 return c == ' ' || c == '\t';
95 }
96 #endif
97
98
99 enum { NOTCHAR = 1 << CHAR_BIT };
100
101 #ifdef UINT_LEAST64_MAX
102
103
104 enum { CHARCLASS_WORD_BITS = 64 };
105
106
107
108 typedef uint_least64_t charclass_word;
109
110
111
112
113 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
114
115 #else
116
117 enum { CHARCLASS_WORD_BITS = 32 };
118 typedef unsigned long charclass_word;
119 # define CHARCLASS_PAIR(lo, hi) lo, hi
120 #endif
121
122
123 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
124 {{ \
125 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
126 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
127 }}
128
129
130 static charclass_word const CHARCLASS_WORD_MASK
131 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
132
133
134 enum
135 {
136 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
137 };
138
139
140 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
141
142
143
144
145 static unsigned char
146 to_uchar (char ch)
147 {
148 return ch;
149 }
150
151
152
153
154
155
156
157
158
159
160
161
162
163 enum
164 {
165 CTX_NONE = 1,
166 CTX_LETTER = 2,
167 CTX_NEWLINE = 4,
168 CTX_ANY = 7
169 };
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186 static int
187 newline_constraint (int constraint)
188 {
189 return (constraint >> 6) & 7;
190 }
191 static int
192 letter_constraint (int constraint)
193 {
194 return (constraint >> 3) & 7;
195 }
196 static int
197 other_constraint (int constraint)
198 {
199 return constraint & 7;
200 }
201
202 static bool
203 succeeds_in_context (int constraint, int prev, int curr)
204 {
205 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
206 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
207 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
208 & prev);
209 }
210
211
212 static bool
213 prev_newline_dependent (int constraint)
214 {
215 return ((constraint ^ constraint >> 2) & 0111) != 0;
216 }
217 static bool
218 prev_letter_dependent (int constraint)
219 {
220 return ((constraint ^ constraint >> 1) & 0111) != 0;
221 }
222
223
224
225
226
227 enum
228 {
229 NO_CONSTRAINT = 0777,
230 BEGLINE_CONSTRAINT = 0444,
231 ENDLINE_CONSTRAINT = 0700,
232 BEGWORD_CONSTRAINT = 0050,
233 ENDWORD_CONSTRAINT = 0202,
234 LIMWORD_CONSTRAINT = 0252,
235 NOTLIMWORD_CONSTRAINT = 0525
236 };
237
238
239
240
241
242 typedef ptrdiff_t token;
243 static token const TOKEN_MAX = PTRDIFF_MAX;
244
245
246
247 typedef ptrdiff_t state_num;
248
249
250 enum
251 {
252 END = -1,
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267 EMPTY = NOTCHAR,
268
269
270 QMARK,
271
272
273
274 STAR,
275
276
277
278 PLUS,
279
280
281
282 REPMN,
283
284
285
286 CAT,
287
288
289
290
291 OR,
292
293
294 LPAREN,
295
296
297 RPAREN,
298
299 WCHAR,
300
301
302 ANYCHAR,
303
304
305
306 BEG,
307
308
309 BEGLINE,
310
311
312
313 ENDLINE,
314
315
316 BEGWORD,
317
318
319
320 ENDWORD,
321
322
323 LIMWORD,
324
325
326
327 NOTLIMWORD,
328
329
330
331 BACKREF,
332
333
334
335
336
337
338
339 MBCSET,
340
341
342 CSET
343
344
345 };
346
347
348
349
350
351
352 typedef struct
353 {
354 idx_t index;
355 unsigned int constraint;
356 } position;
357
358
359 typedef struct
360 {
361 position *elems;
362 idx_t nelem;
363 idx_t alloc;
364 } position_set;
365
366
367
368
369 typedef struct
370 {
371 size_t hash;
372 position_set elems;
373 unsigned char context;
374 unsigned short constraint;
375 position_set mbps;
376
377
378 state_num mb_trindex;
379
380
381 } dfa_state;
382
383
384
385 enum { MAX_TRCOUNT = 1024 };
386
387
388
389 struct mb_char_classes
390 {
391 ptrdiff_t cset;
392 bool invert;
393 wchar_t *chars;
394 idx_t nchars;
395 idx_t nchars_alloc;
396 };
397
398 struct regex_syntax
399 {
400
401 reg_syntax_t syntax_bits;
402 bool syntax_bits_set;
403
404
405 bool case_fold;
406
407
408
409 bool anchor;
410
411
412 unsigned char eolbyte;
413
414
415 char sbit[NOTCHAR];
416
417
418
419 bool never_trail[NOTCHAR];
420
421
422 charclass letters;
423
424
425 charclass newline;
426 };
427
428
429
430
431
432 struct lexer_state
433 {
434 char const *ptr;
435 idx_t left;
436 token lasttok;
437 idx_t parens;
438 int minrep, maxrep;
439
440
441
442
443 wint_t wctok;
444
445
446 struct mb_char_classes brack;
447
448
449 bool laststart;
450 };
451
452
453
454 struct parser_state
455 {
456 token tok;
457 idx_t depth;
458
459
460
461
462 };
463
464
465 struct dfa
466 {
467
468 charclass *charclasses;
469 idx_t cindex;
470 idx_t calloc;
471 ptrdiff_t canychar;
472
473
474 struct lexer_state lex;
475
476
477 struct parser_state parse;
478
479
480 token *tokens;
481 idx_t tindex;
482 idx_t talloc;
483 idx_t depth;
484
485
486 idx_t nleaves;
487
488 idx_t nregexps;
489
490 bool fast;
491 bool epsilon;
492 token utf8_anychar_classes[9];
493 mbstate_t mbs;
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511 char *multibyte_prop;
512
513
514 struct dfa *superset;
515
516
517 dfa_state *states;
518 state_num sindex;
519 idx_t salloc;
520
521
522 position_set *follows;
523
524
525
526
527
528
529 bool searchflag;
530
531
532
533
534
535
536
537
538 int *constraints;
539
540 int *separates;
541
542
543
544 state_num tralloc;
545
546
547 int trcount;
548
549
550 int min_trcount;
551
552
553 state_num **trans;
554
555
556
557
558
559
560
561 state_num **fails;
562
563
564
565 char *success;
566
567 state_num *newlines;
568
569
570
571
572
573
574 state_num initstate_notbol;
575
576
577
578 position_set mb_follows;
579 state_num **mb_trans;
580
581 state_num mb_trcount;
582
583
584
585
586 struct regex_syntax syntax;
587
588
589
590
591
592 char *(*dfaexec) (struct dfa *, char const *, char *,
593 bool, idx_t *, bool *);
594
595
596 struct localeinfo localeinfo;
597 };
598
599
600
601
602 static bool
603 accepting (state_num s, struct dfa const *r)
604 {
605 return r->states[s].constraint != 0;
606 }
607
608
609 static bool
610 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
611 {
612 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
613 }
614
615 static void regexp (struct dfa *dfa);
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634 static int
635 mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
636 {
637 unsigned char uc = s[0];
638 wint_t wc = d->localeinfo.sbctowc[uc];
639
640 if (wc == WEOF)
641 {
642 wchar_t wch;
643 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
644 if (0 < nbytes && nbytes < (size_t) -2)
645 {
646 *pwc = wch;
647 return nbytes;
648 }
649 memset (&d->mbs, 0, sizeof d->mbs);
650 }
651
652 *pwc = wc;
653 return 1;
654 }
655
656 #ifdef DEBUG
657
658 static void
659 prtok (token t)
660 {
661 if (t <= END)
662 fprintf (stderr, "END");
663 else if (0 <= t && t < NOTCHAR)
664 {
665 unsigned int ch = t;
666 fprintf (stderr, "0x%02x", ch);
667 }
668 else
669 {
670 char const *s;
671 switch (t)
672 {
673 case BEG:
674 s = "BEG";
675 break;
676 case EMPTY:
677 s = "EMPTY";
678 break;
679 case BACKREF:
680 s = "BACKREF";
681 break;
682 case BEGLINE:
683 s = "BEGLINE";
684 break;
685 case ENDLINE:
686 s = "ENDLINE";
687 break;
688 case BEGWORD:
689 s = "BEGWORD";
690 break;
691 case ENDWORD:
692 s = "ENDWORD";
693 break;
694 case LIMWORD:
695 s = "LIMWORD";
696 break;
697 case NOTLIMWORD:
698 s = "NOTLIMWORD";
699 break;
700 case QMARK:
701 s = "QMARK";
702 break;
703 case STAR:
704 s = "STAR";
705 break;
706 case PLUS:
707 s = "PLUS";
708 break;
709 case CAT:
710 s = "CAT";
711 break;
712 case OR:
713 s = "OR";
714 break;
715 case LPAREN:
716 s = "LPAREN";
717 break;
718 case RPAREN:
719 s = "RPAREN";
720 break;
721 case ANYCHAR:
722 s = "ANYCHAR";
723 break;
724 case MBCSET:
725 s = "MBCSET";
726 break;
727 default:
728 s = "CSET";
729 break;
730 }
731 fprintf (stderr, "%s", s);
732 }
733 }
734 #endif
735
736
737
738 static bool
739 tstbit (unsigned int b, charclass const *c)
740 {
741 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
742 }
743
744 static void
745 setbit (unsigned int b, charclass *c)
746 {
747 charclass_word one = 1;
748 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
749 }
750
751 static void
752 clrbit (unsigned int b, charclass *c)
753 {
754 charclass_word one = 1;
755 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
756 }
757
758 static void
759 zeroset (charclass *s)
760 {
761 memset (s, 0, sizeof *s);
762 }
763
764 static void
765 fillset (charclass *s)
766 {
767 for (int i = 0; i < CHARCLASS_WORDS; i++)
768 s->w[i] = CHARCLASS_WORD_MASK;
769 }
770
771 static void
772 notset (charclass *s)
773 {
774 for (int i = 0; i < CHARCLASS_WORDS; ++i)
775 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
776 }
777
778 static bool
779 equal (charclass const *s1, charclass const *s2)
780 {
781 charclass_word w = 0;
782 for (int i = 0; i < CHARCLASS_WORDS; i++)
783 w |= s1->w[i] ^ s2->w[i];
784 return w == 0;
785 }
786
787 static bool
788 emptyset (charclass const *s)
789 {
790 charclass_word w = 0;
791 for (int i = 0; i < CHARCLASS_WORDS; i++)
792 w |= s->w[i];
793 return w == 0;
794 }
795
796
797
798
799
800
801
802
803
804
805 static void *
806 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
807 ptrdiff_t nitems_max, idx_t item_size)
808 {
809 if (i < *nitems)
810 return pa;
811 return xpalloc (pa, nitems, 1, nitems_max, item_size);
812 }
813
814
815 static idx_t
816 charclass_index (struct dfa *d, charclass const *s)
817 {
818 idx_t i;
819
820 for (i = 0; i < d->cindex; ++i)
821 if (equal (s, &d->charclasses[i]))
822 return i;
823 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
824 TOKEN_MAX - CSET, sizeof *d->charclasses);
825 ++d->cindex;
826 d->charclasses[i] = *s;
827 return i;
828 }
829
830 static bool
831 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
832 {
833 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
834 }
835
836 static int
837 char_context (struct dfa const *dfa, unsigned char c)
838 {
839 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
840 return CTX_NEWLINE;
841 if (unibyte_word_constituent (dfa, c))
842 return CTX_LETTER;
843 return CTX_NONE;
844 }
845
846
847
848
849
850
851 static bool
852 setbit_wc (wint_t wc, charclass *c)
853 {
854 int b = wctob (wc);
855 if (b < 0)
856 return false;
857
858 setbit (b, c);
859 return true;
860 }
861
862
863
864 static void
865 setbit_case_fold_c (int b, charclass *c)
866 {
867 int ub = toupper (b);
868 for (int i = 0; i < NOTCHAR; i++)
869 if (toupper (i) == ub)
870 setbit (i, c);
871 }
872
873
874
875
876
877
878
879 static int
880 fetch_wc (struct dfa *dfa)
881 {
882 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
883 dfa);
884 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
885 dfa->lex.ptr += nbytes;
886 dfa->lex.left -= nbytes;
887 return c;
888 }
889
890
891
892 static int
893 bracket_fetch_wc (struct dfa *dfa)
894 {
895 if (! dfa->lex.left)
896 dfaerror (_("unbalanced ["));
897 return fetch_wc (dfa);
898 }
899
900 typedef int predicate (int);
901
902
903
904
905
906 struct dfa_ctype
907 {
908 const char *name;
909 predicate *func;
910 bool single_byte_only;
911 };
912
913 static const struct dfa_ctype prednames[] = {
914 {"alpha", isalpha, false},
915 {"upper", isupper, false},
916 {"lower", islower, false},
917 {"digit", isdigit, true},
918 {"xdigit", isxdigit, false},
919 {"space", isspace, false},
920 {"punct", ispunct, false},
921 {"alnum", isalnum, false},
922 {"print", isprint, false},
923 {"graph", isgraph, false},
924 {"cntrl", iscntrl, false},
925 {"blank", isblank, false},
926 {NULL, NULL, false}
927 };
928
929 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
930 find_pred (const char *str)
931 {
932 for (int i = 0; prednames[i].name; i++)
933 if (streq (str, prednames[i].name))
934 return &prednames[i];
935 return NULL;
936 }
937
938
939
940 static token
941 parse_bracket_exp (struct dfa *dfa)
942 {
943
944
945 bool known_bracket_exp = true;
946
947
948
949
950
951
952 int colon_warning_state;
953
954 dfa->lex.brack.nchars = 0;
955 charclass ccl;
956 zeroset (&ccl);
957 int c = bracket_fetch_wc (dfa);
958 bool invert = c == '^';
959 if (invert)
960 {
961 c = bracket_fetch_wc (dfa);
962 known_bracket_exp = dfa->localeinfo.simple;
963 }
964 wint_t wc = dfa->lex.wctok;
965 int c1;
966 wint_t wc1;
967 colon_warning_state = (c == ':');
968 do
969 {
970 c1 = NOTCHAR;
971 colon_warning_state &= ~2;
972
973
974
975
976
977 if (c == '[')
978 {
979 c1 = bracket_fetch_wc (dfa);
980 wc1 = dfa->lex.wctok;
981
982 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
983 || c1 == '.' || c1 == '=')
984 {
985 enum { MAX_BRACKET_STRING_LEN = 32 };
986 char str[MAX_BRACKET_STRING_LEN + 1];
987 int len = 0;
988 for (;;)
989 {
990 c = bracket_fetch_wc (dfa);
991 if (dfa->lex.left == 0
992 || (c == c1 && dfa->lex.ptr[0] == ']'))
993 break;
994 if (len < MAX_BRACKET_STRING_LEN)
995 str[len++] = c;
996 else
997
998 str[0] = '\0';
999 }
1000 str[len] = '\0';
1001
1002
1003 c = bracket_fetch_wc (dfa);
1004 wc = dfa->lex.wctok;
1005 if (c1 == ':')
1006
1007
1008
1009
1010 {
1011 char const *class
1012 = (dfa->syntax.case_fold && (streq (str, "upper")
1013 || streq (str, "lower"))
1014 ? "alpha" : str);
1015 const struct dfa_ctype *pred = find_pred (class);
1016 if (!pred)
1017 dfaerror (_("invalid character class"));
1018
1019 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1020 known_bracket_exp = false;
1021 else
1022 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1023 if (pred->func (c2))
1024 setbit (c2, &ccl);
1025 }
1026 else
1027 known_bracket_exp = false;
1028
1029 colon_warning_state |= 8;
1030
1031
1032 c1 = bracket_fetch_wc (dfa);
1033 wc1 = dfa->lex.wctok;
1034 continue;
1035 }
1036
1037
1038
1039 }
1040
1041 if (c == '\\'
1042 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1043 {
1044 c = bracket_fetch_wc (dfa);
1045 wc = dfa->lex.wctok;
1046 }
1047
1048 if (c1 == NOTCHAR)
1049 {
1050 c1 = bracket_fetch_wc (dfa);
1051 wc1 = dfa->lex.wctok;
1052 }
1053
1054 if (c1 == '-')
1055
1056 {
1057 int c2 = bracket_fetch_wc (dfa);
1058 wint_t wc2 = dfa->lex.wctok;
1059
1060
1061
1062
1063 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1064 {
1065 known_bracket_exp = false;
1066 c2 = ']';
1067 }
1068
1069 if (c2 == ']')
1070 {
1071
1072
1073 dfa->lex.ptr--;
1074 dfa->lex.left++;
1075 }
1076 else
1077 {
1078 if (c2 == '\\' && (dfa->syntax.syntax_bits
1079 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1080 {
1081 c2 = bracket_fetch_wc (dfa);
1082 wc2 = dfa->lex.wctok;
1083 }
1084
1085 colon_warning_state |= 8;
1086 c1 = bracket_fetch_wc (dfa);
1087 wc1 = dfa->lex.wctok;
1088
1089
1090 if (wc != wc2 || wc == WEOF)
1091 {
1092 if (dfa->localeinfo.simple
1093 || (isasciidigit (c) & isasciidigit (c2)))
1094 {
1095 for (int ci = c; ci <= c2; ci++)
1096 if (dfa->syntax.case_fold && isalpha (ci))
1097 setbit_case_fold_c (ci, &ccl);
1098 else
1099 setbit (ci, &ccl);
1100 }
1101 else
1102 known_bracket_exp = false;
1103
1104 continue;
1105 }
1106 }
1107 }
1108
1109 colon_warning_state |= (c == ':') ? 2 : 4;
1110
1111 if (!dfa->localeinfo.multibyte)
1112 {
1113 if (dfa->syntax.case_fold && isalpha (c))
1114 setbit_case_fold_c (c, &ccl);
1115 else
1116 setbit (c, &ccl);
1117 continue;
1118 }
1119
1120 if (wc == WEOF)
1121 known_bracket_exp = false;
1122 else
1123 {
1124 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1125 int n = (dfa->syntax.case_fold
1126 ? case_folded_counterparts (wc, folded + 1) + 1
1127 : 1);
1128 folded[0] = wc;
1129 for (int i = 0; i < n; i++)
1130 if (!setbit_wc (folded[i], &ccl))
1131 {
1132 dfa->lex.brack.chars
1133 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1134 &dfa->lex.brack.nchars_alloc, -1,
1135 sizeof *dfa->lex.brack.chars);
1136 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1137 }
1138 }
1139 }
1140 while ((wc = wc1, (c = c1) != ']'));
1141
1142 if (colon_warning_state == 7)
1143 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1144
1145 if (! known_bracket_exp)
1146 return BACKREF;
1147
1148 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1149 {
1150 dfa->lex.brack.invert = invert;
1151 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1152 return MBCSET;
1153 }
1154
1155 if (invert)
1156 {
1157 notset (&ccl);
1158 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1159 clrbit ('\n', &ccl);
1160 }
1161
1162 return CSET + charclass_index (dfa, &ccl);
1163 }
1164
1165 struct lexptr
1166 {
1167 char const *ptr;
1168 idx_t left;
1169 };
1170
1171 static void
1172 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1173 {
1174 ls->ptr = dfa->lex.ptr;
1175 ls->left = dfa->lex.left;
1176 dfa->lex.ptr = s;
1177 dfa->lex.left = strlen (s);
1178 }
1179
1180 static void
1181 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1182 {
1183 dfa->lex.ptr = ls->ptr;
1184 dfa->lex.left = ls->left;
1185 }
1186
1187 static token
1188 lex (struct dfa *dfa)
1189 {
1190 bool backslash = false;
1191
1192
1193
1194
1195
1196
1197
1198 for (int i = 0; i < 2; ++i)
1199 {
1200 if (! dfa->lex.left)
1201 return dfa->lex.lasttok = END;
1202 int c = fetch_wc (dfa);
1203
1204 switch (c)
1205 {
1206 case '\\':
1207 if (backslash)
1208 goto normal_char;
1209 if (dfa->lex.left == 0)
1210 dfaerror (_("unfinished \\ escape"));
1211 backslash = true;
1212 break;
1213
1214 case '^':
1215 if (backslash)
1216 goto normal_char;
1217 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1218 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1219 || dfa->lex.lasttok == OR)
1220 return dfa->lex.lasttok = BEGLINE;
1221 goto normal_char;
1222
1223 case '$':
1224 if (backslash)
1225 goto normal_char;
1226 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1227 || dfa->lex.left == 0
1228 || ((dfa->lex.left
1229 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1230 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1231 & (dfa->lex.ptr[0] == '\\')]
1232 == ')'))
1233 || ((dfa->lex.left
1234 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1235 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1236 & (dfa->lex.ptr[0] == '\\')]
1237 == '|'))
1238 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1239 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1240 return dfa->lex.lasttok = ENDLINE;
1241 goto normal_char;
1242
1243 case '1':
1244 case '2':
1245 case '3':
1246 case '4':
1247 case '5':
1248 case '6':
1249 case '7':
1250 case '8':
1251 case '9':
1252 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1253 {
1254 dfa->lex.laststart = false;
1255 return dfa->lex.lasttok = BACKREF;
1256 }
1257 goto normal_char;
1258
1259 case '`':
1260 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1261 {
1262
1263 return dfa->lex.lasttok = BEGLINE;
1264 }
1265 goto normal_char;
1266
1267 case '\'':
1268 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1269 {
1270
1271 return dfa->lex.lasttok = ENDLINE;
1272 }
1273 goto normal_char;
1274
1275 case '<':
1276 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1277 return dfa->lex.lasttok = BEGWORD;
1278 goto normal_char;
1279
1280 case '>':
1281 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1282 return dfa->lex.lasttok = ENDWORD;
1283 goto normal_char;
1284
1285 case 'b':
1286 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1287 return dfa->lex.lasttok = LIMWORD;
1288 goto normal_char;
1289
1290 case 'B':
1291 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1292 return dfa->lex.lasttok = NOTLIMWORD;
1293 goto normal_char;
1294
1295 case '?':
1296 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1297 goto normal_char;
1298 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1299 goto normal_char;
1300 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1301 && dfa->lex.laststart)
1302 goto normal_char;
1303 return dfa->lex.lasttok = QMARK;
1304
1305 case '*':
1306 if (backslash)
1307 goto normal_char;
1308 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1309 && dfa->lex.laststart)
1310 goto normal_char;
1311 return dfa->lex.lasttok = STAR;
1312
1313 case '+':
1314 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1315 goto normal_char;
1316 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1317 goto normal_char;
1318 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1319 && dfa->lex.laststart)
1320 goto normal_char;
1321 return dfa->lex.lasttok = PLUS;
1322
1323 case '{':
1324 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1325 goto normal_char;
1326 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1327 goto normal_char;
1328 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1329 && dfa->lex.laststart)
1330 goto normal_char;
1331
1332
1333
1334
1335
1336
1337
1338 {
1339 char const *p = dfa->lex.ptr;
1340 char const *lim = p + dfa->lex.left;
1341 dfa->lex.minrep = dfa->lex.maxrep = -1;
1342 for (; p != lim && isasciidigit (*p); p++)
1343 dfa->lex.minrep = (dfa->lex.minrep < 0
1344 ? *p - '0'
1345 : MIN (RE_DUP_MAX + 1,
1346 dfa->lex.minrep * 10 + *p - '0'));
1347 if (p != lim)
1348 {
1349 if (*p != ',')
1350 dfa->lex.maxrep = dfa->lex.minrep;
1351 else
1352 {
1353 if (dfa->lex.minrep < 0)
1354 dfa->lex.minrep = 0;
1355 while (++p != lim && isasciidigit (*p))
1356 dfa->lex.maxrep
1357 = (dfa->lex.maxrep < 0
1358 ? *p - '0'
1359 : MIN (RE_DUP_MAX + 1,
1360 dfa->lex.maxrep * 10 + *p - '0'));
1361 }
1362 }
1363 if (! ((! backslash || (p != lim && *p++ == '\\'))
1364 && p != lim && *p++ == '}'
1365 && 0 <= dfa->lex.minrep
1366 && (dfa->lex.maxrep < 0
1367 || dfa->lex.minrep <= dfa->lex.maxrep)))
1368 {
1369 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1370 goto normal_char;
1371 dfaerror (_("invalid content of \\{\\}"));
1372 }
1373 if (RE_DUP_MAX < dfa->lex.maxrep)
1374 dfaerror (_("regular expression too big"));
1375 dfa->lex.ptr = p;
1376 dfa->lex.left = lim - p;
1377 }
1378 dfa->lex.laststart = false;
1379 return dfa->lex.lasttok = REPMN;
1380
1381 case '|':
1382 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1383 goto normal_char;
1384 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1385 goto normal_char;
1386 dfa->lex.laststart = true;
1387 return dfa->lex.lasttok = OR;
1388
1389 case '\n':
1390 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1391 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1392 goto normal_char;
1393 dfa->lex.laststart = true;
1394 return dfa->lex.lasttok = OR;
1395
1396 case '(':
1397 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1398 goto normal_char;
1399 dfa->lex.parens++;
1400 dfa->lex.laststart = true;
1401 return dfa->lex.lasttok = LPAREN;
1402
1403 case ')':
1404 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1405 goto normal_char;
1406 if (dfa->lex.parens == 0
1407 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1408 goto normal_char;
1409 dfa->lex.parens--;
1410 dfa->lex.laststart = false;
1411 return dfa->lex.lasttok = RPAREN;
1412
1413 case '.':
1414 if (backslash)
1415 goto normal_char;
1416 if (dfa->canychar < 0)
1417 {
1418 charclass ccl;
1419 fillset (&ccl);
1420 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1421 clrbit ('\n', &ccl);
1422 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1423 clrbit ('\0', &ccl);
1424 if (dfa->localeinfo.multibyte)
1425 for (int c2 = 0; c2 < NOTCHAR; c2++)
1426 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1427 clrbit (c2, &ccl);
1428 dfa->canychar = charclass_index (dfa, &ccl);
1429 }
1430 dfa->lex.laststart = false;
1431 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1432 ? ANYCHAR
1433 : CSET + dfa->canychar);
1434
1435 case 's':
1436 case 'S':
1437 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1438 goto normal_char;
1439 if (!dfa->localeinfo.multibyte)
1440 {
1441 charclass ccl;
1442 zeroset (&ccl);
1443 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1444 if (isspace (c2))
1445 setbit (c2, &ccl);
1446 if (c == 'S')
1447 notset (&ccl);
1448 dfa->lex.laststart = false;
1449 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1450 }
1451
1452
1453
1454
1455
1456
1457
1458 {
1459 struct lexptr ls;
1460 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1461 dfa->lex.lasttok = parse_bracket_exp (dfa);
1462 pop_lex_state (dfa, &ls);
1463 }
1464
1465 dfa->lex.laststart = false;
1466 return dfa->lex.lasttok;
1467
1468 case 'w':
1469 case 'W':
1470 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1471 goto normal_char;
1472
1473 if (!dfa->localeinfo.multibyte)
1474 {
1475 charclass ccl;
1476 zeroset (&ccl);
1477 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1478 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1479 setbit (c2, &ccl);
1480 if (c == 'W')
1481 notset (&ccl);
1482 dfa->lex.laststart = false;
1483 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1484 }
1485
1486
1487
1488
1489
1490
1491
1492 {
1493 struct lexptr ls;
1494 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1495 dfa->lex.lasttok = parse_bracket_exp (dfa);
1496 pop_lex_state (dfa, &ls);
1497 }
1498
1499 dfa->lex.laststart = false;
1500 return dfa->lex.lasttok;
1501
1502 case '[':
1503 if (backslash)
1504 goto normal_char;
1505 dfa->lex.laststart = false;
1506 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1507
1508 default:
1509 normal_char:
1510 dfa->lex.laststart = false;
1511
1512
1513 if (dfa->localeinfo.multibyte)
1514 return dfa->lex.lasttok = WCHAR;
1515
1516 if (dfa->syntax.case_fold && isalpha (c))
1517 {
1518 charclass ccl;
1519 zeroset (&ccl);
1520 setbit_case_fold_c (c, &ccl);
1521 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1522 }
1523
1524 return dfa->lex.lasttok = c;
1525 }
1526 }
1527
1528
1529
1530 abort ();
1531 return END;
1532 }
1533
1534 static void
1535 addtok_mb (struct dfa *dfa, token t, char mbprop)
1536 {
1537 if (dfa->talloc == dfa->tindex)
1538 {
1539 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1540 sizeof *dfa->tokens);
1541 if (dfa->localeinfo.multibyte)
1542 dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1543 sizeof *dfa->multibyte_prop);
1544 }
1545 if (dfa->localeinfo.multibyte)
1546 dfa->multibyte_prop[dfa->tindex] = mbprop;
1547 dfa->tokens[dfa->tindex++] = t;
1548
1549 switch (t)
1550 {
1551 case QMARK:
1552 case STAR:
1553 case PLUS:
1554 break;
1555
1556 case CAT:
1557 case OR:
1558 dfa->parse.depth--;
1559 break;
1560
1561 case EMPTY:
1562 dfa->epsilon = true;
1563 goto increment_depth;
1564
1565 case BACKREF:
1566 dfa->fast = false;
1567 goto increment_nleaves;
1568
1569 case BEGLINE:
1570 case ENDLINE:
1571 case BEGWORD:
1572 case ENDWORD:
1573 case LIMWORD:
1574 case NOTLIMWORD:
1575 dfa->epsilon = true;
1576 FALLTHROUGH;
1577 default:
1578 increment_nleaves:
1579 dfa->nleaves++;
1580 increment_depth:
1581 dfa->parse.depth++;
1582 if (dfa->depth < dfa->parse.depth)
1583 dfa->depth = dfa->parse.depth;
1584 break;
1585 }
1586 }
1587
1588 static void addtok_wc (struct dfa *dfa, wint_t wc);
1589
1590
1591
1592 static void
1593 addtok (struct dfa *dfa, token t)
1594 {
1595 if (dfa->localeinfo.multibyte && t == MBCSET)
1596 {
1597 bool need_or = false;
1598
1599
1600
1601 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1602 {
1603 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1604 if (need_or)
1605 addtok (dfa, OR);
1606 need_or = true;
1607 }
1608 dfa->lex.brack.nchars = 0;
1609
1610
1611
1612 if (dfa->lex.brack.cset != -1)
1613 {
1614 addtok (dfa, CSET + dfa->lex.brack.cset);
1615 if (need_or)
1616 addtok (dfa, OR);
1617 }
1618 }
1619 else
1620 {
1621 addtok_mb (dfa, t, 3);
1622 }
1623 }
1624
1625
1626
1627
1628
1629
1630
1631 static void
1632 addtok_wc (struct dfa *dfa, wint_t wc)
1633 {
1634 unsigned char buf[MB_LEN_MAX];
1635 mbstate_t s = { 0 };
1636 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1637 int buflen;
1638
1639 if (stored_bytes != (size_t) -1)
1640 buflen = stored_bytes;
1641 else
1642 {
1643
1644
1645 buflen = 1;
1646 buf[0] = 0;
1647 }
1648
1649 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1650 for (int i = 1; i < buflen; i++)
1651 {
1652 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1653 addtok (dfa, CAT);
1654 }
1655 }
1656
1657 static void
1658 add_utf8_anychar (struct dfa *dfa)
1659 {
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681 enum { A, B, C, E, F, H, J, K, M };
1682
1683
1684 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1685
1686 static charclass const utf8_classes[] = {
1687
1688 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1689
1690
1691 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1692
1693
1694 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1695
1696
1697
1698
1699 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1700
1701
1702 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1703
1704
1705
1706
1707 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1708
1709
1710
1711
1712 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1713
1714
1715 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1716
1717
1718
1719
1720 CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1721 };
1722
1723
1724 if (dfa->utf8_anychar_classes[0] == 0)
1725 {
1726 charclass c = utf8_classes[0];
1727 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1728 clrbit ('\n', &c);
1729 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1730 clrbit ('\0', &c);
1731 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1732
1733 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1734 dfa->utf8_anychar_classes[i]
1735 = CSET + charclass_index (dfa, &utf8_classes[i]);
1736 }
1737
1738
1739
1740
1741
1742 addtok (dfa, dfa->utf8_anychar_classes[A]);
1743 addtok (dfa, dfa->utf8_anychar_classes[B]);
1744 addtok (dfa, D_token);
1745 addtok (dfa, dfa->utf8_anychar_classes[E]);
1746 addtok (dfa, CAT);
1747 addtok (dfa, OR);
1748 addtok (dfa, G_token);
1749 addtok (dfa, dfa->utf8_anychar_classes[H]);
1750 addtok (dfa, CAT);
1751 addtok (dfa, OR);
1752 addtok (dfa, dfa->utf8_anychar_classes[F]);
1753 addtok (dfa, I_token);
1754 addtok (dfa, dfa->utf8_anychar_classes[J]);
1755 addtok (dfa, CAT);
1756 addtok (dfa, OR);
1757 addtok (dfa, L_token);
1758 addtok (dfa, dfa->utf8_anychar_classes[M]);
1759 addtok (dfa, CAT);
1760 addtok (dfa, OR);
1761 addtok (dfa, dfa->utf8_anychar_classes[K]);
1762 for (int i = 0; i < 3; i++)
1763 {
1764 addtok (dfa, dfa->utf8_anychar_classes[C]);
1765 addtok (dfa, CAT);
1766 addtok (dfa, OR);
1767 }
1768 }
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805 static void
1806 atom (struct dfa *dfa)
1807 {
1808 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1809 || dfa->parse.tok >= CSET
1810 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1811 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1812 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1813 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1814 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1815 {
1816 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1817 {
1818
1819
1820
1821
1822
1823
1824
1825 add_utf8_anychar (dfa);
1826 }
1827 else
1828 addtok (dfa, dfa->parse.tok);
1829 dfa->parse.tok = lex (dfa);
1830 }
1831 else if (dfa->parse.tok == WCHAR)
1832 {
1833 if (dfa->lex.wctok == WEOF)
1834 addtok (dfa, BACKREF);
1835 else
1836 {
1837 addtok_wc (dfa, dfa->lex.wctok);
1838
1839 if (dfa->syntax.case_fold)
1840 {
1841 wchar_t folded[CASE_FOLDED_BUFSIZE];
1842 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1843 for (int i = 0; i < n; i++)
1844 {
1845 addtok_wc (dfa, folded[i]);
1846 addtok (dfa, OR);
1847 }
1848 }
1849 }
1850
1851 dfa->parse.tok = lex (dfa);
1852 }
1853 else if (dfa->parse.tok == LPAREN)
1854 {
1855 dfa->parse.tok = lex (dfa);
1856 regexp (dfa);
1857 if (dfa->parse.tok != RPAREN)
1858 dfaerror (_("unbalanced ("));
1859 dfa->parse.tok = lex (dfa);
1860 }
1861 else
1862 addtok (dfa, EMPTY);
1863 }
1864
1865
1866 static idx_t _GL_ATTRIBUTE_PURE
1867 nsubtoks (struct dfa const *dfa, idx_t tindex)
1868 {
1869 switch (dfa->tokens[tindex - 1])
1870 {
1871 default:
1872 return 1;
1873 case QMARK:
1874 case STAR:
1875 case PLUS:
1876 return 1 + nsubtoks (dfa, tindex - 1);
1877 case CAT:
1878 case OR:
1879 {
1880 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1881 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1882 }
1883 }
1884 }
1885
1886
1887 static void
1888 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1889 {
1890 if (dfa->localeinfo.multibyte)
1891 for (idx_t i = 0; i < ntokens; i++)
1892 addtok_mb (dfa, dfa->tokens[tindex + i],
1893 dfa->multibyte_prop[tindex + i]);
1894 else
1895 for (idx_t i = 0; i < ntokens; i++)
1896 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1897 }
1898
1899 static void
1900 closure (struct dfa *dfa)
1901 {
1902 atom (dfa);
1903 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1904 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1905 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1906 {
1907 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1908 idx_t tindex = dfa->tindex - ntokens;
1909 if (dfa->lex.maxrep < 0)
1910 addtok (dfa, PLUS);
1911 if (dfa->lex.minrep == 0)
1912 addtok (dfa, QMARK);
1913 int i;
1914 for (i = 1; i < dfa->lex.minrep; i++)
1915 {
1916 copytoks (dfa, tindex, ntokens);
1917 addtok (dfa, CAT);
1918 }
1919 for (; i < dfa->lex.maxrep; i++)
1920 {
1921 copytoks (dfa, tindex, ntokens);
1922 addtok (dfa, QMARK);
1923 addtok (dfa, CAT);
1924 }
1925 dfa->parse.tok = lex (dfa);
1926 }
1927 else if (dfa->parse.tok == REPMN)
1928 {
1929 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1930 dfa->parse.tok = lex (dfa);
1931 closure (dfa);
1932 }
1933 else
1934 {
1935 addtok (dfa, dfa->parse.tok);
1936 dfa->parse.tok = lex (dfa);
1937 }
1938 }
1939
1940 static void
1941 branch (struct dfa* dfa)
1942 {
1943 closure (dfa);
1944 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1945 && dfa->parse.tok >= 0)
1946 {
1947 closure (dfa);
1948 addtok (dfa, CAT);
1949 }
1950 }
1951
1952 static void
1953 regexp (struct dfa *dfa)
1954 {
1955 branch (dfa);
1956 while (dfa->parse.tok == OR)
1957 {
1958 dfa->parse.tok = lex (dfa);
1959 branch (dfa);
1960 addtok (dfa, OR);
1961 }
1962 }
1963
1964
1965
1966 void
1967 dfaparse (char const *s, idx_t len, struct dfa *d)
1968 {
1969 d->lex.ptr = s;
1970 d->lex.left = len;
1971 d->lex.lasttok = END;
1972 d->lex.laststart = true;
1973
1974 if (!d->syntax.syntax_bits_set)
1975 dfaerror (_("no syntax specified"));
1976
1977 if (!d->nregexps)
1978 addtok (d, BEG);
1979
1980 d->parse.tok = lex (d);
1981 d->parse.depth = d->depth;
1982
1983 regexp (d);
1984
1985 if (d->parse.tok != END)
1986 dfaerror (_("unbalanced )"));
1987
1988 addtok (d, END - d->nregexps);
1989 addtok (d, CAT);
1990
1991 if (d->nregexps)
1992 addtok (d, OR);
1993
1994 ++d->nregexps;
1995 }
1996
1997
1998
1999
2000 static void
2001 copy (position_set const *src, position_set *dst)
2002 {
2003 if (dst->alloc < src->nelem)
2004 {
2005 free (dst->elems);
2006 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2007 sizeof *dst->elems);
2008 }
2009 dst->nelem = src->nelem;
2010 if (src->nelem != 0)
2011 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2012 }
2013
2014 static void
2015 alloc_position_set (position_set *s, idx_t size)
2016 {
2017 s->elems = xnmalloc (size, sizeof *s->elems);
2018 s->alloc = size;
2019 s->nelem = 0;
2020 }
2021
2022
2023
2024
2025
2026 static void
2027 insert (position p, position_set *s)
2028 {
2029 idx_t count = s->nelem;
2030 idx_t lo = 0, hi = count;
2031 while (lo < hi)
2032 {
2033 idx_t mid = (lo + hi) >> 1;
2034 if (s->elems[mid].index < p.index)
2035 lo = mid + 1;
2036 else if (s->elems[mid].index == p.index)
2037 {
2038 s->elems[mid].constraint |= p.constraint;
2039 return;
2040 }
2041 else
2042 hi = mid;
2043 }
2044
2045 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2046 for (idx_t i = count; i > lo; i--)
2047 s->elems[i] = s->elems[i - 1];
2048 s->elems[lo] = p;
2049 ++s->nelem;
2050 }
2051
2052 static void
2053 append (position p, position_set *s)
2054 {
2055 idx_t count = s->nelem;
2056 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2057 s->elems[s->nelem++] = p;
2058 }
2059
2060
2061
2062
2063 static void
2064 merge_constrained (position_set const *s1, position_set const *s2,
2065 unsigned int c2, position_set *m)
2066 {
2067 idx_t i = 0, j = 0;
2068
2069 if (m->alloc - s1->nelem < s2->nelem)
2070 {
2071 free (m->elems);
2072 m->alloc = s1->nelem;
2073 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2074 }
2075 m->nelem = 0;
2076 while (i < s1->nelem || j < s2->nelem)
2077 if (! (j < s2->nelem)
2078 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2079 {
2080 unsigned int c = ((i < s1->nelem && j < s2->nelem
2081 && s1->elems[i].index == s2->elems[j].index)
2082 ? s2->elems[j++].constraint & c2
2083 : 0);
2084 m->elems[m->nelem].index = s1->elems[i].index;
2085 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2086 }
2087 else
2088 {
2089 if (s2->elems[j].constraint & c2)
2090 {
2091 m->elems[m->nelem].index = s2->elems[j].index;
2092 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2093 }
2094 j++;
2095 }
2096 }
2097
2098
2099
2100 static void
2101 merge (position_set const *s1, position_set const *s2, position_set *m)
2102 {
2103 merge_constrained (s1, s2, -1, m);
2104 }
2105
2106
2107
2108 static void
2109 merge2 (position_set *dst, position_set const *src, position_set *m)
2110 {
2111 if (src->nelem < 4)
2112 {
2113 for (idx_t i = 0; i < src->nelem; i++)
2114 insert (src->elems[i], dst);
2115 }
2116 else
2117 {
2118 merge (src, dst, m);
2119 copy (m, dst);
2120 }
2121 }
2122
2123
2124
2125 static unsigned int
2126 delete (idx_t del, position_set *s)
2127 {
2128 idx_t count = s->nelem;
2129 idx_t lo = 0, hi = count;
2130 while (lo < hi)
2131 {
2132 idx_t mid = (lo + hi) >> 1;
2133 if (s->elems[mid].index < del)
2134 lo = mid + 1;
2135 else if (s->elems[mid].index == del)
2136 {
2137 unsigned int c = s->elems[mid].constraint;
2138 idx_t i;
2139 for (i = mid; i + 1 < count; i++)
2140 s->elems[i] = s->elems[i + 1];
2141 s->nelem = i;
2142 return c;
2143 }
2144 else
2145 hi = mid;
2146 }
2147 return 0;
2148 }
2149
2150
2151 static void
2152 replace (position_set *dst, idx_t del, position_set *add,
2153 unsigned int constraint, position_set *tmp)
2154 {
2155 unsigned int c = delete (del, dst) & constraint;
2156
2157 if (c)
2158 {
2159 copy (dst, tmp);
2160 merge_constrained (tmp, add, c, dst);
2161 }
2162 }
2163
2164
2165
2166
2167 static state_num
2168 state_index (struct dfa *d, position_set const *s, int context)
2169 {
2170 size_t hash = 0;
2171 int constraint = 0;
2172 state_num i;
2173
2174 for (i = 0; i < s->nelem; ++i)
2175 {
2176 idx_t ind = s->elems[i].index;
2177 hash ^= ind + s->elems[i].constraint;
2178 }
2179
2180
2181 for (i = 0; i < d->sindex; ++i)
2182 {
2183 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2184 || context != d->states[i].context)
2185 continue;
2186 state_num j;
2187 for (j = 0; j < s->nelem; ++j)
2188 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2189 || s->elems[j].index != d->states[i].elems.elems[j].index)
2190 break;
2191 if (j == s->nelem)
2192 return i;
2193 }
2194
2195 #ifdef DEBUG
2196 fprintf (stderr, "new state %td\n nextpos:", i);
2197 for (state_num j = 0; j < s->nelem; j++)
2198 {
2199 fprintf (stderr, " %td:", s->elems[j].index);
2200 prtok (d->tokens[s->elems[j].index]);
2201 }
2202 fprintf (stderr, "\n context:");
2203 if (context ^ CTX_ANY)
2204 {
2205 if (context & CTX_NONE)
2206 fprintf (stderr, " CTX_NONE");
2207 if (context & CTX_LETTER)
2208 fprintf (stderr, " CTX_LETTER");
2209 if (context & CTX_NEWLINE)
2210 fprintf (stderr, " CTX_NEWLINE");
2211 }
2212 else
2213 fprintf (stderr, " CTX_ANY");
2214 fprintf (stderr, "\n");
2215 #endif
2216
2217 for (state_num j = 0; j < s->nelem; j++)
2218 {
2219 int c = d->constraints[s->elems[j].index];
2220
2221 if (c != 0)
2222 {
2223 if (succeeds_in_context (c, context, CTX_ANY))
2224 constraint |= c;
2225 }
2226 else if (d->tokens[s->elems[j].index] == BACKREF)
2227 constraint = NO_CONSTRAINT;
2228 }
2229
2230
2231
2232 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2233 sizeof *d->states);
2234 d->states[i].hash = hash;
2235 alloc_position_set (&d->states[i].elems, s->nelem);
2236 copy (s, &d->states[i].elems);
2237 d->states[i].context = context;
2238 d->states[i].constraint = constraint;
2239 d->states[i].mbps.nelem = 0;
2240 d->states[i].mbps.elems = NULL;
2241 d->states[i].mb_trindex = -1;
2242
2243 ++d->sindex;
2244
2245 return i;
2246 }
2247
2248
2249
2250
2251
2252
2253
2254 static void
2255 epsclosure (struct dfa const *d, position_set *backward)
2256 {
2257 position_set tmp;
2258 alloc_position_set (&tmp, d->nleaves);
2259 for (idx_t i = 0; i < d->tindex; i++)
2260 if (0 < d->follows[i].nelem)
2261 {
2262 unsigned int constraint;
2263 switch (d->tokens[i])
2264 {
2265 default:
2266 continue;
2267
2268 case BEGLINE:
2269 constraint = BEGLINE_CONSTRAINT;
2270 break;
2271 case ENDLINE:
2272 constraint = ENDLINE_CONSTRAINT;
2273 break;
2274 case BEGWORD:
2275 constraint = BEGWORD_CONSTRAINT;
2276 break;
2277 case ENDWORD:
2278 constraint = ENDWORD_CONSTRAINT;
2279 break;
2280 case LIMWORD:
2281 constraint = LIMWORD_CONSTRAINT;
2282 break;
2283 case NOTLIMWORD:
2284 constraint = NOTLIMWORD_CONSTRAINT;
2285 break;
2286 case EMPTY:
2287 constraint = NO_CONSTRAINT;
2288 break;
2289 }
2290
2291 delete (i, &d->follows[i]);
2292
2293 for (idx_t j = 0; j < backward[i].nelem; j++)
2294 replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2295 constraint, &tmp);
2296 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2297 replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2298 NO_CONSTRAINT, &tmp);
2299 }
2300 free (tmp.elems);
2301 }
2302
2303
2304
2305
2306 static int
2307 charclass_context (struct dfa const *dfa, charclass const *c)
2308 {
2309 int context = 0;
2310
2311 for (int j = 0; j < CHARCLASS_WORDS; j++)
2312 {
2313 if (c->w[j] & dfa->syntax.newline.w[j])
2314 context |= CTX_NEWLINE;
2315 if (c->w[j] & dfa->syntax.letters.w[j])
2316 context |= CTX_LETTER;
2317 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2318 context |= CTX_NONE;
2319 }
2320
2321 return context;
2322 }
2323
2324
2325
2326
2327
2328
2329
2330 static int _GL_ATTRIBUTE_PURE
2331 state_separate_contexts (struct dfa *d, position_set const *s)
2332 {
2333 int separate_contexts = 0;
2334
2335 for (idx_t j = 0; j < s->nelem; j++)
2336 separate_contexts |= d->separates[s->elems[j].index];
2337
2338 return separate_contexts;
2339 }
2340
2341 enum
2342 {
2343
2344 OPT_REPEAT = (1 << 0),
2345
2346
2347
2348 OPT_LPAREN = (1 << 1),
2349
2350
2351 OPT_RPAREN = (1 << 2),
2352
2353
2354
2355 OPT_WALKED = (1 << 3),
2356
2357
2358 OPT_QUEUED = (1 << 4)
2359 };
2360
2361 static void
2362 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2363 position_set *merged)
2364 {
2365 position_set *follows = d->follows;
2366 idx_t nelem = 0;
2367
2368 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2369 {
2370 idx_t sindex = follows[tindex].elems[i].index;
2371
2372
2373 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2374 if (iconstraint == 0)
2375 continue;
2376
2377 if (d->tokens[follows[tindex].elems[i].index] <= END)
2378 {
2379 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2380 continue;
2381 }
2382
2383 if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2384 {
2385 idx_t j;
2386
2387 for (j = 0; j < nelem; j++)
2388 {
2389 idx_t dindex = follows[tindex].elems[j].index;
2390
2391 if (dindex == tindex)
2392 continue;
2393
2394 if (follows[tindex].elems[j].constraint != iconstraint)
2395 continue;
2396
2397 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2398 continue;
2399
2400 if (d->tokens[sindex] != d->tokens[dindex])
2401 continue;
2402
2403 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2404 continue;
2405
2406 if (flags[sindex] & OPT_REPEAT)
2407 delete (sindex, &follows[sindex]);
2408
2409 merge2 (&follows[dindex], &follows[sindex], merged);
2410
2411 break;
2412 }
2413
2414 if (j < nelem)
2415 continue;
2416 }
2417
2418 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2419 flags[sindex] |= OPT_QUEUED;
2420 }
2421
2422 follows[tindex].nelem = nelem;
2423 }
2424
2425 static int
2426 compare (const void *a, const void *b)
2427 {
2428 position const *p = a, *q = b;
2429 return (p->index > q->index) - (p->index < q->index);
2430 }
2431
2432 static void
2433 reorder_tokens (struct dfa *d)
2434 {
2435 idx_t nleaves = 0;
2436 ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2437 map[0] = nleaves++;
2438 for (idx_t i = 1; i < d->tindex; i++)
2439 map[i] = -1;
2440
2441 token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2442 position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2443 int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2444 char *multibyte_prop = (d->localeinfo.multibyte
2445 ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2446 : NULL);
2447
2448 for (idx_t i = 0; i < d->tindex; i++)
2449 {
2450 if (map[i] < 0)
2451 {
2452 free (d->follows[i].elems);
2453 d->follows[i].elems = NULL;
2454 d->follows[i].nelem = 0;
2455 continue;
2456 }
2457
2458 tokens[map[i]] = d->tokens[i];
2459 follows[map[i]] = d->follows[i];
2460 constraints[map[i]] = d->constraints[i];
2461
2462 if (multibyte_prop != NULL)
2463 multibyte_prop[map[i]] = d->multibyte_prop[i];
2464
2465 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2466 {
2467 if (map[d->follows[i].elems[j].index] == -1)
2468 map[d->follows[i].elems[j].index] = nleaves++;
2469
2470 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2471 }
2472
2473 qsort (d->follows[i].elems, d->follows[i].nelem,
2474 sizeof *d->follows[i].elems, compare);
2475 }
2476
2477 for (idx_t i = 0; i < nleaves; i++)
2478 {
2479 d->tokens[i] = tokens[i];
2480 d->follows[i] = follows[i];
2481 d->constraints[i] = constraints[i];
2482
2483 if (multibyte_prop != NULL)
2484 d->multibyte_prop[i] = multibyte_prop[i];
2485 }
2486
2487 d->tindex = d->nleaves = nleaves;
2488
2489 free (tokens);
2490 free (follows);
2491 free (constraints);
2492 free (multibyte_prop);
2493 free (map);
2494 }
2495
2496 static void
2497 dfaoptimize (struct dfa *d)
2498 {
2499 char *flags = xizalloc (d->tindex);
2500
2501 for (idx_t i = 0; i < d->tindex; i++)
2502 {
2503 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2504 {
2505 if (d->follows[i].elems[j].index == i)
2506 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2507 else if (d->follows[i].elems[j].index < i)
2508 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2509 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2510 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2511 else
2512 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2513 }
2514 }
2515
2516 flags[0] |= OPT_QUEUED;
2517
2518 position_set merged0;
2519 position_set *merged = &merged0;
2520 alloc_position_set (merged, d->nleaves);
2521
2522 d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2523
2524 for (idx_t i = 0; i < d->tindex; i++)
2525 if (flags[i] & OPT_QUEUED)
2526 merge_nfa_state (d, i, flags, merged);
2527
2528 reorder_tokens (d);
2529
2530 free (merged->elems);
2531 free (flags);
2532 }
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586 static void
2587 dfaanalyze (struct dfa *d, bool searchflag)
2588 {
2589
2590 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2591
2592 position *firstpos = posalloc;
2593 position *lastpos = firstpos + d->nleaves;
2594 position pos;
2595 position_set tmp;
2596
2597
2598 struct
2599 {
2600
2601 bool nullable;
2602
2603
2604 idx_t nfirstpos;
2605 idx_t nlastpos;
2606 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2607
2608 position_set merged;
2609
2610 addtok (d, CAT);
2611 idx_t tindex = d->tindex;
2612
2613 #ifdef DEBUG
2614 fprintf (stderr, "dfaanalyze:\n");
2615 for (idx_t i = 0; i < tindex; i++)
2616 {
2617 fprintf (stderr, " %td:", i);
2618 prtok (d->tokens[i]);
2619 }
2620 putc ('\n', stderr);
2621 #endif
2622
2623 d->searchflag = searchflag;
2624 alloc_position_set (&merged, d->nleaves);
2625 d->follows = xicalloc (tindex, sizeof *d->follows);
2626 position_set *backward
2627 = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2628
2629 for (idx_t i = 0; i < tindex; i++)
2630 {
2631 switch (d->tokens[i])
2632 {
2633 case EMPTY:
2634
2635 stk->nullable = true;
2636
2637
2638 stk->nfirstpos = stk->nlastpos = 0;
2639 stk++;
2640 break;
2641
2642 case STAR:
2643 case PLUS:
2644
2645
2646 if (d->epsilon)
2647 {
2648 tmp.elems = lastpos - stk[-1].nlastpos;
2649 tmp.nelem = stk[-1].nlastpos;
2650 for (position *p = firstpos - stk[-1].nfirstpos;
2651 p < firstpos; p++)
2652 merge2 (&backward[p->index], &tmp, &merged);
2653 }
2654
2655
2656
2657 {
2658 tmp.elems = firstpos - stk[-1].nfirstpos;
2659 tmp.nelem = stk[-1].nfirstpos;
2660 for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2661 merge2 (&d->follows[p->index], &tmp, &merged);
2662 }
2663 FALLTHROUGH;
2664 case QMARK:
2665
2666 if (d->tokens[i] != PLUS)
2667 stk[-1].nullable = true;
2668 break;
2669
2670 case CAT:
2671
2672
2673
2674 if (backward)
2675 {
2676 tmp.nelem = stk[-2].nlastpos;
2677 tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2678 for (position *p = firstpos - stk[-1].nfirstpos;
2679 p < firstpos; p++)
2680 merge2 (&backward[p->index], &tmp, &merged);
2681 }
2682
2683
2684
2685 {
2686 tmp.nelem = stk[-1].nfirstpos;
2687 tmp.elems = firstpos - stk[-1].nfirstpos;
2688 for (position *plim = lastpos - stk[-1].nlastpos,
2689 *p = plim - stk[-2].nlastpos;
2690 p < plim; p++)
2691 merge2 (&d->follows[p->index], &tmp, &merged);
2692 }
2693
2694
2695
2696 if (stk[-2].nullable)
2697 stk[-2].nfirstpos += stk[-1].nfirstpos;
2698 else
2699 firstpos -= stk[-1].nfirstpos;
2700
2701
2702
2703 if (stk[-1].nullable)
2704 stk[-2].nlastpos += stk[-1].nlastpos;
2705 else
2706 {
2707 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2708 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2709 p[j] = p[j + stk[-2].nlastpos];
2710 lastpos -= stk[-2].nlastpos;
2711 stk[-2].nlastpos = stk[-1].nlastpos;
2712 }
2713
2714
2715 stk[-2].nullable &= stk[-1].nullable;
2716 stk--;
2717 break;
2718
2719 case OR:
2720
2721 stk[-2].nfirstpos += stk[-1].nfirstpos;
2722
2723
2724 stk[-2].nlastpos += stk[-1].nlastpos;
2725
2726
2727 stk[-2].nullable |= stk[-1].nullable;
2728 stk--;
2729 break;
2730
2731 default:
2732
2733
2734
2735
2736
2737 stk->nullable = d->tokens[i] == BACKREF;
2738
2739
2740 stk->nfirstpos = stk->nlastpos = 1;
2741 stk++;
2742
2743 firstpos->index = lastpos->index = i;
2744 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2745 firstpos++, lastpos++;
2746
2747 break;
2748 }
2749 #ifdef DEBUG
2750
2751 fprintf (stderr, "node %td:", i);
2752 prtok (d->tokens[i]);
2753 putc ('\n', stderr);
2754 fprintf (stderr,
2755 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2756 fprintf (stderr, " firstpos:");
2757 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2758 {
2759 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2760 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2761 }
2762 fprintf (stderr, "\n lastpos:");
2763 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2764 {
2765 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2766 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2767 }
2768 putc ('\n', stderr);
2769 #endif
2770 }
2771
2772 if (backward)
2773 {
2774
2775
2776 epsclosure (d, backward);
2777
2778 for (idx_t i = 0; i < tindex; i++)
2779 free (backward[i].elems);
2780 free (backward);
2781 }
2782
2783 dfaoptimize (d);
2784
2785 #ifdef DEBUG
2786 for (idx_t i = 0; i < tindex; i++)
2787 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2788 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2789 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2790 {
2791 fprintf (stderr, "follows(%td:", i);
2792 prtok (d->tokens[i]);
2793 fprintf (stderr, "):");
2794 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2795 {
2796 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2797 prtok (d->tokens[d->follows[i].elems[j].index]);
2798 }
2799 putc ('\n', stderr);
2800 }
2801 #endif
2802
2803 pos.index = 0;
2804 pos.constraint = NO_CONSTRAINT;
2805
2806 alloc_position_set (&tmp, 1);
2807
2808 append (pos, &tmp);
2809
2810 d->separates = xicalloc (tindex, sizeof *d->separates);
2811
2812 for (idx_t i = 0; i < tindex; i++)
2813 {
2814 if (prev_newline_dependent (d->constraints[i]))
2815 d->separates[i] |= CTX_NEWLINE;
2816 if (prev_letter_dependent (d->constraints[i]))
2817 d->separates[i] |= CTX_LETTER;
2818
2819 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2820 {
2821 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2822 d->separates[i] |= CTX_NEWLINE;
2823 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2824 d->separates[i] |= CTX_LETTER;
2825 }
2826 }
2827
2828
2829 int separate_contexts = state_separate_contexts (d, &tmp);
2830
2831
2832 if (separate_contexts & CTX_NEWLINE)
2833 state_index (d, &tmp, CTX_NEWLINE);
2834 d->initstate_notbol = d->min_trcount
2835 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2836 if (separate_contexts & CTX_LETTER)
2837 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2838 d->min_trcount++;
2839 d->trcount = 0;
2840
2841 free (posalloc);
2842 free (stkalloc);
2843 free (merged.elems);
2844 free (tmp.elems);
2845 }
2846
2847
2848 static void
2849 realloc_trans_if_necessary (struct dfa *d)
2850 {
2851 state_num oldalloc = d->tralloc;
2852 if (oldalloc < d->sindex)
2853 {
2854 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2855 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2856 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2857 -1, sizeof *realtrans);
2858 realtrans[0] = realtrans[1] = NULL;
2859 d->trans = realtrans + 2;
2860 idx_t newalloc = d->tralloc = newalloc1 - 2;
2861 d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2862 d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2863 d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2864 if (d->localeinfo.multibyte)
2865 {
2866 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2867 realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2868 if (oldalloc == 0)
2869 realtrans[0] = realtrans[1] = NULL;
2870 d->mb_trans = realtrans + 2;
2871 }
2872 for (; oldalloc < newalloc; oldalloc++)
2873 {
2874 d->trans[oldalloc] = NULL;
2875 d->fails[oldalloc] = NULL;
2876 if (d->localeinfo.multibyte)
2877 d->mb_trans[oldalloc] = NULL;
2878 }
2879 }
2880 }
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915 static state_num
2916 build_state (state_num s, struct dfa *d, unsigned char uc)
2917 {
2918 position_set follows;
2919
2920 position_set group;
2921 position_set tmp;
2922 state_num state;
2923 state_num state_newline;
2924 state_num state_letter;
2925
2926 #ifdef DEBUG
2927 fprintf (stderr, "build state %td\n", s);
2928 #endif
2929
2930
2931 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2932 state_num *trans = *ptrans;
2933
2934 if (!trans)
2935 {
2936
2937
2938
2939
2940 if (MAX_TRCOUNT <= d->trcount)
2941 {
2942 for (state_num i = d->min_trcount; i < d->tralloc; i++)
2943 {
2944 free (d->trans[i]);
2945 free (d->fails[i]);
2946 d->trans[i] = d->fails[i] = NULL;
2947 }
2948 d->trcount = 0;
2949 }
2950
2951 d->trcount++;
2952 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2953
2954
2955
2956 for (int i = 0; i < NOTCHAR; i++)
2957 trans[i] = -2;
2958 }
2959
2960
2961 d->success[s] = 0;
2962 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2963 d->success[s] |= CTX_NEWLINE;
2964 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2965 d->success[s] |= CTX_LETTER;
2966 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2967 d->success[s] |= CTX_NONE;
2968
2969 alloc_position_set (&follows, d->nleaves);
2970
2971
2972
2973 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
2974 for (idx_t k = 0;
2975 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2976 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2977 &follows);
2978
2979
2980 alloc_position_set (&group, d->nleaves);
2981
2982
2983 charclass label;
2984 fillset (&label);
2985
2986 for (idx_t i = 0; i < follows.nelem; i++)
2987 {
2988 charclass matches;
2989 position pos = follows.elems[i];
2990 bool matched = false;
2991 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2992 {
2993 zeroset (&matches);
2994 setbit (d->tokens[pos.index], &matches);
2995 if (d->tokens[pos.index] == uc)
2996 matched = true;
2997 }
2998 else if (d->tokens[pos.index] >= CSET)
2999 {
3000 matches = d->charclasses[d->tokens[pos.index] - CSET];
3001 if (tstbit (uc, &matches))
3002 matched = true;
3003 }
3004 else if (d->tokens[pos.index] == ANYCHAR)
3005 {
3006 matches = d->charclasses[d->canychar];
3007 if (tstbit (uc, &matches))
3008 matched = true;
3009
3010
3011
3012
3013
3014
3015
3016 if (succeeds_in_context (pos.constraint, d->states[s].context,
3017 CTX_NONE))
3018 {
3019 if (d->states[s].mbps.nelem == 0)
3020 alloc_position_set (&d->states[s].mbps, 1);
3021 insert (pos, &d->states[s].mbps);
3022 }
3023 }
3024 else
3025 continue;
3026
3027
3028
3029 if (pos.constraint != NO_CONSTRAINT)
3030 {
3031 if (!succeeds_in_context (pos.constraint,
3032 d->states[s].context, CTX_NEWLINE))
3033 for (int j = 0; j < CHARCLASS_WORDS; j++)
3034 matches.w[j] &= ~d->syntax.newline.w[j];
3035 if (!succeeds_in_context (pos.constraint,
3036 d->states[s].context, CTX_LETTER))
3037 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3038 matches.w[j] &= ~d->syntax.letters.w[j];
3039 if (!succeeds_in_context (pos.constraint,
3040 d->states[s].context, CTX_NONE))
3041 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3042 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3043
3044
3045 if (emptyset (&matches))
3046 continue;
3047
3048
3049
3050
3051 if (!tstbit (uc, &matches))
3052 matched = false;
3053 }
3054
3055 #ifdef DEBUG
3056 fprintf (stderr, " nextpos %td:", pos.index);
3057 prtok (d->tokens[pos.index]);
3058 fprintf (stderr, " of");
3059 for (unsigned j = 0; j < NOTCHAR; j++)
3060 if (tstbit (j, &matches))
3061 fprintf (stderr, " 0x%02x", j);
3062 fprintf (stderr, "\n");
3063 #endif
3064
3065 if (matched)
3066 {
3067 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3068 label.w[k] &= matches.w[k];
3069 append (pos, &group);
3070 }
3071 else
3072 {
3073 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3074 label.w[k] &= ~matches.w[k];
3075 }
3076 }
3077
3078 alloc_position_set (&tmp, d->nleaves);
3079
3080 if (group.nelem > 0)
3081 {
3082
3083
3084 if (d->searchflag)
3085 {
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103 bool mergeit = !d->localeinfo.multibyte;
3104 if (!mergeit)
3105 {
3106 mergeit = true;
3107 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3108 mergeit &= d->multibyte_prop[group.elems[j].index];
3109 }
3110 if (mergeit)
3111 merge2 (&group, &d->states[0].elems, &tmp);
3112 }
3113
3114
3115
3116
3117 int possible_contexts = charclass_context (d, &label);
3118 int separate_contexts = state_separate_contexts (d, &group);
3119
3120
3121 if (possible_contexts & ~separate_contexts)
3122 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3123 else
3124 state = -1;
3125 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3126 state_newline = state_index (d, &group, CTX_NEWLINE);
3127 else
3128 state_newline = state;
3129 if (separate_contexts & possible_contexts & CTX_LETTER)
3130 state_letter = state_index (d, &group, CTX_LETTER);
3131 else
3132 state_letter = state;
3133
3134
3135 realloc_trans_if_necessary (d);
3136 }
3137
3138
3139
3140
3141 else if (d->searchflag)
3142 {
3143 state_newline = 0;
3144 state_letter = d->min_trcount - 1;
3145 state = d->initstate_notbol;
3146 }
3147 else
3148 {
3149 state_newline = -1;
3150 state_letter = -1;
3151 state = -1;
3152 }
3153
3154
3155 for (int i = 0; i < NOTCHAR; i++)
3156 if (tstbit (i, &label))
3157 switch (d->syntax.sbit[i])
3158 {
3159 case CTX_NEWLINE:
3160 trans[i] = state_newline;
3161 break;
3162 case CTX_LETTER:
3163 trans[i] = state_letter;
3164 break;
3165 default:
3166 trans[i] = state;
3167 break;
3168 }
3169
3170 #ifdef DEBUG
3171 fprintf (stderr, "trans table %td", s);
3172 for (int i = 0; i < NOTCHAR; ++i)
3173 {
3174 if (!(i & 0xf))
3175 fprintf (stderr, "\n");
3176 fprintf (stderr, " %2td", trans[i]);
3177 }
3178 fprintf (stderr, "\n");
3179 #endif
3180
3181 free (group.elems);
3182 free (follows.elems);
3183 free (tmp.elems);
3184
3185
3186
3187 if (tstbit (d->syntax.eolbyte, &label))
3188 {
3189 d->newlines[s] = trans[d->syntax.eolbyte];
3190 trans[d->syntax.eolbyte] = -1;
3191 }
3192
3193 return trans[uc];
3194 }
3195
3196
3197
3198
3199
3200
3201
3202 static state_num
3203 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3204 {
3205 state_num *t;
3206
3207 if (d->trans[s])
3208 t = d->trans[s];
3209 else if (d->fails[s])
3210 t = d->fails[s];
3211 else
3212 {
3213 build_state (s, d, **pp);
3214 if (d->trans[s])
3215 t = d->trans[s];
3216 else
3217 {
3218 t = d->fails[s];
3219 assert (t);
3220 }
3221 }
3222
3223 if (t[**pp] == -2)
3224 build_state (s, d, **pp);
3225
3226 return t[*(*pp)++];
3227 }
3228
3229
3230
3231
3232 static state_num
3233 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3234 unsigned char const *end)
3235 {
3236 wint_t wc;
3237
3238 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3239
3240
3241 d->mb_follows.nelem = 0;
3242
3243
3244
3245 state_num s1 = s;
3246 int mbci;
3247 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3248 s = transit_state_singlebyte (d, s, pp);
3249 *pp += mbclen - mbci;
3250
3251 if (wc == WEOF)
3252 {
3253
3254 return s;
3255 }
3256
3257
3258
3259
3260 if (d->states[s1].mb_trindex < 0)
3261 {
3262 if (MAX_TRCOUNT <= d->mb_trcount)
3263 {
3264 state_num s3;
3265 for (s3 = -1; s3 < d->tralloc; s3++)
3266 {
3267 free (d->mb_trans[s3]);
3268 d->mb_trans[s3] = NULL;
3269 }
3270
3271 for (state_num i = 0; i < d->sindex; i++)
3272 d->states[i].mb_trindex = -1;
3273 d->mb_trcount = 0;
3274 }
3275 d->states[s1].mb_trindex = d->mb_trcount++;
3276 }
3277
3278 if (! d->mb_trans[s])
3279 {
3280 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3281 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3282 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3283 for (int i = 0; i < MAX_TRCOUNT; i++)
3284 d->mb_trans[s][i] = -1;
3285 }
3286 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3287 return d->mb_trans[s][d->states[s1].mb_trindex];
3288
3289 if (s == -1)
3290 copy (&d->states[s1].mbps, &d->mb_follows);
3291 else
3292 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3293
3294 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3295 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3296 realloc_trans_if_necessary (d);
3297
3298 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3299
3300 return s2;
3301 }
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318 static unsigned char const *
3319 skip_remains_mb (struct dfa *d, unsigned char const *p,
3320 unsigned char const *mbp, char const *end)
3321 {
3322 if (d->syntax.never_trail[*p])
3323 return p;
3324 while (mbp < p)
3325 {
3326 wint_t wc;
3327 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3328 end - (char const *) mbp, d);
3329 }
3330 return mbp;
3331 }
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353 static inline char *
3354 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3355 idx_t *count, bool multibyte)
3356 {
3357 if (MAX_TRCOUNT <= d->sindex)
3358 {
3359 for (state_num s = d->min_trcount; s < d->sindex; s++)
3360 {
3361 free (d->states[s].elems.elems);
3362 free (d->states[s].mbps.elems);
3363 }
3364 d->sindex = d->min_trcount;
3365
3366 if (d->trans)
3367 {
3368 for (state_num s = 0; s < d->tralloc; s++)
3369 {
3370 free (d->trans[s]);
3371 free (d->fails[s]);
3372 d->trans[s] = d->fails[s] = NULL;
3373 }
3374 d->trcount = 0;
3375 }
3376
3377 if (d->localeinfo.multibyte && d->mb_trans)
3378 {
3379 for (state_num s = -1; s < d->tralloc; s++)
3380 {
3381 free (d->mb_trans[s]);
3382 d->mb_trans[s] = NULL;
3383 }
3384 for (state_num s = 0; s < d->min_trcount; s++)
3385 d->states[s].mb_trindex = -1;
3386 d->mb_trcount = 0;
3387 }
3388 }
3389
3390 if (!d->tralloc)
3391 realloc_trans_if_necessary (d);
3392
3393
3394 state_num s = 0, s1 = 0;
3395
3396
3397 unsigned char const *p = (unsigned char const *) begin;
3398 unsigned char const *mbp = p;
3399
3400
3401 state_num **trans = d->trans;
3402 unsigned char eol = d->syntax.eolbyte;
3403 unsigned char saved_end = *(unsigned char *) end;
3404 *end = eol;
3405
3406 if (multibyte)
3407 {
3408 memset (&d->mbs, 0, sizeof d->mbs);
3409 if (d->mb_follows.alloc == 0)
3410 alloc_position_set (&d->mb_follows, d->nleaves);
3411 }
3412
3413 idx_t nlcount = 0;
3414 for (;;)
3415 {
3416 state_num *t;
3417 while ((t = trans[s]) != NULL)
3418 {
3419 if (s < d->min_trcount)
3420 {
3421 if (!multibyte || d->states[s].mbps.nelem == 0)
3422 {
3423 while (t[*p] == s)
3424 p++;
3425 }
3426 if (multibyte)
3427 p = mbp = skip_remains_mb (d, p, mbp, end);
3428 }
3429
3430 if (multibyte)
3431 {
3432 s1 = s;
3433
3434 if (d->states[s].mbps.nelem == 0
3435 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3436 {
3437
3438
3439 s = t[*p++];
3440 }
3441 else
3442 {
3443 s = transit_state (d, s, &p, (unsigned char *) end);
3444 mbp = p;
3445 trans = d->trans;
3446 }
3447 }
3448 else
3449 {
3450 s1 = t[*p++];
3451 t = trans[s1];
3452 if (! t)
3453 {
3454 state_num tmp = s;
3455 s = s1;
3456 s1 = tmp;
3457 break;
3458 }
3459 if (s < d->min_trcount)
3460 {
3461 while (t[*p] == s1)
3462 p++;
3463 }
3464 s = t[*p++];
3465 }
3466 }
3467
3468 if (s < 0)
3469 {
3470 if (s == -2)
3471 {
3472 s = build_state (s1, d, p[-1]);
3473 trans = d->trans;
3474 }
3475 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3476 {
3477
3478
3479 nlcount++;
3480 mbp = p;
3481
3482 s = (allow_nl ? d->newlines[s1]
3483 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3484 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3485 : d->initstate_notbol);
3486 }
3487 else
3488 {
3489 p = NULL;
3490 goto done;
3491 }
3492 }
3493 else if (d->fails[s])
3494 {
3495 if ((d->success[s] & d->syntax.sbit[*p])
3496 || ((char *) p == end
3497 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3498 d)))
3499 goto done;
3500
3501 if (multibyte && s < d->min_trcount)
3502 p = mbp = skip_remains_mb (d, p, mbp, end);
3503
3504 s1 = s;
3505 if (!multibyte || d->states[s].mbps.nelem == 0
3506 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3507 {
3508
3509
3510 s = d->fails[s][*p++];
3511 }
3512 else
3513 {
3514 s = transit_state (d, s, &p, (unsigned char *) end);
3515 mbp = p;
3516 trans = d->trans;
3517 }
3518 }
3519 else
3520 {
3521 build_state (s, d, p[0]);
3522 trans = d->trans;
3523 }
3524 }
3525
3526 done:
3527 if (count)
3528 *count += nlcount;
3529 *end = saved_end;
3530 return (char *) p;
3531 }
3532
3533
3534
3535
3536 static char *
3537 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3538 bool allow_nl, idx_t *count, bool *backref)
3539 {
3540 return dfaexec_main (d, begin, end, allow_nl, count, true);
3541 }
3542
3543 static char *
3544 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3545 bool allow_nl, idx_t *count, bool *backref)
3546 {
3547 return dfaexec_main (d, begin, end, allow_nl, count, false);
3548 }
3549
3550
3551
3552 static char *
3553 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3554 bool allow_nl, idx_t *count, bool *backref)
3555 {
3556 *backref = true;
3557 return (char *) begin;
3558 }
3559
3560
3561
3562
3563
3564 char *
3565 dfaexec (struct dfa *d, char const *begin, char *end,
3566 bool allow_nl, idx_t *count, bool *backref)
3567 {
3568 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3569 }
3570
3571 struct dfa *
3572 dfasuperset (struct dfa const *d)
3573 {
3574 return d->superset;
3575 }
3576
3577 bool
3578 dfaisfast (struct dfa const *d)
3579 {
3580 return d->fast;
3581 }
3582
3583 static void
3584 free_mbdata (struct dfa *d)
3585 {
3586 free (d->multibyte_prop);
3587 free (d->lex.brack.chars);
3588 free (d->mb_follows.elems);
3589
3590 if (d->mb_trans)
3591 {
3592 state_num s;
3593 for (s = -1; s < d->tralloc; s++)
3594 free (d->mb_trans[s]);
3595 free (d->mb_trans - 2);
3596 }
3597 }
3598
3599
3600 bool
3601 dfasupported (struct dfa const *d)
3602 {
3603 for (idx_t i = 0; i < d->tindex; i++)
3604 {
3605 switch (d->tokens[i])
3606 {
3607 case BEGWORD:
3608 case ENDWORD:
3609 case LIMWORD:
3610 case NOTLIMWORD:
3611 if (!d->localeinfo.multibyte)
3612 continue;
3613 FALLTHROUGH;
3614 case BACKREF:
3615 case MBCSET:
3616 return false;
3617 }
3618 }
3619 return true;
3620 }
3621
3622
3623
3624 static void
3625 maybe_disable_superset_dfa (struct dfa *d)
3626 {
3627 if (!d->localeinfo.using_utf8)
3628 return;
3629
3630 bool have_backref = false;
3631 for (idx_t i = 0; i < d->tindex; i++)
3632 {
3633 switch (d->tokens[i])
3634 {
3635 case ANYCHAR:
3636
3637 abort ();
3638 case BACKREF:
3639 have_backref = true;
3640 break;
3641 case MBCSET:
3642
3643 return;
3644 default:
3645 break;
3646 }
3647 }
3648
3649 if (!have_backref && d->superset)
3650 {
3651
3652 dfafree (d->superset);
3653 free (d->superset);
3654 d->superset = NULL;
3655 }
3656
3657 free_mbdata (d);
3658 d->localeinfo.multibyte = false;
3659 d->dfaexec = dfaexec_sb;
3660 d->fast = true;
3661 }
3662
3663 static void
3664 dfassbuild (struct dfa *d)
3665 {
3666 struct dfa *sup = dfaalloc ();
3667
3668 *sup = *d;
3669 sup->localeinfo.multibyte = false;
3670 sup->dfaexec = dfaexec_sb;
3671 sup->multibyte_prop = NULL;
3672 sup->superset = NULL;
3673 sup->states = NULL;
3674 sup->sindex = 0;
3675 sup->constraints = NULL;
3676 sup->separates = NULL;
3677 sup->follows = NULL;
3678 sup->tralloc = 0;
3679 sup->trans = NULL;
3680 sup->fails = NULL;
3681 sup->success = NULL;
3682 sup->newlines = NULL;
3683
3684 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3685 if (d->cindex)
3686 {
3687 memcpy (sup->charclasses, d->charclasses,
3688 d->cindex * sizeof *sup->charclasses);
3689 }
3690
3691 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3692 sup->talloc = d->tindex * 2;
3693
3694 bool have_achar = false;
3695 bool have_nchar = false;
3696 idx_t j;
3697 for (idx_t i = j = 0; i < d->tindex; i++)
3698 {
3699 switch (d->tokens[i])
3700 {
3701 case ANYCHAR:
3702 case MBCSET:
3703 case BACKREF:
3704 {
3705 charclass ccl;
3706 fillset (&ccl);
3707 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3708 sup->tokens[j++] = STAR;
3709 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3710 || d->tokens[i + 1] == PLUS)
3711 i++;
3712 have_achar = true;
3713 }
3714 break;
3715 case BEGWORD:
3716 case ENDWORD:
3717 case LIMWORD:
3718 case NOTLIMWORD:
3719 if (d->localeinfo.multibyte)
3720 {
3721
3722
3723 sup->tokens[j++] = EMPTY;
3724 break;
3725 }
3726 FALLTHROUGH;
3727 default:
3728 sup->tokens[j++] = d->tokens[i];
3729 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3730 || d->tokens[i] >= CSET)
3731 have_nchar = true;
3732 break;
3733 }
3734 }
3735 sup->tindex = j;
3736
3737 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3738 d->superset = sup;
3739 else
3740 {
3741 dfafree (sup);
3742 free (sup);
3743 }
3744 }
3745
3746
3747
3748
3749 void
3750 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3751 {
3752 if (s != NULL)
3753 dfaparse (s, len, d);
3754
3755 dfassbuild (d);
3756
3757 if (dfasupported (d))
3758 {
3759 maybe_disable_superset_dfa (d);
3760 dfaanalyze (d, searchflag);
3761 }
3762 else
3763 {
3764 d->dfaexec = dfaexec_noop;
3765 }
3766
3767 if (d->superset)
3768 {
3769 d->fast = true;
3770 dfaanalyze (d->superset, searchflag);
3771 }
3772 }
3773
3774
3775 void
3776 dfafree (struct dfa *d)
3777 {
3778 free (d->charclasses);
3779 free (d->tokens);
3780
3781 if (d->localeinfo.multibyte)
3782 free_mbdata (d);
3783
3784 free (d->constraints);
3785 free (d->separates);
3786
3787 for (idx_t i = 0; i < d->sindex; i++)
3788 {
3789 free (d->states[i].elems.elems);
3790 free (d->states[i].mbps.elems);
3791 }
3792 free (d->states);
3793
3794 if (d->follows)
3795 {
3796 for (idx_t i = 0; i < d->tindex; i++)
3797 free (d->follows[i].elems);
3798 free (d->follows);
3799 }
3800
3801 if (d->trans)
3802 {
3803 for (idx_t i = 0; i < d->tralloc; i++)
3804 {
3805 free (d->trans[i]);
3806 free (d->fails[i]);
3807 }
3808
3809 free (d->trans - 2);
3810 free (d->fails);
3811 free (d->newlines);
3812 free (d->success);
3813 }
3814
3815 if (d->superset)
3816 {
3817 dfafree (d->superset);
3818 free (d->superset);
3819 }
3820 }
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904 static char *
3905 icatalloc (char *old, char const *new)
3906 {
3907 idx_t newsize = strlen (new);
3908 if (newsize == 0)
3909 return old;
3910 idx_t oldsize = strlen (old);
3911 char *result = xirealloc (old, oldsize + newsize + 1);
3912 memcpy (result + oldsize, new, newsize + 1);
3913 return result;
3914 }
3915
3916 static void
3917 freelist (char **cpp)
3918 {
3919 while (*cpp)
3920 free (*cpp++);
3921 }
3922
3923 static char **
3924 enlistnew (char **cpp, char *new)
3925 {
3926
3927 idx_t i;
3928 for (i = 0; cpp[i] != NULL; i++)
3929 if (strstr (cpp[i], new) != NULL)
3930 {
3931 free (new);
3932 return cpp;
3933 }
3934
3935 for (idx_t j = 0; cpp[j] != NULL; )
3936 if (strstr (new, cpp[j]) == NULL)
3937 ++j;
3938 else
3939 {
3940 free (cpp[j]);
3941 if (--i == j)
3942 break;
3943 cpp[j] = cpp[i];
3944 cpp[i] = NULL;
3945 }
3946
3947 cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
3948 cpp[i] = new;
3949 cpp[i + 1] = NULL;
3950 return cpp;
3951 }
3952
3953 static char **
3954 enlist (char **cpp, char const *str, idx_t len)
3955 {
3956 return enlistnew (cpp, ximemdup0 (str, len));
3957 }
3958
3959
3960
3961 static char **
3962 comsubs (char *left, char const *right)
3963 {
3964 char **cpp = xzalloc (sizeof *cpp);
3965
3966 for (char *lcp = left; *lcp != '\0'; lcp++)
3967 {
3968 idx_t len = 0;
3969 char *rcp = strchr (right, *lcp);
3970 while (rcp != NULL)
3971 {
3972 idx_t i;
3973 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3974 continue;
3975 if (i > len)
3976 len = i;
3977 rcp = strchr (rcp + 1, *lcp);
3978 }
3979 if (len != 0)
3980 cpp = enlist (cpp, lcp, len);
3981 }
3982 return cpp;
3983 }
3984
3985 static char **
3986 addlists (char **old, char **new)
3987 {
3988 for (; *new; new++)
3989 old = enlistnew (old, xstrdup (*new));
3990 return old;
3991 }
3992
3993
3994
3995 static char **
3996 inboth (char **left, char **right)
3997 {
3998 char **both = xzalloc (sizeof *both);
3999
4000 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4001 {
4002 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4003 {
4004 char **temp = comsubs (left[lnum], right[rnum]);
4005 both = addlists (both, temp);
4006 freelist (temp);
4007 free (temp);
4008 }
4009 }
4010 return both;
4011 }
4012
4013 typedef struct must must;
4014
4015 struct must
4016 {
4017 char **in;
4018 char *left;
4019 char *right;
4020 char *is;
4021 bool begline;
4022 bool endline;
4023 must *prev;
4024 };
4025
4026 static must *
4027 allocmust (must *mp, idx_t size)
4028 {
4029 must *new_mp = xmalloc (sizeof *new_mp);
4030 new_mp->in = xzalloc (sizeof *new_mp->in);
4031 new_mp->left = xizalloc (size);
4032 new_mp->right = xizalloc (size);
4033 new_mp->is = xizalloc (size);
4034 new_mp->begline = false;
4035 new_mp->endline = false;
4036 new_mp->prev = mp;
4037 return new_mp;
4038 }
4039
4040 static void
4041 resetmust (must *mp)
4042 {
4043 freelist (mp->in);
4044 mp->in[0] = NULL;
4045 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4046 mp->begline = false;
4047 mp->endline = false;
4048 }
4049
4050 static void
4051 freemust (must *mp)
4052 {
4053 freelist (mp->in);
4054 free (mp->in);
4055 free (mp->left);
4056 free (mp->right);
4057 free (mp->is);
4058 free (mp);
4059 }
4060
4061 struct dfamust *
4062 dfamust (struct dfa const *d)
4063 {
4064 must *mp = NULL;
4065 char const *result = "";
4066 bool exact = false;
4067 bool begline = false;
4068 bool endline = false;
4069 bool need_begline = false;
4070 bool need_endline = false;
4071 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4072
4073 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4074 {
4075 token t = d->tokens[ri];
4076 switch (t)
4077 {
4078 case BEGLINE:
4079 mp = allocmust (mp, 2);
4080 mp->begline = true;
4081 need_begline = true;
4082 break;
4083 case ENDLINE:
4084 mp = allocmust (mp, 2);
4085 mp->endline = true;
4086 need_endline = true;
4087 break;
4088 case LPAREN:
4089 case RPAREN:
4090 assert (!"neither LPAREN nor RPAREN may appear here");
4091
4092 case EMPTY:
4093 case BEGWORD:
4094 case ENDWORD:
4095 case LIMWORD:
4096 case NOTLIMWORD:
4097 case BACKREF:
4098 case ANYCHAR:
4099 case MBCSET:
4100 mp = allocmust (mp, 2);
4101 break;
4102
4103 case STAR:
4104 case QMARK:
4105 assume_nonnull (mp);
4106 resetmust (mp);
4107 break;
4108
4109 case OR:
4110 {
4111 char **new;
4112 must *rmp = mp;
4113 assume_nonnull (rmp);
4114 must *lmp = mp = mp->prev;
4115 assume_nonnull (lmp);
4116 idx_t j, ln, rn, n;
4117
4118
4119 if (streq (lmp->is, rmp->is))
4120 {
4121 lmp->begline &= rmp->begline;
4122 lmp->endline &= rmp->endline;
4123 }
4124 else
4125 {
4126 lmp->is[0] = '\0';
4127 lmp->begline = false;
4128 lmp->endline = false;
4129 }
4130
4131 idx_t i = 0;
4132 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4133 ++i;
4134 lmp->left[i] = '\0';
4135
4136 ln = strlen (lmp->right);
4137 rn = strlen (rmp->right);
4138 n = ln;
4139 if (n > rn)
4140 n = rn;
4141 for (i = 0; i < n; ++i)
4142 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4143 break;
4144 for (j = 0; j < i; ++j)
4145 lmp->right[j] = lmp->right[(ln - i) + j];
4146 lmp->right[j] = '\0';
4147 new = inboth (lmp->in, rmp->in);
4148 freelist (lmp->in);
4149 free (lmp->in);
4150 lmp->in = new;
4151 freemust (rmp);
4152 }
4153 break;
4154
4155 case PLUS:
4156 assume_nonnull (mp);
4157 mp->is[0] = '\0';
4158 break;
4159
4160 case END:
4161 assume_nonnull (mp);
4162 assert (!mp->prev);
4163 for (idx_t i = 0; mp->in[i] != NULL; i++)
4164 if (strlen (mp->in[i]) > strlen (result))
4165 result = mp->in[i];
4166 if (streq (result, mp->is))
4167 {
4168 if ((!need_begline || mp->begline) && (!need_endline
4169 || mp->endline))
4170 exact = true;
4171 begline = mp->begline;
4172 endline = mp->endline;
4173 }
4174 goto done;
4175
4176 case CAT:
4177 {
4178 must *rmp = mp;
4179 assume_nonnull (rmp);
4180 must *lmp = mp = mp->prev;
4181 assume_nonnull (lmp);
4182
4183
4184
4185
4186 lmp->in = addlists (lmp->in, rmp->in);
4187 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4188 {
4189 idx_t lrlen = strlen (lmp->right);
4190 idx_t rllen = strlen (rmp->left);
4191 char *tp = ximalloc (lrlen + rllen + 1);
4192 memcpy (tp + lrlen, rmp->left, rllen + 1);
4193 memcpy (tp, lmp->right, lrlen);
4194 lmp->in = enlistnew (lmp->in, tp);
4195 }
4196
4197 if (lmp->is[0] != '\0')
4198 lmp->left = icatalloc (lmp->left, rmp->left);
4199
4200 if (rmp->is[0] == '\0')
4201 lmp->right[0] = '\0';
4202 lmp->right = icatalloc (lmp->right, rmp->right);
4203
4204 if ((lmp->is[0] != '\0' || lmp->begline)
4205 && (rmp->is[0] != '\0' || rmp->endline))
4206 {
4207 lmp->is = icatalloc (lmp->is, rmp->is);
4208 lmp->endline = rmp->endline;
4209 }
4210 else
4211 {
4212 lmp->is[0] = '\0';
4213 lmp->begline = false;
4214 lmp->endline = false;
4215 }
4216 freemust (rmp);
4217 }
4218 break;
4219
4220 case '\0':
4221
4222 goto done;
4223
4224 default:
4225 if (CSET <= t)
4226 {
4227
4228
4229
4230
4231 charclass *ccl = &d->charclasses[t - CSET];
4232 int j;
4233 for (j = 0; j < NOTCHAR; j++)
4234 if (tstbit (j, ccl))
4235 break;
4236 if (! (j < NOTCHAR))
4237 {
4238 mp = allocmust (mp, 2);
4239 break;
4240 }
4241 t = j;
4242 while (++j < NOTCHAR)
4243 if (tstbit (j, ccl)
4244 && ! (case_fold_unibyte
4245 && toupper (j) == toupper (t)))
4246 break;
4247 if (j < NOTCHAR)
4248 {
4249 mp = allocmust (mp, 2);
4250 break;
4251 }
4252 }
4253
4254 idx_t rj = ri + 2;
4255 if (d->tokens[ri + 1] == CAT)
4256 {
4257 for (; rj < d->tindex - 1; rj += 2)
4258 {
4259 if ((rj != ri && (d->tokens[rj] <= 0
4260 || NOTCHAR <= d->tokens[rj]))
4261 || d->tokens[rj + 1] != CAT)
4262 break;
4263 }
4264 }
4265 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4266 mp->is[0] = mp->left[0] = mp->right[0]
4267 = case_fold_unibyte ? toupper (t) : t;
4268
4269 idx_t i;
4270 for (i = 1; ri + 2 < rj; i++)
4271 {
4272 ri += 2;
4273 t = d->tokens[ri];
4274 mp->is[i] = mp->left[i] = mp->right[i]
4275 = case_fold_unibyte ? toupper (t) : t;
4276 }
4277 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4278 mp->in = enlist (mp->in, mp->is, i);
4279 break;
4280 }
4281 }
4282 done:;
4283
4284 struct dfamust *dm = NULL;
4285 if (*result)
4286 {
4287 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4288 dm->exact = exact;
4289 dm->begline = begline;
4290 dm->endline = endline;
4291 strcpy (dm->must, result);
4292 }
4293
4294 while (mp)
4295 {
4296 must *prev = mp->prev;
4297 freemust (mp);
4298 mp = prev;
4299 }
4300
4301 return dm;
4302 }
4303
4304 void
4305 dfamustfree (struct dfamust *dm)
4306 {
4307 free (dm);
4308 }
4309
4310 struct dfa *
4311 dfaalloc (void)
4312 {
4313 return xmalloc (sizeof (struct dfa));
4314 }
4315
4316
4317 void
4318 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4319 reg_syntax_t bits, int dfaopts)
4320 {
4321 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4322 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4323 dfa->localeinfo = *linfo;
4324
4325 dfa->fast = !dfa->localeinfo.multibyte;
4326
4327 dfa->canychar = -1;
4328 dfa->syntax.syntax_bits_set = true;
4329 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4330 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4331 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4332 dfa->syntax.syntax_bits = bits;
4333
4334 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4335 {
4336 unsigned char uc = i;
4337
4338 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4339 switch (dfa->syntax.sbit[uc])
4340 {
4341 case CTX_LETTER:
4342 setbit (uc, &dfa->syntax.letters);
4343 break;
4344 case CTX_NEWLINE:
4345 setbit (uc, &dfa->syntax.newline);
4346 break;
4347 }
4348
4349
4350
4351 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4352 ? (uc & 0xc0) != 0x80
4353 : strchr ("\n\r./", uc) != NULL);
4354 }
4355 }
4356
4357
4358 void
4359 dfacopysyntax (struct dfa *to, struct dfa const *from)
4360 {
4361 memset (to, 0, offsetof (struct dfa, syntax));
4362 to->canychar = -1;
4363 to->fast = from->fast;
4364 to->syntax = from->syntax;
4365 to->dfaexec = from->dfaexec;
4366 to->localeinfo = from->localeinfo;
4367 }
4368
4369