This source file includes following definitions.
- re_compile_pattern
- weak_alias
- weak_alias
- weak_alias
- re_compile_fastmap_iter
- regcomp
- libc_hidden_def
- free_dfa_content
- regfree
- libc_hidden_def
- libc_freeres_fn
- re_compile_internal
- init_dfa
- init_word_char
- free_workarea_compile
- create_initial_state
- optimize_utf8
- analyze
- postorder
- preorder
- optimize_subexps
- lower_subexps
- lower_subexp
- calc_first
- calc_next
- link_nfa_nodes
- duplicate_node_closure
- search_duplicated_node
- duplicate_node
- calc_inveclosure
- calc_eclosure
- calc_eclosure_iter
- fetch_token
- peek_token
- peek_token_bracket
- parse
- parse_reg_exp
- parse_branch
- parse_expression
- parse_sub_exp
- parse_dup_op
- parse_byte
- build_range_exp
- build_collating_symbol
- seek_collating_symbol_entry
- lookup_collation_sequence_value
- build_range_exp
- build_collating_symbol
- parse_bracket_exp
- parse_bracket_element
- parse_bracket_symbol
- build_equiv_class
- build_charclass
- build_charclass_op
- fetch_number
- free_charset
- create_tree
- create_token_tree
- mark_opt_subexp
- free_token
- free_tree
- duplicate_tree
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 static void free_charset (re_charset_t *cset);
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax);
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 re_charset_t *mbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *char_class_alloc,
96 const char *class_name,
97 reg_syntax_t syntax);
98 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
99 RE_TRANSLATE_TYPE trans,
100 const char *class_name,
101 const char *extra,
102 bool non_match, reg_errcode_t *err);
103 static bin_tree_t *create_tree (re_dfa_t *dfa,
104 bin_tree_t *left, bin_tree_t *right,
105 re_token_type_t type);
106 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
107 bin_tree_t *left, bin_tree_t *right,
108 const re_token_t *token);
109 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
110 static void free_token (re_token_t *node);
111 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
112 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
113
114
115
116
117
118
119 static const char __re_error_msgid[] =
120 {
121 #define REG_NOERROR_IDX 0
122 gettext_noop ("Success")
123 "\0"
124 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
125 gettext_noop ("No match")
126 "\0"
127 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
128 gettext_noop ("Invalid regular expression")
129 "\0"
130 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
131 gettext_noop ("Invalid collation character")
132 "\0"
133 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
134 gettext_noop ("Invalid character class name")
135 "\0"
136 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
137 gettext_noop ("Trailing backslash")
138 "\0"
139 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
140 gettext_noop ("Invalid back reference")
141 "\0"
142 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
143 gettext_noop ("Unmatched [, [^, [:, [., or [=")
144 "\0"
145 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
146 gettext_noop ("Unmatched ( or \\(")
147 "\0"
148 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
149 gettext_noop ("Unmatched \\{")
150 "\0"
151 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
152 gettext_noop ("Invalid content of \\{\\}")
153 "\0"
154 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
155 gettext_noop ("Invalid range end")
156 "\0"
157 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
158 gettext_noop ("Memory exhausted")
159 "\0"
160 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
161 gettext_noop ("Invalid preceding regular expression")
162 "\0"
163 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
164 gettext_noop ("Premature end of regular expression")
165 "\0"
166 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
167 gettext_noop ("Regular expression too big")
168 "\0"
169 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
170 gettext_noop ("Unmatched ) or \\)")
171 };
172
173 static const size_t __re_error_msgid_idx[] =
174 {
175 REG_NOERROR_IDX,
176 REG_NOMATCH_IDX,
177 REG_BADPAT_IDX,
178 REG_ECOLLATE_IDX,
179 REG_ECTYPE_IDX,
180 REG_EESCAPE_IDX,
181 REG_ESUBREG_IDX,
182 REG_EBRACK_IDX,
183 REG_EPAREN_IDX,
184 REG_EBRACE_IDX,
185 REG_BADBR_IDX,
186 REG_ERANGE_IDX,
187 REG_ESPACE_IDX,
188 REG_BADRPT_IDX,
189 REG_EEND_IDX,
190 REG_ESIZE_IDX,
191 REG_ERPAREN_IDX
192 };
193
194
195
196
197
198
199
200
201
202
203 const char *
204 re_compile_pattern (const char *pattern, size_t length,
205 struct re_pattern_buffer *bufp)
206 {
207 reg_errcode_t ret;
208
209
210
211
212 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
213
214
215 bufp->newline_anchor = 1;
216
217 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
218
219 if (!ret)
220 return NULL;
221 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
222 }
223 weak_alias (__re_compile_pattern, re_compile_pattern)
224
225
226
227
228
229
230 reg_syntax_t re_syntax_options;
231
232
233
234
235
236
237
238
239
240 reg_syntax_t
241 re_set_syntax (reg_syntax_t syntax)
242 {
243 reg_syntax_t ret = re_syntax_options;
244
245 re_syntax_options = syntax;
246 return ret;
247 }
248 weak_alias (__re_set_syntax, re_set_syntax)
249
250 int
251 re_compile_fastmap (struct re_pattern_buffer *bufp)
252 {
253 re_dfa_t *dfa = bufp->buffer;
254 char *fastmap = bufp->fastmap;
255
256 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
257 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
258 if (dfa->init_state != dfa->init_state_word)
259 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
260 if (dfa->init_state != dfa->init_state_nl)
261 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
262 if (dfa->init_state != dfa->init_state_begbuf)
263 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
264 bufp->fastmap_accurate = 1;
265 return 0;
266 }
267 weak_alias (__re_compile_fastmap, re_compile_fastmap)
268
269 static __always_inline void
270 re_set_fastmap (char *fastmap, bool icase, int ch)
271 {
272 fastmap[ch] = 1;
273 if (icase)
274 fastmap[tolower (ch)] = 1;
275 }
276
277
278
279
280 static void
281 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
282 char *fastmap)
283 {
284 re_dfa_t *dfa = bufp->buffer;
285 Idx node_cnt;
286 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
287 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
288 {
289 Idx node = init_state->nodes.elems[node_cnt];
290 re_token_type_t type = dfa->nodes[node].type;
291
292 if (type == CHARACTER)
293 {
294 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
295 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
296 {
297 unsigned char buf[MB_LEN_MAX];
298 unsigned char *p;
299 wchar_t wc;
300 mbstate_t state;
301
302 p = buf;
303 *p++ = dfa->nodes[node].opr.c;
304 while (++node < dfa->nodes_len
305 && dfa->nodes[node].type == CHARACTER
306 && dfa->nodes[node].mb_partial)
307 *p++ = dfa->nodes[node].opr.c;
308 memset (&state, '\0', sizeof (state));
309 if (__mbrtowc (&wc, (const char *) buf, p - buf,
310 &state) == p - buf
311 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
312 != (size_t) -1))
313 re_set_fastmap (fastmap, false, buf[0]);
314 }
315 }
316 else if (type == SIMPLE_BRACKET)
317 {
318 int i, ch;
319 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
320 {
321 int j;
322 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
323 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
324 if (w & ((bitset_word_t) 1 << j))
325 re_set_fastmap (fastmap, icase, ch);
326 }
327 }
328 else if (type == COMPLEX_BRACKET)
329 {
330 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
331 Idx i;
332
333 #ifdef _LIBC
334
335
336
337
338
339
340 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
341 && (cset->ncoll_syms || cset->nranges))
342 {
343 const int32_t *table = (const int32_t *)
344 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345 for (i = 0; i < SBC_MAX; ++i)
346 if (table[i] < 0)
347 re_set_fastmap (fastmap, icase, i);
348 }
349 #endif
350
351
352
353
354
355 if (dfa->mb_cur_max > 1
356 && (cset->nchar_classes || cset->non_match || cset->nranges
357 #ifdef _LIBC
358 || cset->nequiv_classes
359 #endif
360 ))
361 {
362 unsigned char c = 0;
363 do
364 {
365 mbstate_t mbs;
366 memset (&mbs, 0, sizeof (mbs));
367 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
368 re_set_fastmap (fastmap, false, (int) c);
369 }
370 while (++c != 0);
371 }
372
373 else
374 {
375
376 for (i = 0; i < cset->nmbchars; ++i)
377 {
378 char buf[256];
379 mbstate_t state;
380 memset (&state, '\0', sizeof (state));
381 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
382 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
383 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
384 {
385 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
386 != (size_t) -1)
387 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
388 }
389 }
390 }
391 }
392 else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
393 {
394 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395 if (type == END_OF_RE)
396 bufp->can_be_null = 1;
397 return;
398 }
399 }
400 }
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438 int
439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
440 {
441 reg_errcode_t ret;
442 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
443 : RE_SYNTAX_POSIX_BASIC);
444
445 preg->buffer = NULL;
446 preg->allocated = 0;
447 preg->used = 0;
448
449
450 preg->fastmap = re_malloc (char, SBC_MAX);
451 if (__glibc_unlikely (preg->fastmap == NULL))
452 return REG_ESPACE;
453
454 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
455
456
457 if (cflags & REG_NEWLINE)
458 {
459 syntax &= ~RE_DOT_NEWLINE;
460 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
461
462 preg->newline_anchor = 1;
463 }
464 else
465 preg->newline_anchor = 0;
466 preg->no_sub = !!(cflags & REG_NOSUB);
467 preg->translate = NULL;
468
469 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
470
471
472
473 if (ret == REG_ERPAREN)
474 ret = REG_EPAREN;
475
476
477 if (__glibc_likely (ret == REG_NOERROR))
478
479
480 (void) re_compile_fastmap (preg);
481 else
482 {
483
484 re_free (preg->fastmap);
485 preg->fastmap = NULL;
486 }
487
488 return (int) ret;
489 }
490 libc_hidden_def (__regcomp)
491 weak_alias (__regcomp, regcomp)
492
493
494
495
496 size_t
497 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
498 size_t errbuf_size)
499 {
500 const char *msg;
501 size_t msg_size;
502 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
503
504 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
505
506
507
508
509 abort ();
510
511 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
512
513 msg_size = strlen (msg) + 1;
514
515 if (__glibc_likely (errbuf_size != 0))
516 {
517 size_t cpy_size = msg_size;
518 if (__glibc_unlikely (msg_size > errbuf_size))
519 {
520 cpy_size = errbuf_size - 1;
521 errbuf[cpy_size] = '\0';
522 }
523 memcpy (errbuf, msg, cpy_size);
524 }
525
526 return msg_size;
527 }
528 weak_alias (__regerror, regerror)
529
530
531
532
533
534
535 static const bitset_t utf8_sb_map =
536 {
537
538 #if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
539 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
540 #else
541 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
542 # error "bitset_word_t is narrower than 32 bits"
543 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
544 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
545 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
546 BITSET_WORD_MAX, BITSET_WORD_MAX,
547 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
548 BITSET_WORD_MAX,
549 # endif
550 (BITSET_WORD_MAX
551 >> (SBC_MAX % BITSET_WORD_BITS == 0
552 ? 0
553 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
554 #endif
555 };
556
557
558 static void
559 free_dfa_content (re_dfa_t *dfa)
560 {
561 Idx i, j;
562
563 if (dfa->nodes)
564 for (i = 0; i < dfa->nodes_len; ++i)
565 free_token (dfa->nodes + i);
566 re_free (dfa->nexts);
567 for (i = 0; i < dfa->nodes_len; ++i)
568 {
569 if (dfa->eclosures != NULL)
570 re_node_set_free (dfa->eclosures + i);
571 if (dfa->inveclosures != NULL)
572 re_node_set_free (dfa->inveclosures + i);
573 if (dfa->edests != NULL)
574 re_node_set_free (dfa->edests + i);
575 }
576 re_free (dfa->edests);
577 re_free (dfa->eclosures);
578 re_free (dfa->inveclosures);
579 re_free (dfa->nodes);
580
581 if (dfa->state_table)
582 for (i = 0; i <= dfa->state_hash_mask; ++i)
583 {
584 struct re_state_table_entry *entry = dfa->state_table + i;
585 for (j = 0; j < entry->num; ++j)
586 {
587 re_dfastate_t *state = entry->array[j];
588 free_state (state);
589 }
590 re_free (entry->array);
591 }
592 re_free (dfa->state_table);
593 if (dfa->sb_char != utf8_sb_map)
594 re_free (dfa->sb_char);
595 re_free (dfa->subexp_map);
596 #ifdef DEBUG
597 re_free (dfa->re_str);
598 #endif
599
600 re_free (dfa);
601 }
602
603
604
605
606 void
607 regfree (regex_t *preg)
608 {
609 re_dfa_t *dfa = preg->buffer;
610 if (__glibc_likely (dfa != NULL))
611 {
612 lock_fini (dfa->lock);
613 free_dfa_content (dfa);
614 }
615 preg->buffer = NULL;
616 preg->allocated = 0;
617
618 re_free (preg->fastmap);
619 preg->fastmap = NULL;
620
621 re_free (preg->translate);
622 preg->translate = NULL;
623 }
624 libc_hidden_def (__regfree)
625 weak_alias (__regfree, regfree)
626
627
628
629
630 #if defined _REGEX_RE_COMP || defined _LIBC
631
632
633 static struct re_pattern_buffer re_comp_buf;
634
635 char *
636 # ifdef _LIBC
637
638
639
640 weak_function
641 # endif
642 re_comp (const char *s)
643 {
644 reg_errcode_t ret;
645 char *fastmap;
646
647 if (!s)
648 {
649 if (!re_comp_buf.buffer)
650 return gettext ("No previous regular expression");
651 return 0;
652 }
653
654 if (re_comp_buf.buffer)
655 {
656 fastmap = re_comp_buf.fastmap;
657 re_comp_buf.fastmap = NULL;
658 __regfree (&re_comp_buf);
659 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660 re_comp_buf.fastmap = fastmap;
661 }
662
663 if (re_comp_buf.fastmap == NULL)
664 {
665 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
666 if (re_comp_buf.fastmap == NULL)
667 return (char *) gettext (__re_error_msgid
668 + __re_error_msgid_idx[(int) REG_ESPACE]);
669 }
670
671
672
673
674
675 re_comp_buf.newline_anchor = 1;
676
677 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
678
679 if (!ret)
680 return NULL;
681
682
683 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
684 }
685
686 #ifdef _LIBC
687 libc_freeres_fn (free_mem)
688 {
689 __regfree (&re_comp_buf);
690 }
691 #endif
692
693 #endif
694
695
696
697
698
699 static reg_errcode_t
700 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
701 reg_syntax_t syntax)
702 {
703 reg_errcode_t err = REG_NOERROR;
704 re_dfa_t *dfa;
705 re_string_t regexp;
706
707
708 preg->fastmap_accurate = 0;
709 preg->syntax = syntax;
710 preg->not_bol = preg->not_eol = 0;
711 preg->used = 0;
712 preg->re_nsub = 0;
713 preg->can_be_null = 0;
714 preg->regs_allocated = REGS_UNALLOCATED;
715
716
717 dfa = preg->buffer;
718 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
719 {
720
721
722
723
724 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
725 if (dfa == NULL)
726 return REG_ESPACE;
727 preg->allocated = sizeof (re_dfa_t);
728 preg->buffer = dfa;
729 }
730 preg->used = sizeof (re_dfa_t);
731
732 err = init_dfa (dfa, length);
733 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
734 err = REG_ESPACE;
735 if (__glibc_unlikely (err != REG_NOERROR))
736 {
737 free_dfa_content (dfa);
738 preg->buffer = NULL;
739 preg->allocated = 0;
740 return err;
741 }
742 #ifdef DEBUG
743
744 dfa->re_str = re_malloc (char, length + 1);
745 strncpy (dfa->re_str, pattern, length + 1);
746 #endif
747
748 err = re_string_construct (®exp, pattern, length, preg->translate,
749 (syntax & RE_ICASE) != 0, dfa);
750 if (__glibc_unlikely (err != REG_NOERROR))
751 {
752 re_compile_internal_free_return:
753 free_workarea_compile (preg);
754 re_string_destruct (®exp);
755 lock_fini (dfa->lock);
756 free_dfa_content (dfa);
757 preg->buffer = NULL;
758 preg->allocated = 0;
759 return err;
760 }
761
762
763 preg->re_nsub = 0;
764 dfa->str_tree = parse (®exp, preg, syntax, &err);
765 if (__glibc_unlikely (dfa->str_tree == NULL))
766 goto re_compile_internal_free_return;
767
768
769 err = analyze (preg);
770 if (__glibc_unlikely (err != REG_NOERROR))
771 goto re_compile_internal_free_return;
772
773
774 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
775 optimize_utf8 (dfa);
776
777
778 err = create_initial_state (dfa);
779
780
781 free_workarea_compile (preg);
782 re_string_destruct (®exp);
783
784 if (__glibc_unlikely (err != REG_NOERROR))
785 {
786 lock_fini (dfa->lock);
787 free_dfa_content (dfa);
788 preg->buffer = NULL;
789 preg->allocated = 0;
790 }
791
792 return err;
793 }
794
795
796
797
798 static reg_errcode_t
799 init_dfa (re_dfa_t *dfa, size_t pat_len)
800 {
801 __re_size_t table_size;
802 #ifndef _LIBC
803 const char *codeset_name;
804 #endif
805 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
806 size_t max_object_size =
807 MAX (sizeof (struct re_state_table_entry),
808 MAX (sizeof (re_token_t),
809 MAX (sizeof (re_node_set),
810 MAX (sizeof (regmatch_t),
811 max_i18n_object_size))));
812
813 memset (dfa, '\0', sizeof (re_dfa_t));
814
815
816 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
817
818
819
820
821
822 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
823 <= pat_len))
824 return REG_ESPACE;
825
826 dfa->nodes_alloc = pat_len + 1;
827 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
828
829
830 for (table_size = 1; ; table_size <<= 1)
831 if (table_size > pat_len)
832 break;
833
834 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
835 dfa->state_hash_mask = table_size - 1;
836
837 dfa->mb_cur_max = MB_CUR_MAX;
838 #ifdef _LIBC
839 if (dfa->mb_cur_max == 6
840 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
841 dfa->is_utf8 = 1;
842 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
843 != 0);
844 #else
845 codeset_name = nl_langinfo (CODESET);
846 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
847 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
848 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
849 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
850 dfa->is_utf8 = 1;
851
852
853
854 dfa->map_notascii = 0;
855 #endif
856
857 if (dfa->mb_cur_max > 1)
858 {
859 if (dfa->is_utf8)
860 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861 else
862 {
863 int i, j, ch;
864
865 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
866 if (__glibc_unlikely (dfa->sb_char == NULL))
867 return REG_ESPACE;
868
869
870 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
871 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
872 {
873 wint_t wch = __btowc (ch);
874 if (wch != WEOF)
875 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
876 #ifndef _LIBC
877 if (isascii (ch) && wch != ch)
878 dfa->map_notascii = 1;
879 #endif
880 }
881 }
882 }
883
884 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
885 return REG_ESPACE;
886 return REG_NOERROR;
887 }
888
889
890
891
892
893 static void
894 init_word_char (re_dfa_t *dfa)
895 {
896 int i = 0;
897 int j;
898 int ch = 0;
899 dfa->word_ops_used = 1;
900 if (__glibc_likely (dfa->map_notascii == 0))
901 {
902 bitset_word_t bits0 = 0x00000000;
903 bitset_word_t bits1 = 0x03ff0000;
904 bitset_word_t bits2 = 0x87fffffe;
905 bitset_word_t bits3 = 0x07fffffe;
906 if (BITSET_WORD_BITS == 64)
907 {
908
909 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
910 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
911 i = 2;
912 }
913 else if (BITSET_WORD_BITS == 32)
914 {
915 dfa->word_char[0] = bits0;
916 dfa->word_char[1] = bits1;
917 dfa->word_char[2] = bits2;
918 dfa->word_char[3] = bits3;
919 i = 4;
920 }
921 else
922 goto general_case;
923 ch = 128;
924
925 if (__glibc_likely (dfa->is_utf8))
926 {
927 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
928 return;
929 }
930 }
931
932 general_case:
933 for (; i < BITSET_WORDS; ++i)
934 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935 if (isalnum (ch) || ch == '_')
936 dfa->word_char[i] |= (bitset_word_t) 1 << j;
937 }
938
939
940
941 static void
942 free_workarea_compile (regex_t *preg)
943 {
944 re_dfa_t *dfa = preg->buffer;
945 bin_tree_storage_t *storage, *next;
946 for (storage = dfa->str_tree_storage; storage; storage = next)
947 {
948 next = storage->next;
949 re_free (storage);
950 }
951 dfa->str_tree_storage = NULL;
952 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
953 dfa->str_tree = NULL;
954 re_free (dfa->org_indices);
955 dfa->org_indices = NULL;
956 }
957
958
959
960 static reg_errcode_t
961 create_initial_state (re_dfa_t *dfa)
962 {
963 Idx first, i;
964 reg_errcode_t err;
965 re_node_set init_nodes;
966
967
968
969 first = dfa->str_tree->first->node_idx;
970 dfa->init_node = first;
971 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
972 if (__glibc_unlikely (err != REG_NOERROR))
973 return err;
974
975
976
977
978
979 if (dfa->nbackref > 0)
980 for (i = 0; i < init_nodes.nelem; ++i)
981 {
982 Idx node_idx = init_nodes.elems[i];
983 re_token_type_t type = dfa->nodes[node_idx].type;
984
985 Idx clexp_idx;
986 if (type != OP_BACK_REF)
987 continue;
988 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
989 {
990 re_token_t *clexp_node;
991 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
992 if (clexp_node->type == OP_CLOSE_SUBEXP
993 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
994 break;
995 }
996 if (clexp_idx == init_nodes.nelem)
997 continue;
998
999 if (type == OP_BACK_REF)
1000 {
1001 Idx dest_idx = dfa->edests[node_idx].elems[0];
1002 if (!re_node_set_contains (&init_nodes, dest_idx))
1003 {
1004 reg_errcode_t merge_err
1005 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1006 if (merge_err != REG_NOERROR)
1007 return merge_err;
1008 i = 0;
1009 }
1010 }
1011 }
1012
1013
1014 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1015
1016 if (__glibc_unlikely (dfa->init_state == NULL))
1017 return err;
1018 if (dfa->init_state->has_constraint)
1019 {
1020 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1021 CONTEXT_WORD);
1022 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1023 CONTEXT_NEWLINE);
1024 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1025 &init_nodes,
1026 CONTEXT_NEWLINE
1027 | CONTEXT_BEGBUF);
1028 if (__glibc_unlikely (dfa->init_state_word == NULL
1029 || dfa->init_state_nl == NULL
1030 || dfa->init_state_begbuf == NULL))
1031 return err;
1032 }
1033 else
1034 dfa->init_state_word = dfa->init_state_nl
1035 = dfa->init_state_begbuf = dfa->init_state;
1036
1037 re_node_set_free (&init_nodes);
1038 return REG_NOERROR;
1039 }
1040
1041
1042
1043
1044
1045 static void
1046 optimize_utf8 (re_dfa_t *dfa)
1047 {
1048 Idx node;
1049 int i;
1050 bool mb_chars = false;
1051 bool has_period = false;
1052
1053 for (node = 0; node < dfa->nodes_len; ++node)
1054 switch (dfa->nodes[node].type)
1055 {
1056 case CHARACTER:
1057 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1058 mb_chars = true;
1059 break;
1060 case ANCHOR:
1061 switch (dfa->nodes[node].opr.ctx_type)
1062 {
1063 case LINE_FIRST:
1064 case LINE_LAST:
1065 case BUF_FIRST:
1066 case BUF_LAST:
1067 break;
1068 default:
1069
1070
1071
1072 return;
1073 }
1074 break;
1075 case OP_PERIOD:
1076 has_period = true;
1077 break;
1078 case OP_BACK_REF:
1079 case OP_ALT:
1080 case END_OF_RE:
1081 case OP_DUP_ASTERISK:
1082 case OP_OPEN_SUBEXP:
1083 case OP_CLOSE_SUBEXP:
1084 break;
1085 case COMPLEX_BRACKET:
1086 return;
1087 case SIMPLE_BRACKET:
1088
1089 {
1090 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1091 ? 0
1092 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1093 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1094 {
1095 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096 return;
1097 rshift = 0;
1098 }
1099 }
1100 break;
1101 default:
1102 abort ();
1103 }
1104
1105 if (mb_chars || has_period)
1106 for (node = 0; node < dfa->nodes_len; ++node)
1107 {
1108 if (dfa->nodes[node].type == CHARACTER
1109 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1110 dfa->nodes[node].mb_partial = 0;
1111 else if (dfa->nodes[node].type == OP_PERIOD)
1112 dfa->nodes[node].type = OP_UTF8_PERIOD;
1113 }
1114
1115
1116 dfa->mb_cur_max = 1;
1117 dfa->is_utf8 = 0;
1118 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119 }
1120
1121
1122
1123
1124 static reg_errcode_t
1125 analyze (regex_t *preg)
1126 {
1127 re_dfa_t *dfa = preg->buffer;
1128 reg_errcode_t ret;
1129
1130
1131 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1132 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1133 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1136 || dfa->edests == NULL || dfa->eclosures == NULL))
1137 return REG_ESPACE;
1138
1139 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1140 if (dfa->subexp_map != NULL)
1141 {
1142 Idx i;
1143 for (i = 0; i < preg->re_nsub; i++)
1144 dfa->subexp_map[i] = i;
1145 preorder (dfa->str_tree, optimize_subexps, dfa);
1146 for (i = 0; i < preg->re_nsub; i++)
1147 if (dfa->subexp_map[i] != i)
1148 break;
1149 if (i == preg->re_nsub)
1150 {
1151 re_free (dfa->subexp_map);
1152 dfa->subexp_map = NULL;
1153 }
1154 }
1155
1156 ret = postorder (dfa->str_tree, lower_subexps, preg);
1157 if (__glibc_unlikely (ret != REG_NOERROR))
1158 return ret;
1159 ret = postorder (dfa->str_tree, calc_first, dfa);
1160 if (__glibc_unlikely (ret != REG_NOERROR))
1161 return ret;
1162 preorder (dfa->str_tree, calc_next, dfa);
1163 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164 if (__glibc_unlikely (ret != REG_NOERROR))
1165 return ret;
1166 ret = calc_eclosure (dfa);
1167 if (__glibc_unlikely (ret != REG_NOERROR))
1168 return ret;
1169
1170
1171
1172 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173 || dfa->nbackref)
1174 {
1175 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176 if (__glibc_unlikely (dfa->inveclosures == NULL))
1177 return REG_ESPACE;
1178 ret = calc_inveclosure (dfa);
1179 }
1180
1181 return ret;
1182 }
1183
1184
1185
1186
1187 static reg_errcode_t
1188 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189 void *extra)
1190 {
1191 bin_tree_t *node, *prev;
1192
1193 for (node = root; ; )
1194 {
1195
1196
1197 while (node->left || node->right)
1198 if (node->left)
1199 node = node->left;
1200 else
1201 node = node->right;
1202
1203 do
1204 {
1205 reg_errcode_t err = fn (extra, node);
1206 if (__glibc_unlikely (err != REG_NOERROR))
1207 return err;
1208 if (node->parent == NULL)
1209 return REG_NOERROR;
1210 prev = node;
1211 node = node->parent;
1212 }
1213
1214 while (node->right == prev || node->right == NULL);
1215 node = node->right;
1216 }
1217 }
1218
1219 static reg_errcode_t
1220 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1221 void *extra)
1222 {
1223 bin_tree_t *node;
1224
1225 for (node = root; ; )
1226 {
1227 reg_errcode_t err = fn (extra, node);
1228 if (__glibc_unlikely (err != REG_NOERROR))
1229 return err;
1230
1231
1232 if (node->left)
1233 node = node->left;
1234 else
1235 {
1236 bin_tree_t *prev = NULL;
1237 while (node->right == prev || node->right == NULL)
1238 {
1239 prev = node;
1240 node = node->parent;
1241 if (!node)
1242 return REG_NOERROR;
1243 }
1244 node = node->right;
1245 }
1246 }
1247 }
1248
1249
1250
1251
1252 static reg_errcode_t
1253 optimize_subexps (void *extra, bin_tree_t *node)
1254 {
1255 re_dfa_t *dfa = (re_dfa_t *) extra;
1256
1257 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1258 {
1259 int idx = node->token.opr.idx;
1260 node->token.opr.idx = dfa->subexp_map[idx];
1261 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1262 }
1263
1264 else if (node->token.type == SUBEXP
1265 && node->left && node->left->token.type == SUBEXP)
1266 {
1267 Idx other_idx = node->left->token.opr.idx;
1268
1269 node->left = node->left->left;
1270 if (node->left)
1271 node->left->parent = node;
1272
1273 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1274 if (other_idx < BITSET_WORD_BITS)
1275 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1276 }
1277
1278 return REG_NOERROR;
1279 }
1280
1281
1282
1283 static reg_errcode_t
1284 lower_subexps (void *extra, bin_tree_t *node)
1285 {
1286 regex_t *preg = (regex_t *) extra;
1287 reg_errcode_t err = REG_NOERROR;
1288
1289 if (node->left && node->left->token.type == SUBEXP)
1290 {
1291 node->left = lower_subexp (&err, preg, node->left);
1292 if (node->left)
1293 node->left->parent = node;
1294 }
1295 if (node->right && node->right->token.type == SUBEXP)
1296 {
1297 node->right = lower_subexp (&err, preg, node->right);
1298 if (node->right)
1299 node->right->parent = node;
1300 }
1301
1302 return err;
1303 }
1304
1305 static bin_tree_t *
1306 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1307 {
1308 re_dfa_t *dfa = preg->buffer;
1309 bin_tree_t *body = node->left;
1310 bin_tree_t *op, *cls, *tree1, *tree;
1311
1312 if (preg->no_sub
1313
1314
1315
1316
1317 && node->left != NULL
1318 && (node->token.opr.idx >= BITSET_WORD_BITS
1319 || !(dfa->used_bkref_map
1320 & ((bitset_word_t) 1 << node->token.opr.idx))))
1321 return node->left;
1322
1323
1324
1325 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1326 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1327 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1328 tree = create_tree (dfa, op, tree1, CONCAT);
1329 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1330 || op == NULL || cls == NULL))
1331 {
1332 *err = REG_ESPACE;
1333 return NULL;
1334 }
1335
1336 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1337 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1338 return tree;
1339 }
1340
1341
1342
1343 static reg_errcode_t
1344 calc_first (void *extra, bin_tree_t *node)
1345 {
1346 re_dfa_t *dfa = (re_dfa_t *) extra;
1347 if (node->token.type == CONCAT)
1348 {
1349 node->first = node->left->first;
1350 node->node_idx = node->left->node_idx;
1351 }
1352 else
1353 {
1354 node->first = node;
1355 node->node_idx = re_dfa_add_node (dfa, node->token);
1356 if (__glibc_unlikely (node->node_idx == -1))
1357 return REG_ESPACE;
1358 if (node->token.type == ANCHOR)
1359 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1360 }
1361 return REG_NOERROR;
1362 }
1363
1364
1365 static reg_errcode_t
1366 calc_next (void *extra, bin_tree_t *node)
1367 {
1368 switch (node->token.type)
1369 {
1370 case OP_DUP_ASTERISK:
1371 node->left->next = node;
1372 break;
1373 case CONCAT:
1374 node->left->next = node->right->first;
1375 node->right->next = node->next;
1376 break;
1377 default:
1378 if (node->left)
1379 node->left->next = node->next;
1380 if (node->right)
1381 node->right->next = node->next;
1382 break;
1383 }
1384 return REG_NOERROR;
1385 }
1386
1387
1388 static reg_errcode_t
1389 link_nfa_nodes (void *extra, bin_tree_t *node)
1390 {
1391 re_dfa_t *dfa = (re_dfa_t *) extra;
1392 Idx idx = node->node_idx;
1393 reg_errcode_t err = REG_NOERROR;
1394
1395 switch (node->token.type)
1396 {
1397 case CONCAT:
1398 break;
1399
1400 case END_OF_RE:
1401 DEBUG_ASSERT (node->next == NULL);
1402 break;
1403
1404 case OP_DUP_ASTERISK:
1405 case OP_ALT:
1406 {
1407 Idx left, right;
1408 dfa->has_plural_match = 1;
1409 if (node->left != NULL)
1410 left = node->left->first->node_idx;
1411 else
1412 left = node->next->node_idx;
1413 if (node->right != NULL)
1414 right = node->right->first->node_idx;
1415 else
1416 right = node->next->node_idx;
1417 DEBUG_ASSERT (left > -1);
1418 DEBUG_ASSERT (right > -1);
1419 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1420 }
1421 break;
1422
1423 case ANCHOR:
1424 case OP_OPEN_SUBEXP:
1425 case OP_CLOSE_SUBEXP:
1426 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1427 break;
1428
1429 case OP_BACK_REF:
1430 dfa->nexts[idx] = node->next->node_idx;
1431 if (node->token.type == OP_BACK_REF)
1432 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1433 break;
1434
1435 default:
1436 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1437 dfa->nexts[idx] = node->next->node_idx;
1438 break;
1439 }
1440
1441 return err;
1442 }
1443
1444
1445
1446
1447
1448 static reg_errcode_t
1449 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1450 Idx root_node, unsigned int init_constraint)
1451 {
1452 Idx org_node, clone_node;
1453 bool ok;
1454 unsigned int constraint = init_constraint;
1455 for (org_node = top_org_node, clone_node = top_clone_node;;)
1456 {
1457 Idx org_dest, clone_dest;
1458 if (dfa->nodes[org_node].type == OP_BACK_REF)
1459 {
1460
1461
1462
1463
1464 org_dest = dfa->nexts[org_node];
1465 re_node_set_empty (dfa->edests + clone_node);
1466 clone_dest = duplicate_node (dfa, org_dest, constraint);
1467 if (__glibc_unlikely (clone_dest == -1))
1468 return REG_ESPACE;
1469 dfa->nexts[clone_node] = dfa->nexts[org_node];
1470 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1471 if (__glibc_unlikely (! ok))
1472 return REG_ESPACE;
1473 }
1474 else if (dfa->edests[org_node].nelem == 0)
1475 {
1476
1477
1478
1479 dfa->nexts[clone_node] = dfa->nexts[org_node];
1480 break;
1481 }
1482 else if (dfa->edests[org_node].nelem == 1)
1483 {
1484
1485
1486 org_dest = dfa->edests[org_node].elems[0];
1487 re_node_set_empty (dfa->edests + clone_node);
1488
1489
1490 if (org_node == root_node && clone_node != org_node)
1491 {
1492 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1493 if (__glibc_unlikely (! ok))
1494 return REG_ESPACE;
1495 break;
1496 }
1497
1498 constraint |= dfa->nodes[org_node].constraint;
1499 clone_dest = duplicate_node (dfa, org_dest, constraint);
1500 if (__glibc_unlikely (clone_dest == -1))
1501 return REG_ESPACE;
1502 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503 if (__glibc_unlikely (! ok))
1504 return REG_ESPACE;
1505 }
1506 else
1507 {
1508
1509
1510 org_dest = dfa->edests[org_node].elems[0];
1511 re_node_set_empty (dfa->edests + clone_node);
1512
1513 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1514 if (clone_dest == -1)
1515 {
1516
1517 reg_errcode_t err;
1518 clone_dest = duplicate_node (dfa, org_dest, constraint);
1519 if (__glibc_unlikely (clone_dest == -1))
1520 return REG_ESPACE;
1521 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1522 if (__glibc_unlikely (! ok))
1523 return REG_ESPACE;
1524 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1525 root_node, constraint);
1526 if (__glibc_unlikely (err != REG_NOERROR))
1527 return err;
1528 }
1529 else
1530 {
1531
1532
1533 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 if (__glibc_unlikely (! ok))
1535 return REG_ESPACE;
1536 }
1537
1538 org_dest = dfa->edests[org_node].elems[1];
1539 clone_dest = duplicate_node (dfa, org_dest, constraint);
1540 if (__glibc_unlikely (clone_dest == -1))
1541 return REG_ESPACE;
1542 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543 if (__glibc_unlikely (! ok))
1544 return REG_ESPACE;
1545 }
1546 org_node = org_dest;
1547 clone_node = clone_dest;
1548 }
1549 return REG_NOERROR;
1550 }
1551
1552
1553
1554
1555 static Idx
1556 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1557 unsigned int constraint)
1558 {
1559 Idx idx;
1560 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1561 {
1562 if (org_node == dfa->org_indices[idx]
1563 && constraint == dfa->nodes[idx].constraint)
1564 return idx;
1565 }
1566 return -1;
1567 }
1568
1569
1570
1571
1572
1573 static Idx
1574 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1575 {
1576 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1577 if (__glibc_likely (dup_idx != -1))
1578 {
1579 dfa->nodes[dup_idx].constraint = constraint;
1580 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1581 dfa->nodes[dup_idx].duplicated = 1;
1582
1583
1584 dfa->org_indices[dup_idx] = org_idx;
1585 }
1586 return dup_idx;
1587 }
1588
1589 static reg_errcode_t
1590 calc_inveclosure (re_dfa_t *dfa)
1591 {
1592 Idx src, idx;
1593 bool ok;
1594 for (idx = 0; idx < dfa->nodes_len; ++idx)
1595 re_node_set_init_empty (dfa->inveclosures + idx);
1596
1597 for (src = 0; src < dfa->nodes_len; ++src)
1598 {
1599 Idx *elems = dfa->eclosures[src].elems;
1600 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1601 {
1602 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1603 if (__glibc_unlikely (! ok))
1604 return REG_ESPACE;
1605 }
1606 }
1607
1608 return REG_NOERROR;
1609 }
1610
1611
1612
1613 static reg_errcode_t
1614 calc_eclosure (re_dfa_t *dfa)
1615 {
1616 Idx node_idx;
1617 bool incomplete;
1618 DEBUG_ASSERT (dfa->nodes_len > 0);
1619 incomplete = false;
1620
1621 for (node_idx = 0; ; ++node_idx)
1622 {
1623 reg_errcode_t err;
1624 re_node_set eclosure_elem;
1625 if (node_idx == dfa->nodes_len)
1626 {
1627 if (!incomplete)
1628 break;
1629 incomplete = false;
1630 node_idx = 0;
1631 }
1632
1633 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1634
1635
1636 if (dfa->eclosures[node_idx].nelem != 0)
1637 continue;
1638
1639 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1640 if (__glibc_unlikely (err != REG_NOERROR))
1641 return err;
1642
1643 if (dfa->eclosures[node_idx].nelem == 0)
1644 {
1645 incomplete = true;
1646 re_node_set_free (&eclosure_elem);
1647 }
1648 }
1649 return REG_NOERROR;
1650 }
1651
1652
1653
1654 static reg_errcode_t
1655 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1656 {
1657 reg_errcode_t err;
1658 Idx i;
1659 re_node_set eclosure;
1660 bool incomplete = false;
1661 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662 if (__glibc_unlikely (err != REG_NOERROR))
1663 return err;
1664
1665
1666 eclosure.elems[eclosure.nelem++] = node;
1667
1668
1669
1670 dfa->eclosures[node].nelem = -1;
1671
1672
1673
1674 if (dfa->nodes[node].constraint
1675 && dfa->edests[node].nelem
1676 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1677 {
1678 err = duplicate_node_closure (dfa, node, node, node,
1679 dfa->nodes[node].constraint);
1680 if (__glibc_unlikely (err != REG_NOERROR))
1681 return err;
1682 }
1683
1684
1685 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1686 for (i = 0; i < dfa->edests[node].nelem; ++i)
1687 {
1688 re_node_set eclosure_elem;
1689 Idx edest = dfa->edests[node].elems[i];
1690
1691
1692 if (dfa->eclosures[edest].nelem == -1)
1693 {
1694 incomplete = true;
1695 continue;
1696 }
1697
1698
1699 if (dfa->eclosures[edest].nelem == 0)
1700 {
1701 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1702 if (__glibc_unlikely (err != REG_NOERROR))
1703 return err;
1704 }
1705 else
1706 eclosure_elem = dfa->eclosures[edest];
1707
1708 err = re_node_set_merge (&eclosure, &eclosure_elem);
1709 if (__glibc_unlikely (err != REG_NOERROR))
1710 return err;
1711
1712
1713 if (dfa->eclosures[edest].nelem == 0)
1714 {
1715 incomplete = true;
1716 re_node_set_free (&eclosure_elem);
1717 }
1718 }
1719
1720 if (incomplete && !root)
1721 dfa->eclosures[node].nelem = 0;
1722 else
1723 dfa->eclosures[node] = eclosure;
1724 *new_set = eclosure;
1725 return REG_NOERROR;
1726 }
1727
1728
1729
1730
1731
1732
1733 static void
1734 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1735 {
1736 re_string_skip_bytes (input, peek_token (result, input, syntax));
1737 }
1738
1739
1740
1741
1742 static int
1743 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1744 {
1745 unsigned char c;
1746
1747 if (re_string_eoi (input))
1748 {
1749 token->type = END_OF_RE;
1750 return 0;
1751 }
1752
1753 c = re_string_peek_byte (input, 0);
1754 token->opr.c = c;
1755
1756 token->word_char = 0;
1757 token->mb_partial = 0;
1758 if (input->mb_cur_max > 1
1759 && !re_string_first_byte (input, re_string_cur_idx (input)))
1760 {
1761 token->type = CHARACTER;
1762 token->mb_partial = 1;
1763 return 1;
1764 }
1765 if (c == '\\')
1766 {
1767 unsigned char c2;
1768 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1769 {
1770 token->type = BACK_SLASH;
1771 return 1;
1772 }
1773
1774 c2 = re_string_peek_byte_case (input, 1);
1775 token->opr.c = c2;
1776 token->type = CHARACTER;
1777 if (input->mb_cur_max > 1)
1778 {
1779 wint_t wc = re_string_wchar_at (input,
1780 re_string_cur_idx (input) + 1);
1781 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1782 }
1783 else
1784 token->word_char = IS_WORD_CHAR (c2) != 0;
1785
1786 switch (c2)
1787 {
1788 case '|':
1789 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1790 token->type = OP_ALT;
1791 break;
1792 case '1': case '2': case '3': case '4': case '5':
1793 case '6': case '7': case '8': case '9':
1794 if (!(syntax & RE_NO_BK_REFS))
1795 {
1796 token->type = OP_BACK_REF;
1797 token->opr.idx = c2 - '1';
1798 }
1799 break;
1800 case '<':
1801 if (!(syntax & RE_NO_GNU_OPS))
1802 {
1803 token->type = ANCHOR;
1804 token->opr.ctx_type = WORD_FIRST;
1805 }
1806 break;
1807 case '>':
1808 if (!(syntax & RE_NO_GNU_OPS))
1809 {
1810 token->type = ANCHOR;
1811 token->opr.ctx_type = WORD_LAST;
1812 }
1813 break;
1814 case 'b':
1815 if (!(syntax & RE_NO_GNU_OPS))
1816 {
1817 token->type = ANCHOR;
1818 token->opr.ctx_type = WORD_DELIM;
1819 }
1820 break;
1821 case 'B':
1822 if (!(syntax & RE_NO_GNU_OPS))
1823 {
1824 token->type = ANCHOR;
1825 token->opr.ctx_type = NOT_WORD_DELIM;
1826 }
1827 break;
1828 case 'w':
1829 if (!(syntax & RE_NO_GNU_OPS))
1830 token->type = OP_WORD;
1831 break;
1832 case 'W':
1833 if (!(syntax & RE_NO_GNU_OPS))
1834 token->type = OP_NOTWORD;
1835 break;
1836 case 's':
1837 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = OP_SPACE;
1839 break;
1840 case 'S':
1841 if (!(syntax & RE_NO_GNU_OPS))
1842 token->type = OP_NOTSPACE;
1843 break;
1844 case '`':
1845 if (!(syntax & RE_NO_GNU_OPS))
1846 {
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = BUF_FIRST;
1849 }
1850 break;
1851 case '\'':
1852 if (!(syntax & RE_NO_GNU_OPS))
1853 {
1854 token->type = ANCHOR;
1855 token->opr.ctx_type = BUF_LAST;
1856 }
1857 break;
1858 case '(':
1859 if (!(syntax & RE_NO_BK_PARENS))
1860 token->type = OP_OPEN_SUBEXP;
1861 break;
1862 case ')':
1863 if (!(syntax & RE_NO_BK_PARENS))
1864 token->type = OP_CLOSE_SUBEXP;
1865 break;
1866 case '+':
1867 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1868 token->type = OP_DUP_PLUS;
1869 break;
1870 case '?':
1871 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1872 token->type = OP_DUP_QUESTION;
1873 break;
1874 case '{':
1875 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1876 token->type = OP_OPEN_DUP_NUM;
1877 break;
1878 case '}':
1879 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1880 token->type = OP_CLOSE_DUP_NUM;
1881 break;
1882 default:
1883 break;
1884 }
1885 return 2;
1886 }
1887
1888 token->type = CHARACTER;
1889 if (input->mb_cur_max > 1)
1890 {
1891 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1893 }
1894 else
1895 token->word_char = IS_WORD_CHAR (token->opr.c);
1896
1897 switch (c)
1898 {
1899 case '\n':
1900 if (syntax & RE_NEWLINE_ALT)
1901 token->type = OP_ALT;
1902 break;
1903 case '|':
1904 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1905 token->type = OP_ALT;
1906 break;
1907 case '*':
1908 token->type = OP_DUP_ASTERISK;
1909 break;
1910 case '+':
1911 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1912 token->type = OP_DUP_PLUS;
1913 break;
1914 case '?':
1915 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1916 token->type = OP_DUP_QUESTION;
1917 break;
1918 case '{':
1919 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1920 token->type = OP_OPEN_DUP_NUM;
1921 break;
1922 case '}':
1923 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1924 token->type = OP_CLOSE_DUP_NUM;
1925 break;
1926 case '(':
1927 if (syntax & RE_NO_BK_PARENS)
1928 token->type = OP_OPEN_SUBEXP;
1929 break;
1930 case ')':
1931 if (syntax & RE_NO_BK_PARENS)
1932 token->type = OP_CLOSE_SUBEXP;
1933 break;
1934 case '[':
1935 token->type = OP_OPEN_BRACKET;
1936 break;
1937 case '.':
1938 token->type = OP_PERIOD;
1939 break;
1940 case '^':
1941 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1942 && re_string_cur_idx (input) != 0)
1943 {
1944 char prev = re_string_peek_byte (input, -1);
1945 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1946 break;
1947 }
1948 token->type = ANCHOR;
1949 token->opr.ctx_type = LINE_FIRST;
1950 break;
1951 case '$':
1952 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1953 && re_string_cur_idx (input) + 1 != re_string_length (input))
1954 {
1955 re_token_t next;
1956 re_string_skip_bytes (input, 1);
1957 peek_token (&next, input, syntax);
1958 re_string_skip_bytes (input, -1);
1959 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1960 break;
1961 }
1962 token->type = ANCHOR;
1963 token->opr.ctx_type = LINE_LAST;
1964 break;
1965 default:
1966 break;
1967 }
1968 return 1;
1969 }
1970
1971
1972
1973
1974 static int
1975 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1976 {
1977 unsigned char c;
1978 if (re_string_eoi (input))
1979 {
1980 token->type = END_OF_RE;
1981 return 0;
1982 }
1983 c = re_string_peek_byte (input, 0);
1984 token->opr.c = c;
1985
1986 if (input->mb_cur_max > 1
1987 && !re_string_first_byte (input, re_string_cur_idx (input)))
1988 {
1989 token->type = CHARACTER;
1990 return 1;
1991 }
1992
1993 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1994 && re_string_cur_idx (input) + 1 < re_string_length (input))
1995 {
1996
1997 unsigned char c2;
1998 re_string_skip_bytes (input, 1);
1999 c2 = re_string_peek_byte (input, 0);
2000 token->opr.c = c2;
2001 token->type = CHARACTER;
2002 return 1;
2003 }
2004 if (c == '[')
2005 {
2006 unsigned char c2;
2007 int token_len;
2008 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2009 c2 = re_string_peek_byte (input, 1);
2010 else
2011 c2 = 0;
2012 token->opr.c = c2;
2013 token_len = 2;
2014 switch (c2)
2015 {
2016 case '.':
2017 token->type = OP_OPEN_COLL_ELEM;
2018 break;
2019
2020 case '=':
2021 token->type = OP_OPEN_EQUIV_CLASS;
2022 break;
2023
2024 case ':':
2025 if (syntax & RE_CHAR_CLASSES)
2026 {
2027 token->type = OP_OPEN_CHAR_CLASS;
2028 break;
2029 }
2030 FALLTHROUGH;
2031 default:
2032 token->type = CHARACTER;
2033 token->opr.c = c;
2034 token_len = 1;
2035 break;
2036 }
2037 return token_len;
2038 }
2039 switch (c)
2040 {
2041 case '-':
2042 token->type = OP_CHARSET_RANGE;
2043 break;
2044 case ']':
2045 token->type = OP_CLOSE_BRACKET;
2046 break;
2047 case '^':
2048 token->type = OP_NON_MATCH_LIST;
2049 break;
2050 default:
2051 token->type = CHARACTER;
2052 }
2053 return 1;
2054 }
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070 static bin_tree_t *
2071 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2072 reg_errcode_t *err)
2073 {
2074 re_dfa_t *dfa = preg->buffer;
2075 bin_tree_t *tree, *eor, *root;
2076 re_token_t current_token;
2077 dfa->syntax = syntax;
2078 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2079 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2080 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2081 return NULL;
2082 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2083 if (tree != NULL)
2084 root = create_tree (dfa, tree, eor, CONCAT);
2085 else
2086 root = eor;
2087 if (__glibc_unlikely (eor == NULL || root == NULL))
2088 {
2089 *err = REG_ESPACE;
2090 return NULL;
2091 }
2092 return root;
2093 }
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104 static bin_tree_t *
2105 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2106 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2107 {
2108 re_dfa_t *dfa = preg->buffer;
2109 bin_tree_t *tree, *branch = NULL;
2110 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2111 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2112 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2113 return NULL;
2114
2115 while (token->type == OP_ALT)
2116 {
2117 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2118 if (token->type != OP_ALT && token->type != END_OF_RE
2119 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2120 {
2121 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2122 dfa->completed_bkref_map = initial_bkref_map;
2123 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2124 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2125 {
2126 if (tree != NULL)
2127 postorder (tree, free_tree, NULL);
2128 return NULL;
2129 }
2130 dfa->completed_bkref_map |= accumulated_bkref_map;
2131 }
2132 else
2133 branch = NULL;
2134 tree = create_tree (dfa, tree, branch, OP_ALT);
2135 if (__glibc_unlikely (tree == NULL))
2136 {
2137 *err = REG_ESPACE;
2138 return NULL;
2139 }
2140 }
2141 return tree;
2142 }
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153 static bin_tree_t *
2154 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2155 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2156 {
2157 bin_tree_t *tree, *expr;
2158 re_dfa_t *dfa = preg->buffer;
2159 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2160 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2161 return NULL;
2162
2163 while (token->type != OP_ALT && token->type != END_OF_RE
2164 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2165 {
2166 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2167 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2168 {
2169 if (tree != NULL)
2170 postorder (tree, free_tree, NULL);
2171 return NULL;
2172 }
2173 if (tree != NULL && expr != NULL)
2174 {
2175 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2176 if (newtree == NULL)
2177 {
2178 postorder (expr, free_tree, NULL);
2179 postorder (tree, free_tree, NULL);
2180 *err = REG_ESPACE;
2181 return NULL;
2182 }
2183 tree = newtree;
2184 }
2185 else if (tree == NULL)
2186 tree = expr;
2187
2188 }
2189 return tree;
2190 }
2191
2192
2193
2194
2195
2196
2197
2198 static bin_tree_t *
2199 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2200 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2201 {
2202 re_dfa_t *dfa = preg->buffer;
2203 bin_tree_t *tree;
2204 switch (token->type)
2205 {
2206 case CHARACTER:
2207 tree = create_token_tree (dfa, NULL, NULL, token);
2208 if (__glibc_unlikely (tree == NULL))
2209 {
2210 *err = REG_ESPACE;
2211 return NULL;
2212 }
2213 if (dfa->mb_cur_max > 1)
2214 {
2215 while (!re_string_eoi (regexp)
2216 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2217 {
2218 bin_tree_t *mbc_remain;
2219 fetch_token (token, regexp, syntax);
2220 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2221 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2222 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2223 {
2224 *err = REG_ESPACE;
2225 return NULL;
2226 }
2227 }
2228 }
2229 break;
2230
2231 case OP_OPEN_SUBEXP:
2232 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2233 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2234 return NULL;
2235 break;
2236
2237 case OP_OPEN_BRACKET:
2238 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2239 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2240 return NULL;
2241 break;
2242
2243 case OP_BACK_REF:
2244 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2245 {
2246 *err = REG_ESUBREG;
2247 return NULL;
2248 }
2249 dfa->used_bkref_map |= 1 << token->opr.idx;
2250 tree = create_token_tree (dfa, NULL, NULL, token);
2251 if (__glibc_unlikely (tree == NULL))
2252 {
2253 *err = REG_ESPACE;
2254 return NULL;
2255 }
2256 ++dfa->nbackref;
2257 dfa->has_mb_node = 1;
2258 break;
2259
2260 case OP_OPEN_DUP_NUM:
2261 if (syntax & RE_CONTEXT_INVALID_DUP)
2262 {
2263 *err = REG_BADRPT;
2264 return NULL;
2265 }
2266 FALLTHROUGH;
2267 case OP_DUP_ASTERISK:
2268 case OP_DUP_PLUS:
2269 case OP_DUP_QUESTION:
2270 if (syntax & RE_CONTEXT_INVALID_OPS)
2271 {
2272 *err = REG_BADRPT;
2273 return NULL;
2274 }
2275 else if (syntax & RE_CONTEXT_INDEP_OPS)
2276 {
2277 fetch_token (token, regexp, syntax);
2278 return parse_expression (regexp, preg, token, syntax, nest, err);
2279 }
2280 FALLTHROUGH;
2281 case OP_CLOSE_SUBEXP:
2282 if ((token->type == OP_CLOSE_SUBEXP)
2283 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2284 {
2285 *err = REG_ERPAREN;
2286 return NULL;
2287 }
2288 FALLTHROUGH;
2289 case OP_CLOSE_DUP_NUM:
2290
2291
2292
2293 token->type = CHARACTER;
2294
2295
2296 tree = create_token_tree (dfa, NULL, NULL, token);
2297 if (__glibc_unlikely (tree == NULL))
2298 {
2299 *err = REG_ESPACE;
2300 return NULL;
2301 }
2302 break;
2303
2304 case ANCHOR:
2305 if ((token->opr.ctx_type
2306 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2307 && dfa->word_ops_used == 0)
2308 init_word_char (dfa);
2309 if (token->opr.ctx_type == WORD_DELIM
2310 || token->opr.ctx_type == NOT_WORD_DELIM)
2311 {
2312 bin_tree_t *tree_first, *tree_last;
2313 if (token->opr.ctx_type == WORD_DELIM)
2314 {
2315 token->opr.ctx_type = WORD_FIRST;
2316 tree_first = create_token_tree (dfa, NULL, NULL, token);
2317 token->opr.ctx_type = WORD_LAST;
2318 }
2319 else
2320 {
2321 token->opr.ctx_type = INSIDE_WORD;
2322 tree_first = create_token_tree (dfa, NULL, NULL, token);
2323 token->opr.ctx_type = INSIDE_NOTWORD;
2324 }
2325 tree_last = create_token_tree (dfa, NULL, NULL, token);
2326 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2327 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2328 || tree == NULL))
2329 {
2330 *err = REG_ESPACE;
2331 return NULL;
2332 }
2333 }
2334 else
2335 {
2336 tree = create_token_tree (dfa, NULL, NULL, token);
2337 if (__glibc_unlikely (tree == NULL))
2338 {
2339 *err = REG_ESPACE;
2340 return NULL;
2341 }
2342 }
2343
2344
2345
2346
2347 fetch_token (token, regexp, syntax);
2348 return tree;
2349
2350 case OP_PERIOD:
2351 tree = create_token_tree (dfa, NULL, NULL, token);
2352 if (__glibc_unlikely (tree == NULL))
2353 {
2354 *err = REG_ESPACE;
2355 return NULL;
2356 }
2357 if (dfa->mb_cur_max > 1)
2358 dfa->has_mb_node = 1;
2359 break;
2360
2361 case OP_WORD:
2362 case OP_NOTWORD:
2363 tree = build_charclass_op (dfa, regexp->trans,
2364 "alnum",
2365 "_",
2366 token->type == OP_NOTWORD, err);
2367 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2368 return NULL;
2369 break;
2370
2371 case OP_SPACE:
2372 case OP_NOTSPACE:
2373 tree = build_charclass_op (dfa, regexp->trans,
2374 "space",
2375 "",
2376 token->type == OP_NOTSPACE, err);
2377 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2378 return NULL;
2379 break;
2380
2381 case OP_ALT:
2382 case END_OF_RE:
2383 return NULL;
2384
2385 case BACK_SLASH:
2386 *err = REG_EESCAPE;
2387 return NULL;
2388
2389 default:
2390
2391 DEBUG_ASSERT (false);
2392 return NULL;
2393 }
2394 fetch_token (token, regexp, syntax);
2395
2396 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2397 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2398 {
2399 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2400 syntax, err);
2401 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2402 {
2403 if (tree != NULL)
2404 postorder (tree, free_tree, NULL);
2405 return NULL;
2406 }
2407 tree = dup_tree;
2408
2409 if ((syntax & RE_CONTEXT_INVALID_DUP)
2410 && (token->type == OP_DUP_ASTERISK
2411 || token->type == OP_OPEN_DUP_NUM))
2412 {
2413 if (tree != NULL)
2414 postorder (tree, free_tree, NULL);
2415 *err = REG_BADRPT;
2416 return NULL;
2417 }
2418 }
2419
2420 return tree;
2421 }
2422
2423
2424
2425
2426
2427
2428
2429
2430 static bin_tree_t *
2431 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2432 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2433 {
2434 re_dfa_t *dfa = preg->buffer;
2435 bin_tree_t *tree;
2436 size_t cur_nsub;
2437 cur_nsub = preg->re_nsub++;
2438
2439 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2440
2441
2442 if (token->type == OP_CLOSE_SUBEXP)
2443 tree = NULL;
2444 else
2445 {
2446 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2447 if (__glibc_unlikely (*err == REG_NOERROR
2448 && token->type != OP_CLOSE_SUBEXP))
2449 {
2450 if (tree != NULL)
2451 postorder (tree, free_tree, NULL);
2452 *err = REG_EPAREN;
2453 }
2454 if (__glibc_unlikely (*err != REG_NOERROR))
2455 return NULL;
2456 }
2457
2458 if (cur_nsub <= '9' - '1')
2459 dfa->completed_bkref_map |= 1 << cur_nsub;
2460
2461 tree = create_tree (dfa, tree, NULL, SUBEXP);
2462 if (__glibc_unlikely (tree == NULL))
2463 {
2464 *err = REG_ESPACE;
2465 return NULL;
2466 }
2467 tree->token.opr.idx = cur_nsub;
2468 return tree;
2469 }
2470
2471
2472
2473 static bin_tree_t *
2474 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2475 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2476 {
2477 bin_tree_t *tree = NULL, *old_tree = NULL;
2478 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2479 re_token_t start_token = *token;
2480
2481 if (token->type == OP_OPEN_DUP_NUM)
2482 {
2483 end = 0;
2484 start = fetch_number (regexp, token, syntax);
2485 if (start == -1)
2486 {
2487 if (token->type == CHARACTER && token->opr.c == ',')
2488 start = 0;
2489 else
2490 {
2491 *err = REG_BADBR;
2492 return NULL;
2493 }
2494 }
2495 if (__glibc_likely (start != -2))
2496 {
2497
2498 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2499 : ((token->type == CHARACTER && token->opr.c == ',')
2500 ? fetch_number (regexp, token, syntax) : -2));
2501 }
2502 if (__glibc_unlikely (start == -2 || end == -2))
2503 {
2504
2505 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2506 {
2507 if (token->type == END_OF_RE)
2508 *err = REG_EBRACE;
2509 else
2510 *err = REG_BADBR;
2511
2512 return NULL;
2513 }
2514
2515
2516 re_string_set_index (regexp, start_idx);
2517 *token = start_token;
2518 token->type = CHARACTER;
2519
2520
2521 return elem;
2522 }
2523
2524 if (__glibc_unlikely ((end != -1 && start > end)
2525 || token->type != OP_CLOSE_DUP_NUM))
2526 {
2527
2528 *err = REG_BADBR;
2529 return NULL;
2530 }
2531
2532 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2533 {
2534 *err = REG_ESIZE;
2535 return NULL;
2536 }
2537 }
2538 else
2539 {
2540 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2541 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2542 }
2543
2544 fetch_token (token, regexp, syntax);
2545
2546 if (__glibc_unlikely (elem == NULL))
2547 return NULL;
2548 if (__glibc_unlikely (start == 0 && end == 0))
2549 {
2550 postorder (elem, free_tree, NULL);
2551 return NULL;
2552 }
2553
2554
2555 if (__glibc_unlikely (start > 0))
2556 {
2557 tree = elem;
2558 for (i = 2; i <= start; ++i)
2559 {
2560 elem = duplicate_tree (elem, dfa);
2561 tree = create_tree (dfa, tree, elem, CONCAT);
2562 if (__glibc_unlikely (elem == NULL || tree == NULL))
2563 goto parse_dup_op_espace;
2564 }
2565
2566 if (start == end)
2567 return tree;
2568
2569
2570 elem = duplicate_tree (elem, dfa);
2571 if (__glibc_unlikely (elem == NULL))
2572 goto parse_dup_op_espace;
2573 old_tree = tree;
2574 }
2575 else
2576 old_tree = NULL;
2577
2578 if (elem->token.type == SUBEXP)
2579 {
2580 uintptr_t subidx = elem->token.opr.idx;
2581 postorder (elem, mark_opt_subexp, (void *) subidx);
2582 }
2583
2584 tree = create_tree (dfa, elem, NULL,
2585 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2586 if (__glibc_unlikely (tree == NULL))
2587 goto parse_dup_op_espace;
2588
2589
2590
2591
2592 if (TYPE_SIGNED (Idx) || end != -1)
2593 for (i = start + 2; i <= end; ++i)
2594 {
2595 elem = duplicate_tree (elem, dfa);
2596 tree = create_tree (dfa, tree, elem, CONCAT);
2597 if (__glibc_unlikely (elem == NULL || tree == NULL))
2598 goto parse_dup_op_espace;
2599
2600 tree = create_tree (dfa, tree, NULL, OP_ALT);
2601 if (__glibc_unlikely (tree == NULL))
2602 goto parse_dup_op_espace;
2603 }
2604
2605 if (old_tree)
2606 tree = create_tree (dfa, old_tree, tree, CONCAT);
2607
2608 return tree;
2609
2610 parse_dup_op_espace:
2611 *err = REG_ESPACE;
2612 return NULL;
2613 }
2614
2615
2616
2617 #define BRACKET_NAME_BUF_SIZE 32
2618
2619 #ifndef _LIBC
2620
2621
2622
2623
2624 static wint_t
2625 parse_byte (unsigned char b, re_dfa_t const *dfa)
2626 {
2627 return dfa->mb_cur_max > 1 ? __btowc (b) : b;
2628 }
2629
2630
2631
2632
2633
2634
2635
2636
2637 static reg_errcode_t
2638 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2639 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2640 re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
2641 const unsigned char *collseqmb, const char *collseqwc,
2642 int_fast32_t table_size, const void *symb_table,
2643 const unsigned char *extra)
2644 {
2645
2646 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2647 || start_elem->type == CHAR_CLASS
2648 || end_elem->type == EQUIV_CLASS
2649 || end_elem->type == CHAR_CLASS))
2650 return REG_ERANGE;
2651
2652
2653
2654 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2655 && strlen ((char *) start_elem->opr.name) > 1)
2656 || (end_elem->type == COLL_SYM
2657 && strlen ((char *) end_elem->opr.name) > 1)))
2658 return REG_ECOLLATE;
2659
2660 unsigned int
2661 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2662 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663 : 0)),
2664 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2665 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2666 : 0));
2667 wint_t
2668 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2669 ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
2670 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2671 ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2672
2673 if (start_wc == WEOF || end_wc == WEOF)
2674 return REG_ECOLLATE;
2675 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2676 && start_wc > end_wc))
2677 return REG_ERANGE;
2678
2679
2680
2681
2682
2683
2684 if (dfa->mb_cur_max > 1)
2685 {
2686
2687 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2688 {
2689
2690 wchar_t *new_array_start, *new_array_end;
2691 Idx new_nranges;
2692
2693
2694 new_nranges = 2 * mbcset->nranges + 1;
2695
2696
2697 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2698 new_nranges);
2699 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2700 new_nranges);
2701
2702 if (__glibc_unlikely (new_array_start == NULL
2703 || new_array_end == NULL))
2704 {
2705 re_free (new_array_start);
2706 re_free (new_array_end);
2707 return REG_ESPACE;
2708 }
2709
2710 mbcset->range_starts = new_array_start;
2711 mbcset->range_ends = new_array_end;
2712 *range_alloc = new_nranges;
2713 }
2714
2715 mbcset->range_starts[mbcset->nranges] = start_wc;
2716 mbcset->range_ends[mbcset->nranges++] = end_wc;
2717 }
2718
2719
2720 for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2721 {
2722 if (start_wc <= wc && wc <= end_wc)
2723 bitset_set (sbcset, wc);
2724 }
2725
2726 return REG_NOERROR;
2727 }
2728 #endif
2729
2730 #ifndef _LIBC
2731
2732
2733
2734
2735
2736
2737 static reg_errcode_t
2738 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2739 Idx *coll_sym_alloc, const unsigned char *name,
2740 uint_fast32_t nrules, int_fast32_t table_size,
2741 const void *symb_table, const unsigned char *extra)
2742 {
2743 size_t name_len = strlen ((const char *) name);
2744 if (__glibc_unlikely (name_len != 1))
2745 return REG_ECOLLATE;
2746 else
2747 {
2748 bitset_set (sbcset, name[0]);
2749 return REG_NOERROR;
2750 }
2751 }
2752 #endif
2753
2754 #ifdef _LIBC
2755
2756
2757
2758
2759
2760 static __always_inline int32_t
2761 seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2762 const int32_t *symb_table,
2763 int_fast32_t table_size,
2764 const unsigned char *extra)
2765 {
2766 int_fast32_t elem;
2767
2768 for (elem = 0; elem < table_size; elem++)
2769 if (symb_table[2 * elem] != 0)
2770 {
2771 int32_t idx = symb_table[2 * elem + 1];
2772
2773 idx += 1 + extra[idx];
2774 if (
2775 name_len == extra[idx]
2776
2777 && memcmp (name, &extra[idx + 1], name_len) == 0)
2778
2779 return elem;
2780 }
2781 return -1;
2782 }
2783
2784
2785
2786
2787
2788 static __always_inline unsigned int
2789 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2790 const unsigned char *collseqmb,
2791 const char *collseqwc,
2792 int_fast32_t table_size,
2793 const int32_t *symb_table,
2794 const unsigned char *extra)
2795 {
2796 if (br_elem->type == SB_CHAR)
2797 {
2798
2799 if (nrules == 0)
2800 return collseqmb[br_elem->opr.ch];
2801 else
2802 {
2803 wint_t wc = __btowc (br_elem->opr.ch);
2804 return __collseq_table_lookup (collseqwc, wc);
2805 }
2806 }
2807 else if (br_elem->type == MB_CHAR)
2808 {
2809 if (nrules != 0)
2810 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2811 }
2812 else if (br_elem->type == COLL_SYM)
2813 {
2814 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2815 if (nrules != 0)
2816 {
2817 int32_t elem, idx;
2818 elem = seek_collating_symbol_entry (br_elem->opr.name,
2819 sym_name_len,
2820 symb_table, table_size,
2821 extra);
2822 if (elem != -1)
2823 {
2824
2825 idx = symb_table[2 * elem + 1];
2826
2827 idx += 1 + extra[idx];
2828
2829 idx += 1 + extra[idx];
2830
2831 idx = (idx + 3) & ~3;
2832
2833 idx += sizeof (unsigned int);
2834
2835 idx += sizeof (unsigned int) *
2836 (1 + *(unsigned int *) (extra + idx));
2837
2838 return *(unsigned int *) (extra + idx);
2839 }
2840 else if (sym_name_len == 1)
2841 {
2842
2843
2844 return collseqmb[br_elem->opr.name[0]];
2845 }
2846 }
2847 else if (sym_name_len == 1)
2848 return collseqmb[br_elem->opr.name[0]];
2849 }
2850 return UINT_MAX;
2851 }
2852
2853
2854
2855
2856
2857
2858
2859
2860 static __always_inline reg_errcode_t
2861 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2862 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2863 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2864 const unsigned char *collseqmb, const char *collseqwc,
2865 int_fast32_t table_size, const int32_t *symb_table,
2866 const unsigned char *extra)
2867 {
2868 unsigned int ch;
2869 uint32_t start_collseq;
2870 uint32_t end_collseq;
2871
2872
2873
2874 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2875 || start_elem->type == CHAR_CLASS
2876 || end_elem->type == EQUIV_CLASS
2877 || end_elem->type == CHAR_CLASS))
2878 return REG_ERANGE;
2879
2880
2881 start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2882 table_size, symb_table, extra);
2883 end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2884 table_size, symb_table, extra);
2885
2886 if (__glibc_unlikely (start_collseq == UINT_MAX
2887 || end_collseq == UINT_MAX))
2888 return REG_ECOLLATE;
2889 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2890 && start_collseq > end_collseq))
2891 return REG_ERANGE;
2892
2893
2894
2895
2896
2897 if (nrules > 0 || dfa->mb_cur_max > 1)
2898 {
2899
2900 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2901 {
2902
2903 uint32_t *new_array_start;
2904 uint32_t *new_array_end;
2905 int new_nranges;
2906
2907
2908 new_nranges = 2 * mbcset->nranges + 1;
2909 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2910 new_nranges);
2911 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2912 new_nranges);
2913
2914 if (__glibc_unlikely (new_array_start == NULL
2915 || new_array_end == NULL))
2916 return REG_ESPACE;
2917
2918 mbcset->range_starts = new_array_start;
2919 mbcset->range_ends = new_array_end;
2920 *range_alloc = new_nranges;
2921 }
2922
2923 mbcset->range_starts[mbcset->nranges] = start_collseq;
2924 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2925 }
2926
2927
2928 for (ch = 0; ch < SBC_MAX; ch++)
2929 {
2930 uint32_t ch_collseq;
2931
2932 if (nrules == 0)
2933 ch_collseq = collseqmb[ch];
2934 else
2935 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2936 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2937 bitset_set (sbcset, ch);
2938 }
2939 return REG_NOERROR;
2940 }
2941
2942
2943
2944
2945
2946
2947
2948 static __always_inline reg_errcode_t
2949 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2950 Idx *coll_sym_alloc, const unsigned char *name,
2951 uint_fast32_t nrules, int_fast32_t table_size,
2952 const int32_t *symb_table, const unsigned char *extra)
2953 {
2954 int32_t elem, idx;
2955 size_t name_len = strlen ((const char *) name);
2956 if (nrules != 0)
2957 {
2958 elem = seek_collating_symbol_entry (name, name_len, symb_table,
2959 table_size, extra);
2960 if (elem != -1)
2961 {
2962
2963 idx = symb_table[2 * elem + 1];
2964
2965 idx += 1 + extra[idx];
2966 }
2967 else if (name_len == 1)
2968 {
2969
2970
2971 bitset_set (sbcset, name[0]);
2972 return REG_NOERROR;
2973 }
2974 else
2975 return REG_ECOLLATE;
2976
2977
2978
2979 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
2980 {
2981
2982
2983 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2984
2985
2986 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2987 new_coll_sym_alloc);
2988 if (__glibc_unlikely (new_coll_syms == NULL))
2989 return REG_ESPACE;
2990 mbcset->coll_syms = new_coll_syms;
2991 *coll_sym_alloc = new_coll_sym_alloc;
2992 }
2993 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2994 return REG_NOERROR;
2995 }
2996 else
2997 {
2998 if (__glibc_unlikely (name_len != 1))
2999 return REG_ECOLLATE;
3000 else
3001 {
3002 bitset_set (sbcset, name[0]);
3003 return REG_NOERROR;
3004 }
3005 }
3006 }
3007 #endif
3008
3009
3010
3011
3012 static bin_tree_t *
3013 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3014 reg_syntax_t syntax, reg_errcode_t *err)
3015 {
3016 const unsigned char *collseqmb = NULL;
3017 const char *collseqwc = NULL;
3018 uint_fast32_t nrules = 0;
3019 int_fast32_t table_size = 0;
3020 const void *symb_table = NULL;
3021 const unsigned char *extra = NULL;
3022
3023 re_token_t br_token;
3024 re_bitset_ptr_t sbcset;
3025 re_charset_t *mbcset;
3026 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3027 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3028 bool non_match = false;
3029 bin_tree_t *work_tree;
3030 int token_len;
3031 bool first_round = true;
3032 #ifdef _LIBC
3033 collseqmb = (const unsigned char *)
3034 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3035 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3036 if (nrules)
3037 {
3038
3039
3040
3041 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3042 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3043 symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3044 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3045 _NL_COLLATE_SYMB_EXTRAMB);
3046 }
3047 #endif
3048 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3049 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3050 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3051 {
3052 re_free (sbcset);
3053 re_free (mbcset);
3054 *err = REG_ESPACE;
3055 return NULL;
3056 }
3057
3058 token_len = peek_token_bracket (token, regexp, syntax);
3059 if (__glibc_unlikely (token->type == END_OF_RE))
3060 {
3061 *err = REG_BADPAT;
3062 goto parse_bracket_exp_free_return;
3063 }
3064 if (token->type == OP_NON_MATCH_LIST)
3065 {
3066 mbcset->non_match = 1;
3067 non_match = true;
3068 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3069 bitset_set (sbcset, '\n');
3070 re_string_skip_bytes (regexp, token_len);
3071 token_len = peek_token_bracket (token, regexp, syntax);
3072 if (__glibc_unlikely (token->type == END_OF_RE))
3073 {
3074 *err = REG_BADPAT;
3075 goto parse_bracket_exp_free_return;
3076 }
3077 }
3078
3079
3080 if (token->type == OP_CLOSE_BRACKET)
3081 token->type = CHARACTER;
3082
3083 while (1)
3084 {
3085 bracket_elem_t start_elem, end_elem;
3086 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3087 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3088 reg_errcode_t ret;
3089 int token_len2 = 0;
3090 bool is_range_exp = false;
3091 re_token_t token2;
3092
3093 start_elem.opr.name = start_name_buf;
3094 start_elem.type = COLL_SYM;
3095 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3096 syntax, first_round);
3097 if (__glibc_unlikely (ret != REG_NOERROR))
3098 {
3099 *err = ret;
3100 goto parse_bracket_exp_free_return;
3101 }
3102 first_round = false;
3103
3104
3105 token_len = peek_token_bracket (token, regexp, syntax);
3106
3107
3108 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3109 {
3110 if (__glibc_unlikely (token->type == END_OF_RE))
3111 {
3112 *err = REG_EBRACK;
3113 goto parse_bracket_exp_free_return;
3114 }
3115 if (token->type == OP_CHARSET_RANGE)
3116 {
3117 re_string_skip_bytes (regexp, token_len);
3118 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3119 if (__glibc_unlikely (token2.type == END_OF_RE))
3120 {
3121 *err = REG_EBRACK;
3122 goto parse_bracket_exp_free_return;
3123 }
3124 if (token2.type == OP_CLOSE_BRACKET)
3125 {
3126
3127 re_string_skip_bytes (regexp, -token_len);
3128 token->type = CHARACTER;
3129 }
3130 else
3131 is_range_exp = true;
3132 }
3133 }
3134
3135 if (is_range_exp == true)
3136 {
3137 end_elem.opr.name = end_name_buf;
3138 end_elem.type = COLL_SYM;
3139 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3140 dfa, syntax, true);
3141 if (__glibc_unlikely (ret != REG_NOERROR))
3142 {
3143 *err = ret;
3144 goto parse_bracket_exp_free_return;
3145 }
3146
3147 token_len = peek_token_bracket (token, regexp, syntax);
3148
3149 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3150 &start_elem, &end_elem,
3151 dfa, syntax, nrules, collseqmb, collseqwc,
3152 table_size, symb_table, extra);
3153 if (__glibc_unlikely (*err != REG_NOERROR))
3154 goto parse_bracket_exp_free_return;
3155 }
3156 else
3157 {
3158 switch (start_elem.type)
3159 {
3160 case SB_CHAR:
3161 bitset_set (sbcset, start_elem.opr.ch);
3162 break;
3163 case MB_CHAR:
3164
3165 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3166 {
3167 wchar_t *new_mbchars;
3168
3169
3170 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3171
3172 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3173 mbchar_alloc);
3174 if (__glibc_unlikely (new_mbchars == NULL))
3175 goto parse_bracket_exp_espace;
3176 mbcset->mbchars = new_mbchars;
3177 }
3178 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3179 break;
3180 case EQUIV_CLASS:
3181 *err = build_equiv_class (sbcset,
3182 mbcset, &equiv_class_alloc,
3183 start_elem.opr.name);
3184 if (__glibc_unlikely (*err != REG_NOERROR))
3185 goto parse_bracket_exp_free_return;
3186 break;
3187 case COLL_SYM:
3188 *err = build_collating_symbol (sbcset,
3189 mbcset, &coll_sym_alloc,
3190 start_elem.opr.name,
3191 nrules, table_size, symb_table, extra);
3192 if (__glibc_unlikely (*err != REG_NOERROR))
3193 goto parse_bracket_exp_free_return;
3194 break;
3195 case CHAR_CLASS:
3196 *err = build_charclass (regexp->trans, sbcset,
3197 mbcset, &char_class_alloc,
3198 (const char *) start_elem.opr.name,
3199 syntax);
3200 if (__glibc_unlikely (*err != REG_NOERROR))
3201 goto parse_bracket_exp_free_return;
3202 break;
3203 default:
3204 DEBUG_ASSERT (false);
3205 break;
3206 }
3207 }
3208 if (__glibc_unlikely (token->type == END_OF_RE))
3209 {
3210 *err = REG_EBRACK;
3211 goto parse_bracket_exp_free_return;
3212 }
3213 if (token->type == OP_CLOSE_BRACKET)
3214 break;
3215 }
3216
3217 re_string_skip_bytes (regexp, token_len);
3218
3219
3220 if (non_match)
3221 bitset_not (sbcset);
3222
3223
3224 if (dfa->mb_cur_max > 1)
3225 bitset_mask (sbcset, dfa->sb_char);
3226
3227 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3228 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3229 || mbcset->non_match)))
3230 {
3231 bin_tree_t *mbc_tree;
3232 int sbc_idx;
3233
3234 dfa->has_mb_node = 1;
3235 br_token.type = COMPLEX_BRACKET;
3236 br_token.opr.mbcset = mbcset;
3237 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3238 if (__glibc_unlikely (mbc_tree == NULL))
3239 goto parse_bracket_exp_espace;
3240 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3241 if (sbcset[sbc_idx])
3242 break;
3243
3244
3245 if (sbc_idx < BITSET_WORDS)
3246 {
3247
3248 br_token.type = SIMPLE_BRACKET;
3249 br_token.opr.sbcset = sbcset;
3250 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3251 if (__glibc_unlikely (work_tree == NULL))
3252 goto parse_bracket_exp_espace;
3253
3254
3255 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3256 if (__glibc_unlikely (work_tree == NULL))
3257 goto parse_bracket_exp_espace;
3258 }
3259 else
3260 {
3261 re_free (sbcset);
3262 work_tree = mbc_tree;
3263 }
3264 }
3265 else
3266 {
3267 free_charset (mbcset);
3268
3269 br_token.type = SIMPLE_BRACKET;
3270 br_token.opr.sbcset = sbcset;
3271 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272 if (__glibc_unlikely (work_tree == NULL))
3273 goto parse_bracket_exp_espace;
3274 }
3275 return work_tree;
3276
3277 parse_bracket_exp_espace:
3278 *err = REG_ESPACE;
3279 parse_bracket_exp_free_return:
3280 re_free (sbcset);
3281 free_charset (mbcset);
3282 return NULL;
3283 }
3284
3285
3286
3287 static reg_errcode_t
3288 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3289 re_token_t *token, int token_len, re_dfa_t *dfa,
3290 reg_syntax_t syntax, bool accept_hyphen)
3291 {
3292 int cur_char_size;
3293 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3294 if (cur_char_size > 1)
3295 {
3296 elem->type = MB_CHAR;
3297 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3298 re_string_skip_bytes (regexp, cur_char_size);
3299 return REG_NOERROR;
3300 }
3301 re_string_skip_bytes (regexp, token_len);
3302 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3303 || token->type == OP_OPEN_EQUIV_CLASS)
3304 return parse_bracket_symbol (elem, regexp, token);
3305 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3306 {
3307
3308
3309 re_token_t token2;
3310 (void) peek_token_bracket (&token2, regexp, syntax);
3311 if (token2.type != OP_CLOSE_BRACKET)
3312
3313
3314 return REG_ERANGE;
3315 }
3316 elem->type = SB_CHAR;
3317 elem->opr.ch = token->opr.c;
3318 return REG_NOERROR;
3319 }
3320
3321
3322
3323
3324
3325 static reg_errcode_t
3326 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3327 re_token_t *token)
3328 {
3329 unsigned char ch, delim = token->opr.c;
3330 int i = 0;
3331 if (re_string_eoi(regexp))
3332 return REG_EBRACK;
3333 for (;; ++i)
3334 {
3335 if (i >= BRACKET_NAME_BUF_SIZE)
3336 return REG_EBRACK;
3337 if (token->type == OP_OPEN_CHAR_CLASS)
3338 ch = re_string_fetch_byte_case (regexp);
3339 else
3340 ch = re_string_fetch_byte (regexp);
3341 if (re_string_eoi(regexp))
3342 return REG_EBRACK;
3343 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3344 break;
3345 elem->opr.name[i] = ch;
3346 }
3347 re_string_skip_bytes (regexp, 1);
3348 elem->opr.name[i] = '\0';
3349 switch (token->type)
3350 {
3351 case OP_OPEN_COLL_ELEM:
3352 elem->type = COLL_SYM;
3353 break;
3354 case OP_OPEN_EQUIV_CLASS:
3355 elem->type = EQUIV_CLASS;
3356 break;
3357 case OP_OPEN_CHAR_CLASS:
3358 elem->type = CHAR_CLASS;
3359 break;
3360 default:
3361 break;
3362 }
3363 return REG_NOERROR;
3364 }
3365
3366
3367
3368
3369
3370
3371
3372 static reg_errcode_t
3373 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3374 Idx *equiv_class_alloc, const unsigned char *name)
3375 {
3376 #ifdef _LIBC
3377 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3378 if (nrules != 0)
3379 {
3380 const int32_t *table, *indirect;
3381 const unsigned char *weights, *extra, *cp;
3382 unsigned char char_buf[2];
3383 int32_t idx1, idx2;
3384 unsigned int ch;
3385 size_t len;
3386
3387 cp = name;
3388 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3389 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3390 _NL_COLLATE_WEIGHTMB);
3391 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3392 _NL_COLLATE_EXTRAMB);
3393 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3394 _NL_COLLATE_INDIRECTMB);
3395 idx1 = findidx (table, indirect, extra, &cp, -1);
3396 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3397
3398 return REG_ECOLLATE;
3399
3400
3401 len = weights[idx1 & 0xffffff];
3402 for (ch = 0; ch < SBC_MAX; ++ch)
3403 {
3404 char_buf[0] = ch;
3405 cp = char_buf;
3406 idx2 = findidx (table, indirect, extra, &cp, 1);
3407
3408
3409
3410 if (idx2 == 0)
3411
3412 continue;
3413
3414
3415 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3416 && memcmp (weights + (idx1 & 0xffffff) + 1,
3417 weights + (idx2 & 0xffffff) + 1, len) == 0)
3418 bitset_set (sbcset, ch);
3419 }
3420
3421 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3422 {
3423
3424
3425 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3426
3427 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3428 int32_t,
3429 new_equiv_class_alloc);
3430 if (__glibc_unlikely (new_equiv_classes == NULL))
3431 return REG_ESPACE;
3432 mbcset->equiv_classes = new_equiv_classes;
3433 *equiv_class_alloc = new_equiv_class_alloc;
3434 }
3435 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3436 }
3437 else
3438 #endif
3439 {
3440 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3441 return REG_ECOLLATE;
3442 bitset_set (sbcset, *name);
3443 }
3444 return REG_NOERROR;
3445 }
3446
3447
3448
3449
3450
3451
3452
3453 static reg_errcode_t
3454 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3455 re_charset_t *mbcset, Idx *char_class_alloc,
3456 const char *class_name, reg_syntax_t syntax)
3457 {
3458 int i;
3459 const char *name = class_name;
3460
3461
3462
3463 if ((syntax & RE_ICASE)
3464 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3465 name = "alpha";
3466
3467
3468 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3469 {
3470
3471
3472 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3473
3474 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3475 new_char_class_alloc);
3476 if (__glibc_unlikely (new_char_classes == NULL))
3477 return REG_ESPACE;
3478 mbcset->char_classes = new_char_classes;
3479 *char_class_alloc = new_char_class_alloc;
3480 }
3481 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3482
3483 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3484 do { \
3485 if (__glibc_unlikely (trans != NULL)) \
3486 { \
3487 for (i = 0; i < SBC_MAX; ++i) \
3488 if (ctype_func (i)) \
3489 bitset_set (sbcset, trans[i]); \
3490 } \
3491 else \
3492 { \
3493 for (i = 0; i < SBC_MAX; ++i) \
3494 if (ctype_func (i)) \
3495 bitset_set (sbcset, i); \
3496 } \
3497 } while (0)
3498
3499 if (strcmp (name, "alnum") == 0)
3500 BUILD_CHARCLASS_LOOP (isalnum);
3501 else if (strcmp (name, "cntrl") == 0)
3502 BUILD_CHARCLASS_LOOP (iscntrl);
3503 else if (strcmp (name, "lower") == 0)
3504 BUILD_CHARCLASS_LOOP (islower);
3505 else if (strcmp (name, "space") == 0)
3506 BUILD_CHARCLASS_LOOP (isspace);
3507 else if (strcmp (name, "alpha") == 0)
3508 BUILD_CHARCLASS_LOOP (isalpha);
3509 else if (strcmp (name, "digit") == 0)
3510 BUILD_CHARCLASS_LOOP (isdigit);
3511 else if (strcmp (name, "print") == 0)
3512 BUILD_CHARCLASS_LOOP (isprint);
3513 else if (strcmp (name, "upper") == 0)
3514 BUILD_CHARCLASS_LOOP (isupper);
3515 else if (strcmp (name, "blank") == 0)
3516 BUILD_CHARCLASS_LOOP (isblank);
3517 else if (strcmp (name, "graph") == 0)
3518 BUILD_CHARCLASS_LOOP (isgraph);
3519 else if (strcmp (name, "punct") == 0)
3520 BUILD_CHARCLASS_LOOP (ispunct);
3521 else if (strcmp (name, "xdigit") == 0)
3522 BUILD_CHARCLASS_LOOP (isxdigit);
3523 else
3524 return REG_ECTYPE;
3525
3526 return REG_NOERROR;
3527 }
3528
3529 static bin_tree_t *
3530 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3531 const char *class_name,
3532 const char *extra, bool non_match,
3533 reg_errcode_t *err)
3534 {
3535 re_bitset_ptr_t sbcset;
3536 re_charset_t *mbcset;
3537 Idx alloc = 0;
3538 reg_errcode_t ret;
3539 bin_tree_t *tree;
3540
3541 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3542 if (__glibc_unlikely (sbcset == NULL))
3543 {
3544 *err = REG_ESPACE;
3545 return NULL;
3546 }
3547 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3548 if (__glibc_unlikely (mbcset == NULL))
3549 {
3550 re_free (sbcset);
3551 *err = REG_ESPACE;
3552 return NULL;
3553 }
3554 mbcset->non_match = non_match;
3555
3556
3557 ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3558
3559 if (__glibc_unlikely (ret != REG_NOERROR))
3560 {
3561 re_free (sbcset);
3562 free_charset (mbcset);
3563 *err = ret;
3564 return NULL;
3565 }
3566
3567 for (; *extra; extra++)
3568 bitset_set (sbcset, *extra);
3569
3570
3571 if (non_match)
3572 bitset_not (sbcset);
3573
3574
3575 if (dfa->mb_cur_max > 1)
3576 bitset_mask (sbcset, dfa->sb_char);
3577
3578
3579 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3580 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3581 if (__glibc_unlikely (tree == NULL))
3582 goto build_word_op_espace;
3583
3584 if (dfa->mb_cur_max > 1)
3585 {
3586 bin_tree_t *mbc_tree;
3587
3588 br_token.type = COMPLEX_BRACKET;
3589 br_token.opr.mbcset = mbcset;
3590 dfa->has_mb_node = 1;
3591 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3592 if (__glibc_unlikely (mbc_tree == NULL))
3593 goto build_word_op_espace;
3594
3595 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3596 if (__glibc_likely (mbc_tree != NULL))
3597 return tree;
3598 }
3599 else
3600 {
3601 free_charset (mbcset);
3602 return tree;
3603 }
3604
3605 build_word_op_espace:
3606 re_free (sbcset);
3607 free_charset (mbcset);
3608 *err = REG_ESPACE;
3609 return NULL;
3610 }
3611
3612
3613
3614
3615
3616
3617
3618 static Idx
3619 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3620 {
3621 Idx num = -1;
3622 unsigned char c;
3623 while (1)
3624 {
3625 fetch_token (token, input, syntax);
3626 c = token->opr.c;
3627 if (__glibc_unlikely (token->type == END_OF_RE))
3628 return -2;
3629 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3630 break;
3631 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3632 ? -2
3633 : num == -1
3634 ? c - '0'
3635 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3636 }
3637 return num;
3638 }
3639
3640 static void
3641 free_charset (re_charset_t *cset)
3642 {
3643 re_free (cset->mbchars);
3644 #ifdef _LIBC
3645 re_free (cset->coll_syms);
3646 re_free (cset->equiv_classes);
3647 #endif
3648 re_free (cset->range_starts);
3649 re_free (cset->range_ends);
3650 re_free (cset->char_classes);
3651 re_free (cset);
3652 }
3653
3654
3655
3656
3657
3658 static bin_tree_t *
3659 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3660 re_token_type_t type)
3661 {
3662 re_token_t t = { .type = type };
3663 return create_token_tree (dfa, left, right, &t);
3664 }
3665
3666 static bin_tree_t *
3667 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3668 const re_token_t *token)
3669 {
3670 bin_tree_t *tree;
3671 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3672 {
3673 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3674
3675 if (storage == NULL)
3676 return NULL;
3677 storage->next = dfa->str_tree_storage;
3678 dfa->str_tree_storage = storage;
3679 dfa->str_tree_storage_idx = 0;
3680 }
3681 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3682
3683 tree->parent = NULL;
3684 tree->left = left;
3685 tree->right = right;
3686 tree->token = *token;
3687 tree->token.duplicated = 0;
3688 tree->token.opt_subexp = 0;
3689 tree->first = NULL;
3690 tree->next = NULL;
3691 tree->node_idx = -1;
3692
3693 if (left != NULL)
3694 left->parent = tree;
3695 if (right != NULL)
3696 right->parent = tree;
3697 return tree;
3698 }
3699
3700
3701
3702
3703 static reg_errcode_t
3704 mark_opt_subexp (void *extra, bin_tree_t *node)
3705 {
3706 Idx idx = (uintptr_t) extra;
3707 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3708 node->token.opt_subexp = 1;
3709
3710 return REG_NOERROR;
3711 }
3712
3713
3714
3715 static void
3716 free_token (re_token_t *node)
3717 {
3718 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3719 free_charset (node->opr.mbcset);
3720 else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3721 re_free (node->opr.sbcset);
3722 }
3723
3724
3725
3726
3727 static reg_errcode_t
3728 free_tree (void *extra, bin_tree_t *node)
3729 {
3730 free_token (&node->token);
3731 return REG_NOERROR;
3732 }
3733
3734
3735
3736
3737
3738
3739
3740 static bin_tree_t *
3741 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3742 {
3743 const bin_tree_t *node;
3744 bin_tree_t *dup_root;
3745 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3746
3747 for (node = root; ; )
3748 {
3749
3750 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3751 if (*p_new == NULL)
3752 return NULL;
3753 (*p_new)->parent = dup_node;
3754 (*p_new)->token.duplicated = 1;
3755 dup_node = *p_new;
3756
3757
3758 if (node->left)
3759 {
3760 node = node->left;
3761 p_new = &dup_node->left;
3762 }
3763 else
3764 {
3765 const bin_tree_t *prev = NULL;
3766 while (node->right == prev || node->right == NULL)
3767 {
3768 prev = node;
3769 node = node->parent;
3770 dup_node = dup_node->parent;
3771 if (!node)
3772 return dup_root;
3773 }
3774 node = node->right;
3775 p_new = &dup_node->right;
3776 }
3777 }
3778 }