This source file includes following definitions.
- re_compile_pattern
- weak_alias
- weak_alias
- weak_alias
- re_compile_fastmap_iter
- regcomp
- libc_hidden_def
- free_dfa_content
- regfree
- libc_hidden_def
- libc_freeres_fn
- re_compile_internal
- init_dfa
- init_word_char
- free_workarea_compile
- create_initial_state
- optimize_utf8
- analyze
- postorder
- preorder
- optimize_subexps
- lower_subexps
- lower_subexp
- calc_first
- calc_next
- link_nfa_nodes
- duplicate_node_closure
- search_duplicated_node
- duplicate_node
- calc_inveclosure
- calc_eclosure
- calc_eclosure_iter
- fetch_token
- peek_token
- peek_token_bracket
- parse
- parse_reg_exp
- parse_branch
- parse_expression
- parse_sub_exp
- parse_dup_op
- parse_byte
- build_range_exp
- build_collating_symbol
- parse_bracket_exp
- parse_bracket_element
- parse_bracket_symbol
- build_equiv_class
- build_charclass
- build_charclass_op
- fetch_number
- free_charset
- create_tree
- create_token_tree
- mark_opt_subexp
- free_token
- free_tree
- duplicate_tree
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126
127
128
129
130
131
132 static const char __re_error_msgid[] =
133 {
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success")
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match")
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression")
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character")
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name")
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash")
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference")
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=")
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(")
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{")
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}")
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end")
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted")
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression")
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression")
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big")
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)")
184 };
185
186 static const size_t __re_error_msgid_idx[] =
187 {
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
205 };
206
207
208
209
210
211
212
213
214
215
216 const char *
217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
219 {
220 reg_errcode_t ret;
221
222
223
224
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227
228 bufp->newline_anchor = 1;
229
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
236 weak_alias (__re_compile_pattern, re_compile_pattern)
237
238
239
240
241
242
243 reg_syntax_t re_syntax_options;
244
245
246
247
248
249
250
251
252
253 reg_syntax_t
254 re_set_syntax (reg_syntax_t syntax)
255 {
256 reg_syntax_t ret = re_syntax_options;
257
258 re_syntax_options = syntax;
259 return ret;
260 }
261 weak_alias (__re_set_syntax, re_set_syntax)
262
263 int
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
265 {
266 re_dfa_t *dfa = bufp->buffer;
267 char *fastmap = bufp->fastmap;
268
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->fastmap_accurate = 1;
278 return 0;
279 }
280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
281
282 static inline void
283 __attribute__ ((always_inline))
284 re_set_fastmap (char *fastmap, bool icase, int ch)
285 {
286 fastmap[ch] = 1;
287 if (icase)
288 fastmap[tolower (ch)] = 1;
289 }
290
291
292
293
294 static void
295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
296 char *fastmap)
297 {
298 re_dfa_t *dfa = bufp->buffer;
299 Idx node_cnt;
300 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
301 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
302 {
303 Idx node = init_state->nodes.elems[node_cnt];
304 re_token_type_t type = dfa->nodes[node].type;
305
306 if (type == CHARACTER)
307 {
308 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309 #ifdef RE_ENABLE_I18N
310 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
311 {
312 unsigned char buf[MB_LEN_MAX];
313 unsigned char *p;
314 wchar_t wc;
315 mbstate_t state;
316
317 p = buf;
318 *p++ = dfa->nodes[node].opr.c;
319 while (++node < dfa->nodes_len
320 && dfa->nodes[node].type == CHARACTER
321 && dfa->nodes[node].mb_partial)
322 *p++ = dfa->nodes[node].opr.c;
323 memset (&state, '\0', sizeof (state));
324 if (__mbrtowc (&wc, (const char *) buf, p - buf,
325 &state) == p - buf
326 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
327 != (size_t) -1))
328 re_set_fastmap (fastmap, false, buf[0]);
329 }
330 #endif
331 }
332 else if (type == SIMPLE_BRACKET)
333 {
334 int i, ch;
335 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
336 {
337 int j;
338 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
340 if (w & ((bitset_word_t) 1 << j))
341 re_set_fastmap (fastmap, icase, ch);
342 }
343 }
344 #ifdef RE_ENABLE_I18N
345 else if (type == COMPLEX_BRACKET)
346 {
347 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348 Idx i;
349
350 # ifdef _LIBC
351
352
353
354
355
356
357 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
358 && (cset->ncoll_syms || cset->nranges))
359 {
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0; i < SBC_MAX; ++i)
363 if (table[i] < 0)
364 re_set_fastmap (fastmap, icase, i);
365 }
366 # endif
367
368
369
370
371
372 if (dfa->mb_cur_max > 1
373 && (cset->nchar_classes || cset->non_match || cset->nranges
374 # ifdef _LIBC
375 || cset->nequiv_classes
376 # endif
377 ))
378 {
379 unsigned char c = 0;
380 do
381 {
382 mbstate_t mbs;
383 memset (&mbs, 0, sizeof (mbs));
384 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
385 re_set_fastmap (fastmap, false, (int) c);
386 }
387 while (++c != 0);
388 }
389
390 else
391 {
392
393 for (i = 0; i < cset->nmbchars; ++i)
394 {
395 char buf[256];
396 mbstate_t state;
397 memset (&state, '\0', sizeof (state));
398 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
399 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
400 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
401 {
402 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403 != (size_t) -1)
404 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
405 }
406 }
407 }
408 }
409 #endif
410 else if (type == OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type == OP_UTF8_PERIOD
413 #endif
414 || type == END_OF_RE)
415 {
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
419 return;
420 }
421 }
422 }
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460 int
461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
462 {
463 reg_errcode_t ret;
464 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465 : RE_SYNTAX_POSIX_BASIC);
466
467 preg->buffer = NULL;
468 preg->allocated = 0;
469 preg->used = 0;
470
471
472 preg->fastmap = re_malloc (char, SBC_MAX);
473 if (__glibc_unlikely (preg->fastmap == NULL))
474 return REG_ESPACE;
475
476 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
477
478
479 if (cflags & REG_NEWLINE)
480 {
481 syntax &= ~RE_DOT_NEWLINE;
482 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
483
484 preg->newline_anchor = 1;
485 }
486 else
487 preg->newline_anchor = 0;
488 preg->no_sub = !!(cflags & REG_NOSUB);
489 preg->translate = NULL;
490
491 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
492
493
494
495 if (ret == REG_ERPAREN)
496 ret = REG_EPAREN;
497
498
499 if (__glibc_likely (ret == REG_NOERROR))
500
501
502 (void) re_compile_fastmap (preg);
503 else
504 {
505
506 re_free (preg->fastmap);
507 preg->fastmap = NULL;
508 }
509
510 return (int) ret;
511 }
512 libc_hidden_def (__regcomp)
513 weak_alias (__regcomp, regcomp)
514
515
516
517
518 size_t
519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520 size_t errbuf_size)
521 {
522 const char *msg;
523 size_t msg_size;
524 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
525
526 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
527
528
529
530
531 abort ();
532
533 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
534
535 msg_size = strlen (msg) + 1;
536
537 if (__glibc_likely (errbuf_size != 0))
538 {
539 size_t cpy_size = msg_size;
540 if (__glibc_unlikely (msg_size > errbuf_size))
541 {
542 cpy_size = errbuf_size - 1;
543 errbuf[cpy_size] = '\0';
544 }
545 memcpy (errbuf, msg, cpy_size);
546 }
547
548 return msg_size;
549 }
550 weak_alias (__regerror, regerror)
551
552
553 #ifdef RE_ENABLE_I18N
554
555
556
557
558 static const bitset_t utf8_sb_map =
559 {
560
561 # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
562 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563 # else
564 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
565 # error "bitset_word_t is narrower than 32 bits"
566 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569 BITSET_WORD_MAX, BITSET_WORD_MAX,
570 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571 BITSET_WORD_MAX,
572 # endif
573 (BITSET_WORD_MAX
574 >> (SBC_MAX % BITSET_WORD_BITS == 0
575 ? 0
576 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577 # endif
578 };
579 #endif
580
581
582 static void
583 free_dfa_content (re_dfa_t *dfa)
584 {
585 Idx i, j;
586
587 if (dfa->nodes)
588 for (i = 0; i < dfa->nodes_len; ++i)
589 free_token (dfa->nodes + i);
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
592 {
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
599 }
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
604
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
607 {
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
610 {
611 re_dfastate_t *state = entry->array[j];
612 free_state (state);
613 }
614 re_free (entry->array);
615 }
616 re_free (dfa->state_table);
617 #ifdef RE_ENABLE_I18N
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
620 #endif
621 re_free (dfa->subexp_map);
622 #ifdef DEBUG
623 re_free (dfa->re_str);
624 #endif
625
626 re_free (dfa);
627 }
628
629
630
631
632 void
633 regfree (regex_t *preg)
634 {
635 re_dfa_t *dfa = preg->buffer;
636 if (__glibc_likely (dfa != NULL))
637 {
638 lock_fini (dfa->lock);
639 free_dfa_content (dfa);
640 }
641 preg->buffer = NULL;
642 preg->allocated = 0;
643
644 re_free (preg->fastmap);
645 preg->fastmap = NULL;
646
647 re_free (preg->translate);
648 preg->translate = NULL;
649 }
650 libc_hidden_def (__regfree)
651 weak_alias (__regfree, regfree)
652
653
654
655
656 #if defined _REGEX_RE_COMP || defined _LIBC
657
658
659 static struct re_pattern_buffer re_comp_buf;
660
661 char *
662 # ifdef _LIBC
663
664
665
666 weak_function
667 # endif
668 re_comp (const char *s)
669 {
670 reg_errcode_t ret;
671 char *fastmap;
672
673 if (!s)
674 {
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
677 return 0;
678 }
679
680 if (re_comp_buf.buffer)
681 {
682 fastmap = re_comp_buf.fastmap;
683 re_comp_buf.fastmap = NULL;
684 __regfree (&re_comp_buf);
685 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
686 re_comp_buf.fastmap = fastmap;
687 }
688
689 if (re_comp_buf.fastmap == NULL)
690 {
691 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692 if (re_comp_buf.fastmap == NULL)
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx[(int) REG_ESPACE]);
695 }
696
697
698
699
700
701 re_comp_buf.newline_anchor = 1;
702
703 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704
705 if (!ret)
706 return NULL;
707
708
709 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
710 }
711
712 #ifdef _LIBC
713 libc_freeres_fn (free_mem)
714 {
715 __regfree (&re_comp_buf);
716 }
717 #endif
718
719 #endif
720
721
722
723
724
725 static reg_errcode_t
726 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
727 reg_syntax_t syntax)
728 {
729 reg_errcode_t err = REG_NOERROR;
730 re_dfa_t *dfa;
731 re_string_t regexp;
732
733
734 preg->fastmap_accurate = 0;
735 preg->syntax = syntax;
736 preg->not_bol = preg->not_eol = 0;
737 preg->used = 0;
738 preg->re_nsub = 0;
739 preg->can_be_null = 0;
740 preg->regs_allocated = REGS_UNALLOCATED;
741
742
743 dfa = preg->buffer;
744 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
745 {
746
747
748
749
750 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
751 if (dfa == NULL)
752 return REG_ESPACE;
753 preg->allocated = sizeof (re_dfa_t);
754 preg->buffer = dfa;
755 }
756 preg->used = sizeof (re_dfa_t);
757
758 err = init_dfa (dfa, length);
759 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
760 err = REG_ESPACE;
761 if (__glibc_unlikely (err != REG_NOERROR))
762 {
763 free_dfa_content (dfa);
764 preg->buffer = NULL;
765 preg->allocated = 0;
766 return err;
767 }
768 #ifdef DEBUG
769
770 dfa->re_str = re_malloc (char, length + 1);
771 strncpy (dfa->re_str, pattern, length + 1);
772 #endif
773
774 err = re_string_construct (®exp, pattern, length, preg->translate,
775 (syntax & RE_ICASE) != 0, dfa);
776 if (__glibc_unlikely (err != REG_NOERROR))
777 {
778 re_compile_internal_free_return:
779 free_workarea_compile (preg);
780 re_string_destruct (®exp);
781 lock_fini (dfa->lock);
782 free_dfa_content (dfa);
783 preg->buffer = NULL;
784 preg->allocated = 0;
785 return err;
786 }
787
788
789 preg->re_nsub = 0;
790 dfa->str_tree = parse (®exp, preg, syntax, &err);
791 if (__glibc_unlikely (dfa->str_tree == NULL))
792 goto re_compile_internal_free_return;
793
794
795 err = analyze (preg);
796 if (__glibc_unlikely (err != REG_NOERROR))
797 goto re_compile_internal_free_return;
798
799 #ifdef RE_ENABLE_I18N
800
801 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
802 optimize_utf8 (dfa);
803 #endif
804
805
806 err = create_initial_state (dfa);
807
808
809 free_workarea_compile (preg);
810 re_string_destruct (®exp);
811
812 if (__glibc_unlikely (err != REG_NOERROR))
813 {
814 lock_fini (dfa->lock);
815 free_dfa_content (dfa);
816 preg->buffer = NULL;
817 preg->allocated = 0;
818 }
819
820 return err;
821 }
822
823
824
825
826 static reg_errcode_t
827 init_dfa (re_dfa_t *dfa, size_t pat_len)
828 {
829 __re_size_t table_size;
830 #ifndef _LIBC
831 const char *codeset_name;
832 #endif
833 #ifdef RE_ENABLE_I18N
834 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835 #else
836 size_t max_i18n_object_size = 0;
837 #endif
838 size_t max_object_size =
839 MAX (sizeof (struct re_state_table_entry),
840 MAX (sizeof (re_token_t),
841 MAX (sizeof (re_node_set),
842 MAX (sizeof (regmatch_t),
843 max_i18n_object_size))));
844
845 memset (dfa, '\0', sizeof (re_dfa_t));
846
847
848 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
849
850
851
852
853
854 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
855 <= pat_len))
856 return REG_ESPACE;
857
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860
861
862 for (table_size = 1; ; table_size <<= 1)
863 if (table_size > pat_len)
864 break;
865
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
868
869 dfa->mb_cur_max = MB_CUR_MAX;
870 #ifdef _LIBC
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873 dfa->is_utf8 = 1;
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 != 0);
876 #else
877 codeset_name = nl_langinfo (CODESET);
878 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
879 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
880 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
881 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
882 dfa->is_utf8 = 1;
883
884
885
886 dfa->map_notascii = 0;
887 #endif
888
889 #ifdef RE_ENABLE_I18N
890 if (dfa->mb_cur_max > 1)
891 {
892 if (dfa->is_utf8)
893 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
894 else
895 {
896 int i, j, ch;
897
898 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
899 if (__glibc_unlikely (dfa->sb_char == NULL))
900 return REG_ESPACE;
901
902
903 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
904 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
905 {
906 wint_t wch = __btowc (ch);
907 if (wch != WEOF)
908 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
909 # ifndef _LIBC
910 if (isascii (ch) && wch != ch)
911 dfa->map_notascii = 1;
912 # endif
913 }
914 }
915 }
916 #endif
917
918 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
919 return REG_ESPACE;
920 return REG_NOERROR;
921 }
922
923
924
925
926
927 static void
928 init_word_char (re_dfa_t *dfa)
929 {
930 int i = 0;
931 int j;
932 int ch = 0;
933 dfa->word_ops_used = 1;
934 if (__glibc_likely (dfa->map_notascii == 0))
935 {
936
937
938 bitset_word_t bits0 = 0x00000000;
939 bitset_word_t bits1 = 0x03ff0000;
940 bitset_word_t bits2 = 0x87fffffe;
941 bitset_word_t bits3 = 0x07fffffe;
942 if (BITSET_WORD_BITS == 64)
943 {
944
945 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
946 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
947 i = 2;
948 }
949 else if (BITSET_WORD_BITS == 32)
950 {
951 dfa->word_char[0] = bits0;
952 dfa->word_char[1] = bits1;
953 dfa->word_char[2] = bits2;
954 dfa->word_char[3] = bits3;
955 i = 4;
956 }
957 else
958 goto general_case;
959 ch = 128;
960
961 if (__glibc_likely (dfa->is_utf8))
962 {
963 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
964 return;
965 }
966 }
967
968 general_case:
969 for (; i < BITSET_WORDS; ++i)
970 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
971 if (isalnum (ch) || ch == '_')
972 dfa->word_char[i] |= (bitset_word_t) 1 << j;
973 }
974
975
976
977 static void
978 free_workarea_compile (regex_t *preg)
979 {
980 re_dfa_t *dfa = preg->buffer;
981 bin_tree_storage_t *storage, *next;
982 for (storage = dfa->str_tree_storage; storage; storage = next)
983 {
984 next = storage->next;
985 re_free (storage);
986 }
987 dfa->str_tree_storage = NULL;
988 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989 dfa->str_tree = NULL;
990 re_free (dfa->org_indices);
991 dfa->org_indices = NULL;
992 }
993
994
995
996 static reg_errcode_t
997 create_initial_state (re_dfa_t *dfa)
998 {
999 Idx first, i;
1000 reg_errcode_t err;
1001 re_node_set init_nodes;
1002
1003
1004
1005 first = dfa->str_tree->first->node_idx;
1006 dfa->init_node = first;
1007 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008 if (__glibc_unlikely (err != REG_NOERROR))
1009 return err;
1010
1011
1012
1013
1014
1015 if (dfa->nbackref > 0)
1016 for (i = 0; i < init_nodes.nelem; ++i)
1017 {
1018 Idx node_idx = init_nodes.elems[i];
1019 re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021 Idx clexp_idx;
1022 if (type != OP_BACK_REF)
1023 continue;
1024 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025 {
1026 re_token_t *clexp_node;
1027 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028 if (clexp_node->type == OP_CLOSE_SUBEXP
1029 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030 break;
1031 }
1032 if (clexp_idx == init_nodes.nelem)
1033 continue;
1034
1035 if (type == OP_BACK_REF)
1036 {
1037 Idx dest_idx = dfa->edests[node_idx].elems[0];
1038 if (!re_node_set_contains (&init_nodes, dest_idx))
1039 {
1040 reg_errcode_t merge_err
1041 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042 if (merge_err != REG_NOERROR)
1043 return merge_err;
1044 i = 0;
1045 }
1046 }
1047 }
1048
1049
1050 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051
1052 if (__glibc_unlikely (dfa->init_state == NULL))
1053 return err;
1054 if (dfa->init_state->has_constraint)
1055 {
1056 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057 CONTEXT_WORD);
1058 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059 CONTEXT_NEWLINE);
1060 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061 &init_nodes,
1062 CONTEXT_NEWLINE
1063 | CONTEXT_BEGBUF);
1064 if (__glibc_unlikely (dfa->init_state_word == NULL
1065 || dfa->init_state_nl == NULL
1066 || dfa->init_state_begbuf == NULL))
1067 return err;
1068 }
1069 else
1070 dfa->init_state_word = dfa->init_state_nl
1071 = dfa->init_state_begbuf = dfa->init_state;
1072
1073 re_node_set_free (&init_nodes);
1074 return REG_NOERROR;
1075 }
1076
1077 #ifdef RE_ENABLE_I18N
1078
1079
1080
1081
1082 static void
1083 optimize_utf8 (re_dfa_t *dfa)
1084 {
1085 Idx node;
1086 int i;
1087 bool mb_chars = false;
1088 bool has_period = false;
1089
1090 for (node = 0; node < dfa->nodes_len; ++node)
1091 switch (dfa->nodes[node].type)
1092 {
1093 case CHARACTER:
1094 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1095 mb_chars = true;
1096 break;
1097 case ANCHOR:
1098 switch (dfa->nodes[node].opr.ctx_type)
1099 {
1100 case LINE_FIRST:
1101 case LINE_LAST:
1102 case BUF_FIRST:
1103 case BUF_LAST:
1104 break;
1105 default:
1106
1107
1108
1109 return;
1110 }
1111 break;
1112 case OP_PERIOD:
1113 has_period = true;
1114 break;
1115 case OP_BACK_REF:
1116 case OP_ALT:
1117 case END_OF_RE:
1118 case OP_DUP_ASTERISK:
1119 case OP_OPEN_SUBEXP:
1120 case OP_CLOSE_SUBEXP:
1121 break;
1122 case COMPLEX_BRACKET:
1123 return;
1124 case SIMPLE_BRACKET:
1125
1126 {
1127 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1128 ? 0
1129 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1130 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131 {
1132 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1133 return;
1134 rshift = 0;
1135 }
1136 }
1137 break;
1138 default:
1139 abort ();
1140 }
1141
1142 if (mb_chars || has_period)
1143 for (node = 0; node < dfa->nodes_len; ++node)
1144 {
1145 if (dfa->nodes[node].type == CHARACTER
1146 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1147 dfa->nodes[node].mb_partial = 0;
1148 else if (dfa->nodes[node].type == OP_PERIOD)
1149 dfa->nodes[node].type = OP_UTF8_PERIOD;
1150 }
1151
1152
1153 dfa->mb_cur_max = 1;
1154 dfa->is_utf8 = 0;
1155 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1156 }
1157 #endif
1158
1159
1160
1161
1162 static reg_errcode_t
1163 analyze (regex_t *preg)
1164 {
1165 re_dfa_t *dfa = preg->buffer;
1166 reg_errcode_t ret;
1167
1168
1169 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1171 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1174 || dfa->edests == NULL || dfa->eclosures == NULL))
1175 return REG_ESPACE;
1176
1177 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1178 if (dfa->subexp_map != NULL)
1179 {
1180 Idx i;
1181 for (i = 0; i < preg->re_nsub; i++)
1182 dfa->subexp_map[i] = i;
1183 preorder (dfa->str_tree, optimize_subexps, dfa);
1184 for (i = 0; i < preg->re_nsub; i++)
1185 if (dfa->subexp_map[i] != i)
1186 break;
1187 if (i == preg->re_nsub)
1188 {
1189 re_free (dfa->subexp_map);
1190 dfa->subexp_map = NULL;
1191 }
1192 }
1193
1194 ret = postorder (dfa->str_tree, lower_subexps, preg);
1195 if (__glibc_unlikely (ret != REG_NOERROR))
1196 return ret;
1197 ret = postorder (dfa->str_tree, calc_first, dfa);
1198 if (__glibc_unlikely (ret != REG_NOERROR))
1199 return ret;
1200 preorder (dfa->str_tree, calc_next, dfa);
1201 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202 if (__glibc_unlikely (ret != REG_NOERROR))
1203 return ret;
1204 ret = calc_eclosure (dfa);
1205 if (__glibc_unlikely (ret != REG_NOERROR))
1206 return ret;
1207
1208
1209
1210 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1211 || dfa->nbackref)
1212 {
1213 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214 if (__glibc_unlikely (dfa->inveclosures == NULL))
1215 return REG_ESPACE;
1216 ret = calc_inveclosure (dfa);
1217 }
1218
1219 return ret;
1220 }
1221
1222
1223
1224
1225 static reg_errcode_t
1226 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1227 void *extra)
1228 {
1229 bin_tree_t *node, *prev;
1230
1231 for (node = root; ; )
1232 {
1233
1234
1235 while (node->left || node->right)
1236 if (node->left)
1237 node = node->left;
1238 else
1239 node = node->right;
1240
1241 do
1242 {
1243 reg_errcode_t err = fn (extra, node);
1244 if (__glibc_unlikely (err != REG_NOERROR))
1245 return err;
1246 if (node->parent == NULL)
1247 return REG_NOERROR;
1248 prev = node;
1249 node = node->parent;
1250 }
1251
1252 while (node->right == prev || node->right == NULL);
1253 node = node->right;
1254 }
1255 }
1256
1257 static reg_errcode_t
1258 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1259 void *extra)
1260 {
1261 bin_tree_t *node;
1262
1263 for (node = root; ; )
1264 {
1265 reg_errcode_t err = fn (extra, node);
1266 if (__glibc_unlikely (err != REG_NOERROR))
1267 return err;
1268
1269
1270 if (node->left)
1271 node = node->left;
1272 else
1273 {
1274 bin_tree_t *prev = NULL;
1275 while (node->right == prev || node->right == NULL)
1276 {
1277 prev = node;
1278 node = node->parent;
1279 if (!node)
1280 return REG_NOERROR;
1281 }
1282 node = node->right;
1283 }
1284 }
1285 }
1286
1287
1288
1289
1290 static reg_errcode_t
1291 optimize_subexps (void *extra, bin_tree_t *node)
1292 {
1293 re_dfa_t *dfa = (re_dfa_t *) extra;
1294
1295 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1296 {
1297 int idx = node->token.opr.idx;
1298 node->token.opr.idx = dfa->subexp_map[idx];
1299 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1300 }
1301
1302 else if (node->token.type == SUBEXP
1303 && node->left && node->left->token.type == SUBEXP)
1304 {
1305 Idx other_idx = node->left->token.opr.idx;
1306
1307 node->left = node->left->left;
1308 if (node->left)
1309 node->left->parent = node;
1310
1311 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1312 if (other_idx < BITSET_WORD_BITS)
1313 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1314 }
1315
1316 return REG_NOERROR;
1317 }
1318
1319
1320
1321 static reg_errcode_t
1322 lower_subexps (void *extra, bin_tree_t *node)
1323 {
1324 regex_t *preg = (regex_t *) extra;
1325 reg_errcode_t err = REG_NOERROR;
1326
1327 if (node->left && node->left->token.type == SUBEXP)
1328 {
1329 node->left = lower_subexp (&err, preg, node->left);
1330 if (node->left)
1331 node->left->parent = node;
1332 }
1333 if (node->right && node->right->token.type == SUBEXP)
1334 {
1335 node->right = lower_subexp (&err, preg, node->right);
1336 if (node->right)
1337 node->right->parent = node;
1338 }
1339
1340 return err;
1341 }
1342
1343 static bin_tree_t *
1344 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1345 {
1346 re_dfa_t *dfa = preg->buffer;
1347 bin_tree_t *body = node->left;
1348 bin_tree_t *op, *cls, *tree1, *tree;
1349
1350 if (preg->no_sub
1351
1352
1353
1354
1355 && node->left != NULL
1356 && (node->token.opr.idx >= BITSET_WORD_BITS
1357 || !(dfa->used_bkref_map
1358 & ((bitset_word_t) 1 << node->token.opr.idx))))
1359 return node->left;
1360
1361
1362
1363 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366 tree = create_tree (dfa, op, tree1, CONCAT);
1367 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1368 || op == NULL || cls == NULL))
1369 {
1370 *err = REG_ESPACE;
1371 return NULL;
1372 }
1373
1374 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1376 return tree;
1377 }
1378
1379
1380
1381 static reg_errcode_t
1382 calc_first (void *extra, bin_tree_t *node)
1383 {
1384 re_dfa_t *dfa = (re_dfa_t *) extra;
1385 if (node->token.type == CONCAT)
1386 {
1387 node->first = node->left->first;
1388 node->node_idx = node->left->node_idx;
1389 }
1390 else
1391 {
1392 node->first = node;
1393 node->node_idx = re_dfa_add_node (dfa, node->token);
1394 if (__glibc_unlikely (node->node_idx == -1))
1395 return REG_ESPACE;
1396 if (node->token.type == ANCHOR)
1397 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1398 }
1399 return REG_NOERROR;
1400 }
1401
1402
1403 static reg_errcode_t
1404 calc_next (void *extra, bin_tree_t *node)
1405 {
1406 switch (node->token.type)
1407 {
1408 case OP_DUP_ASTERISK:
1409 node->left->next = node;
1410 break;
1411 case CONCAT:
1412 node->left->next = node->right->first;
1413 node->right->next = node->next;
1414 break;
1415 default:
1416 if (node->left)
1417 node->left->next = node->next;
1418 if (node->right)
1419 node->right->next = node->next;
1420 break;
1421 }
1422 return REG_NOERROR;
1423 }
1424
1425
1426 static reg_errcode_t
1427 link_nfa_nodes (void *extra, bin_tree_t *node)
1428 {
1429 re_dfa_t *dfa = (re_dfa_t *) extra;
1430 Idx idx = node->node_idx;
1431 reg_errcode_t err = REG_NOERROR;
1432
1433 switch (node->token.type)
1434 {
1435 case CONCAT:
1436 break;
1437
1438 case END_OF_RE:
1439 DEBUG_ASSERT (node->next == NULL);
1440 break;
1441
1442 case OP_DUP_ASTERISK:
1443 case OP_ALT:
1444 {
1445 Idx left, right;
1446 dfa->has_plural_match = 1;
1447 if (node->left != NULL)
1448 left = node->left->first->node_idx;
1449 else
1450 left = node->next->node_idx;
1451 if (node->right != NULL)
1452 right = node->right->first->node_idx;
1453 else
1454 right = node->next->node_idx;
1455 DEBUG_ASSERT (left > -1);
1456 DEBUG_ASSERT (right > -1);
1457 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1458 }
1459 break;
1460
1461 case ANCHOR:
1462 case OP_OPEN_SUBEXP:
1463 case OP_CLOSE_SUBEXP:
1464 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1465 break;
1466
1467 case OP_BACK_REF:
1468 dfa->nexts[idx] = node->next->node_idx;
1469 if (node->token.type == OP_BACK_REF)
1470 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1471 break;
1472
1473 default:
1474 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1475 dfa->nexts[idx] = node->next->node_idx;
1476 break;
1477 }
1478
1479 return err;
1480 }
1481
1482
1483
1484
1485
1486 static reg_errcode_t
1487 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488 Idx root_node, unsigned int init_constraint)
1489 {
1490 Idx org_node, clone_node;
1491 bool ok;
1492 unsigned int constraint = init_constraint;
1493 for (org_node = top_org_node, clone_node = top_clone_node;;)
1494 {
1495 Idx org_dest, clone_dest;
1496 if (dfa->nodes[org_node].type == OP_BACK_REF)
1497 {
1498
1499
1500
1501
1502 org_dest = dfa->nexts[org_node];
1503 re_node_set_empty (dfa->edests + clone_node);
1504 clone_dest = duplicate_node (dfa, org_dest, constraint);
1505 if (__glibc_unlikely (clone_dest == -1))
1506 return REG_ESPACE;
1507 dfa->nexts[clone_node] = dfa->nexts[org_node];
1508 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509 if (__glibc_unlikely (! ok))
1510 return REG_ESPACE;
1511 }
1512 else if (dfa->edests[org_node].nelem == 0)
1513 {
1514
1515
1516
1517 dfa->nexts[clone_node] = dfa->nexts[org_node];
1518 break;
1519 }
1520 else if (dfa->edests[org_node].nelem == 1)
1521 {
1522
1523
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
1526
1527
1528 if (org_node == root_node && clone_node != org_node)
1529 {
1530 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531 if (__glibc_unlikely (! ok))
1532 return REG_ESPACE;
1533 break;
1534 }
1535
1536 constraint |= dfa->nodes[org_node].constraint;
1537 clone_dest = duplicate_node (dfa, org_dest, constraint);
1538 if (__glibc_unlikely (clone_dest == -1))
1539 return REG_ESPACE;
1540 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541 if (__glibc_unlikely (! ok))
1542 return REG_ESPACE;
1543 }
1544 else
1545 {
1546
1547
1548 org_dest = dfa->edests[org_node].elems[0];
1549 re_node_set_empty (dfa->edests + clone_node);
1550
1551 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552 if (clone_dest == -1)
1553 {
1554
1555 reg_errcode_t err;
1556 clone_dest = duplicate_node (dfa, org_dest, constraint);
1557 if (__glibc_unlikely (clone_dest == -1))
1558 return REG_ESPACE;
1559 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 if (__glibc_unlikely (! ok))
1561 return REG_ESPACE;
1562 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563 root_node, constraint);
1564 if (__glibc_unlikely (err != REG_NOERROR))
1565 return err;
1566 }
1567 else
1568 {
1569
1570
1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 if (__glibc_unlikely (! ok))
1573 return REG_ESPACE;
1574 }
1575
1576 org_dest = dfa->edests[org_node].elems[1];
1577 clone_dest = duplicate_node (dfa, org_dest, constraint);
1578 if (__glibc_unlikely (clone_dest == -1))
1579 return REG_ESPACE;
1580 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581 if (__glibc_unlikely (! ok))
1582 return REG_ESPACE;
1583 }
1584 org_node = org_dest;
1585 clone_node = clone_dest;
1586 }
1587 return REG_NOERROR;
1588 }
1589
1590
1591
1592
1593 static Idx
1594 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1595 unsigned int constraint)
1596 {
1597 Idx idx;
1598 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1599 {
1600 if (org_node == dfa->org_indices[idx]
1601 && constraint == dfa->nodes[idx].constraint)
1602 return idx;
1603 }
1604 return -1;
1605 }
1606
1607
1608
1609
1610
1611 static Idx
1612 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1613 {
1614 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615 if (__glibc_likely (dup_idx != -1))
1616 {
1617 dfa->nodes[dup_idx].constraint = constraint;
1618 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1619 dfa->nodes[dup_idx].duplicated = 1;
1620
1621
1622 dfa->org_indices[dup_idx] = org_idx;
1623 }
1624 return dup_idx;
1625 }
1626
1627 static reg_errcode_t
1628 calc_inveclosure (re_dfa_t *dfa)
1629 {
1630 Idx src, idx;
1631 bool ok;
1632 for (idx = 0; idx < dfa->nodes_len; ++idx)
1633 re_node_set_init_empty (dfa->inveclosures + idx);
1634
1635 for (src = 0; src < dfa->nodes_len; ++src)
1636 {
1637 Idx *elems = dfa->eclosures[src].elems;
1638 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1639 {
1640 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641 if (__glibc_unlikely (! ok))
1642 return REG_ESPACE;
1643 }
1644 }
1645
1646 return REG_NOERROR;
1647 }
1648
1649
1650
1651 static reg_errcode_t
1652 calc_eclosure (re_dfa_t *dfa)
1653 {
1654 Idx node_idx;
1655 bool incomplete;
1656 DEBUG_ASSERT (dfa->nodes_len > 0);
1657 incomplete = false;
1658
1659 for (node_idx = 0; ; ++node_idx)
1660 {
1661 reg_errcode_t err;
1662 re_node_set eclosure_elem;
1663 if (node_idx == dfa->nodes_len)
1664 {
1665 if (!incomplete)
1666 break;
1667 incomplete = false;
1668 node_idx = 0;
1669 }
1670
1671 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1672
1673
1674 if (dfa->eclosures[node_idx].nelem != 0)
1675 continue;
1676
1677 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1678 if (__glibc_unlikely (err != REG_NOERROR))
1679 return err;
1680
1681 if (dfa->eclosures[node_idx].nelem == 0)
1682 {
1683 incomplete = true;
1684 re_node_set_free (&eclosure_elem);
1685 }
1686 }
1687 return REG_NOERROR;
1688 }
1689
1690
1691
1692 static reg_errcode_t
1693 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1694 {
1695 reg_errcode_t err;
1696 Idx i;
1697 re_node_set eclosure;
1698 bool incomplete = false;
1699 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1700 if (__glibc_unlikely (err != REG_NOERROR))
1701 return err;
1702
1703
1704 eclosure.elems[eclosure.nelem++] = node;
1705
1706
1707
1708 dfa->eclosures[node].nelem = -1;
1709
1710
1711
1712 if (dfa->nodes[node].constraint
1713 && dfa->edests[node].nelem
1714 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1715 {
1716 err = duplicate_node_closure (dfa, node, node, node,
1717 dfa->nodes[node].constraint);
1718 if (__glibc_unlikely (err != REG_NOERROR))
1719 return err;
1720 }
1721
1722
1723 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1724 for (i = 0; i < dfa->edests[node].nelem; ++i)
1725 {
1726 re_node_set eclosure_elem;
1727 Idx edest = dfa->edests[node].elems[i];
1728
1729
1730 if (dfa->eclosures[edest].nelem == -1)
1731 {
1732 incomplete = true;
1733 continue;
1734 }
1735
1736
1737 if (dfa->eclosures[edest].nelem == 0)
1738 {
1739 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1740 if (__glibc_unlikely (err != REG_NOERROR))
1741 return err;
1742 }
1743 else
1744 eclosure_elem = dfa->eclosures[edest];
1745
1746 err = re_node_set_merge (&eclosure, &eclosure_elem);
1747 if (__glibc_unlikely (err != REG_NOERROR))
1748 return err;
1749
1750
1751 if (dfa->eclosures[edest].nelem == 0)
1752 {
1753 incomplete = true;
1754 re_node_set_free (&eclosure_elem);
1755 }
1756 }
1757
1758 if (incomplete && !root)
1759 dfa->eclosures[node].nelem = 0;
1760 else
1761 dfa->eclosures[node] = eclosure;
1762 *new_set = eclosure;
1763 return REG_NOERROR;
1764 }
1765
1766
1767
1768
1769
1770
1771 static void
1772 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1773 {
1774 re_string_skip_bytes (input, peek_token (result, input, syntax));
1775 }
1776
1777
1778
1779
1780 static int
1781 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1782 {
1783 unsigned char c;
1784
1785 if (re_string_eoi (input))
1786 {
1787 token->type = END_OF_RE;
1788 return 0;
1789 }
1790
1791 c = re_string_peek_byte (input, 0);
1792 token->opr.c = c;
1793
1794 token->word_char = 0;
1795 #ifdef RE_ENABLE_I18N
1796 token->mb_partial = 0;
1797 if (input->mb_cur_max > 1
1798 && !re_string_first_byte (input, re_string_cur_idx (input)))
1799 {
1800 token->type = CHARACTER;
1801 token->mb_partial = 1;
1802 return 1;
1803 }
1804 #endif
1805 if (c == '\\')
1806 {
1807 unsigned char c2;
1808 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1809 {
1810 token->type = BACK_SLASH;
1811 return 1;
1812 }
1813
1814 c2 = re_string_peek_byte_case (input, 1);
1815 token->opr.c = c2;
1816 token->type = CHARACTER;
1817 #ifdef RE_ENABLE_I18N
1818 if (input->mb_cur_max > 1)
1819 {
1820 wint_t wc = re_string_wchar_at (input,
1821 re_string_cur_idx (input) + 1);
1822 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1823 }
1824 else
1825 #endif
1826 token->word_char = IS_WORD_CHAR (c2) != 0;
1827
1828 switch (c2)
1829 {
1830 case '|':
1831 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832 token->type = OP_ALT;
1833 break;
1834 case '1': case '2': case '3': case '4': case '5':
1835 case '6': case '7': case '8': case '9':
1836 if (!(syntax & RE_NO_BK_REFS))
1837 {
1838 token->type = OP_BACK_REF;
1839 token->opr.idx = c2 - '1';
1840 }
1841 break;
1842 case '<':
1843 if (!(syntax & RE_NO_GNU_OPS))
1844 {
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_FIRST;
1847 }
1848 break;
1849 case '>':
1850 if (!(syntax & RE_NO_GNU_OPS))
1851 {
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_LAST;
1854 }
1855 break;
1856 case 'b':
1857 if (!(syntax & RE_NO_GNU_OPS))
1858 {
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = WORD_DELIM;
1861 }
1862 break;
1863 case 'B':
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 {
1866 token->type = ANCHOR;
1867 token->opr.ctx_type = NOT_WORD_DELIM;
1868 }
1869 break;
1870 case 'w':
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = OP_WORD;
1873 break;
1874 case 'W':
1875 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = OP_NOTWORD;
1877 break;
1878 case 's':
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = OP_SPACE;
1881 break;
1882 case 'S':
1883 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = OP_NOTSPACE;
1885 break;
1886 case '`':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 {
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_FIRST;
1891 }
1892 break;
1893 case '\'':
1894 if (!(syntax & RE_NO_GNU_OPS))
1895 {
1896 token->type = ANCHOR;
1897 token->opr.ctx_type = BUF_LAST;
1898 }
1899 break;
1900 case '(':
1901 if (!(syntax & RE_NO_BK_PARENS))
1902 token->type = OP_OPEN_SUBEXP;
1903 break;
1904 case ')':
1905 if (!(syntax & RE_NO_BK_PARENS))
1906 token->type = OP_CLOSE_SUBEXP;
1907 break;
1908 case '+':
1909 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910 token->type = OP_DUP_PLUS;
1911 break;
1912 case '?':
1913 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914 token->type = OP_DUP_QUESTION;
1915 break;
1916 case '{':
1917 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918 token->type = OP_OPEN_DUP_NUM;
1919 break;
1920 case '}':
1921 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922 token->type = OP_CLOSE_DUP_NUM;
1923 break;
1924 default:
1925 break;
1926 }
1927 return 2;
1928 }
1929
1930 token->type = CHARACTER;
1931 #ifdef RE_ENABLE_I18N
1932 if (input->mb_cur_max > 1)
1933 {
1934 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1936 }
1937 else
1938 #endif
1939 token->word_char = IS_WORD_CHAR (token->opr.c);
1940
1941 switch (c)
1942 {
1943 case '\n':
1944 if (syntax & RE_NEWLINE_ALT)
1945 token->type = OP_ALT;
1946 break;
1947 case '|':
1948 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949 token->type = OP_ALT;
1950 break;
1951 case '*':
1952 token->type = OP_DUP_ASTERISK;
1953 break;
1954 case '+':
1955 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956 token->type = OP_DUP_PLUS;
1957 break;
1958 case '?':
1959 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960 token->type = OP_DUP_QUESTION;
1961 break;
1962 case '{':
1963 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964 token->type = OP_OPEN_DUP_NUM;
1965 break;
1966 case '}':
1967 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968 token->type = OP_CLOSE_DUP_NUM;
1969 break;
1970 case '(':
1971 if (syntax & RE_NO_BK_PARENS)
1972 token->type = OP_OPEN_SUBEXP;
1973 break;
1974 case ')':
1975 if (syntax & RE_NO_BK_PARENS)
1976 token->type = OP_CLOSE_SUBEXP;
1977 break;
1978 case '[':
1979 token->type = OP_OPEN_BRACKET;
1980 break;
1981 case '.':
1982 token->type = OP_PERIOD;
1983 break;
1984 case '^':
1985 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1986 && re_string_cur_idx (input) != 0)
1987 {
1988 char prev = re_string_peek_byte (input, -1);
1989 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1990 break;
1991 }
1992 token->type = ANCHOR;
1993 token->opr.ctx_type = LINE_FIRST;
1994 break;
1995 case '$':
1996 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1997 && re_string_cur_idx (input) + 1 != re_string_length (input))
1998 {
1999 re_token_t next;
2000 re_string_skip_bytes (input, 1);
2001 peek_token (&next, input, syntax);
2002 re_string_skip_bytes (input, -1);
2003 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2004 break;
2005 }
2006 token->type = ANCHOR;
2007 token->opr.ctx_type = LINE_LAST;
2008 break;
2009 default:
2010 break;
2011 }
2012 return 1;
2013 }
2014
2015
2016
2017
2018 static int
2019 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2020 {
2021 unsigned char c;
2022 if (re_string_eoi (input))
2023 {
2024 token->type = END_OF_RE;
2025 return 0;
2026 }
2027 c = re_string_peek_byte (input, 0);
2028 token->opr.c = c;
2029
2030 #ifdef RE_ENABLE_I18N
2031 if (input->mb_cur_max > 1
2032 && !re_string_first_byte (input, re_string_cur_idx (input)))
2033 {
2034 token->type = CHARACTER;
2035 return 1;
2036 }
2037 #endif
2038
2039 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2040 && re_string_cur_idx (input) + 1 < re_string_length (input))
2041 {
2042
2043 unsigned char c2;
2044 re_string_skip_bytes (input, 1);
2045 c2 = re_string_peek_byte (input, 0);
2046 token->opr.c = c2;
2047 token->type = CHARACTER;
2048 return 1;
2049 }
2050 if (c == '[')
2051 {
2052 unsigned char c2;
2053 int token_len;
2054 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2055 c2 = re_string_peek_byte (input, 1);
2056 else
2057 c2 = 0;
2058 token->opr.c = c2;
2059 token_len = 2;
2060 switch (c2)
2061 {
2062 case '.':
2063 token->type = OP_OPEN_COLL_ELEM;
2064 break;
2065
2066 case '=':
2067 token->type = OP_OPEN_EQUIV_CLASS;
2068 break;
2069
2070 case ':':
2071 if (syntax & RE_CHAR_CLASSES)
2072 {
2073 token->type = OP_OPEN_CHAR_CLASS;
2074 break;
2075 }
2076 FALLTHROUGH;
2077 default:
2078 token->type = CHARACTER;
2079 token->opr.c = c;
2080 token_len = 1;
2081 break;
2082 }
2083 return token_len;
2084 }
2085 switch (c)
2086 {
2087 case '-':
2088 token->type = OP_CHARSET_RANGE;
2089 break;
2090 case ']':
2091 token->type = OP_CLOSE_BRACKET;
2092 break;
2093 case '^':
2094 token->type = OP_NON_MATCH_LIST;
2095 break;
2096 default:
2097 token->type = CHARACTER;
2098 }
2099 return 1;
2100 }
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116 static bin_tree_t *
2117 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2118 reg_errcode_t *err)
2119 {
2120 re_dfa_t *dfa = preg->buffer;
2121 bin_tree_t *tree, *eor, *root;
2122 re_token_t current_token;
2123 dfa->syntax = syntax;
2124 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2125 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2126 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2127 return NULL;
2128 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2129 if (tree != NULL)
2130 root = create_tree (dfa, tree, eor, CONCAT);
2131 else
2132 root = eor;
2133 if (__glibc_unlikely (eor == NULL || root == NULL))
2134 {
2135 *err = REG_ESPACE;
2136 return NULL;
2137 }
2138 return root;
2139 }
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150 static bin_tree_t *
2151 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2152 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2153 {
2154 re_dfa_t *dfa = preg->buffer;
2155 bin_tree_t *tree, *branch = NULL;
2156 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2157 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2158 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2159 return NULL;
2160
2161 while (token->type == OP_ALT)
2162 {
2163 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2164 if (token->type != OP_ALT && token->type != END_OF_RE
2165 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166 {
2167 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2168 dfa->completed_bkref_map = initial_bkref_map;
2169 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2170 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2171 {
2172 if (tree != NULL)
2173 postorder (tree, free_tree, NULL);
2174 return NULL;
2175 }
2176 dfa->completed_bkref_map |= accumulated_bkref_map;
2177 }
2178 else
2179 branch = NULL;
2180 tree = create_tree (dfa, tree, branch, OP_ALT);
2181 if (__glibc_unlikely (tree == NULL))
2182 {
2183 *err = REG_ESPACE;
2184 return NULL;
2185 }
2186 }
2187 return tree;
2188 }
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199 static bin_tree_t *
2200 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2201 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2202 {
2203 bin_tree_t *tree, *expr;
2204 re_dfa_t *dfa = preg->buffer;
2205 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2206 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2207 return NULL;
2208
2209 while (token->type != OP_ALT && token->type != END_OF_RE
2210 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2211 {
2212 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2213 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2214 {
2215 if (tree != NULL)
2216 postorder (tree, free_tree, NULL);
2217 return NULL;
2218 }
2219 if (tree != NULL && expr != NULL)
2220 {
2221 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2222 if (newtree == NULL)
2223 {
2224 postorder (expr, free_tree, NULL);
2225 postorder (tree, free_tree, NULL);
2226 *err = REG_ESPACE;
2227 return NULL;
2228 }
2229 tree = newtree;
2230 }
2231 else if (tree == NULL)
2232 tree = expr;
2233
2234 }
2235 return tree;
2236 }
2237
2238
2239
2240
2241
2242
2243
2244 static bin_tree_t *
2245 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2246 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2247 {
2248 re_dfa_t *dfa = preg->buffer;
2249 bin_tree_t *tree;
2250 switch (token->type)
2251 {
2252 case CHARACTER:
2253 tree = create_token_tree (dfa, NULL, NULL, token);
2254 if (__glibc_unlikely (tree == NULL))
2255 {
2256 *err = REG_ESPACE;
2257 return NULL;
2258 }
2259 #ifdef RE_ENABLE_I18N
2260 if (dfa->mb_cur_max > 1)
2261 {
2262 while (!re_string_eoi (regexp)
2263 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2264 {
2265 bin_tree_t *mbc_remain;
2266 fetch_token (token, regexp, syntax);
2267 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2268 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2269 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2270 {
2271 *err = REG_ESPACE;
2272 return NULL;
2273 }
2274 }
2275 }
2276 #endif
2277 break;
2278
2279 case OP_OPEN_SUBEXP:
2280 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2281 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2282 return NULL;
2283 break;
2284
2285 case OP_OPEN_BRACKET:
2286 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2287 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2288 return NULL;
2289 break;
2290
2291 case OP_BACK_REF:
2292 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2293 {
2294 *err = REG_ESUBREG;
2295 return NULL;
2296 }
2297 dfa->used_bkref_map |= 1 << token->opr.idx;
2298 tree = create_token_tree (dfa, NULL, NULL, token);
2299 if (__glibc_unlikely (tree == NULL))
2300 {
2301 *err = REG_ESPACE;
2302 return NULL;
2303 }
2304 ++dfa->nbackref;
2305 dfa->has_mb_node = 1;
2306 break;
2307
2308 case OP_OPEN_DUP_NUM:
2309 if (syntax & RE_CONTEXT_INVALID_DUP)
2310 {
2311 *err = REG_BADRPT;
2312 return NULL;
2313 }
2314 FALLTHROUGH;
2315 case OP_DUP_ASTERISK:
2316 case OP_DUP_PLUS:
2317 case OP_DUP_QUESTION:
2318 if (syntax & RE_CONTEXT_INVALID_OPS)
2319 {
2320 *err = REG_BADRPT;
2321 return NULL;
2322 }
2323 else if (syntax & RE_CONTEXT_INDEP_OPS)
2324 {
2325 fetch_token (token, regexp, syntax);
2326 return parse_expression (regexp, preg, token, syntax, nest, err);
2327 }
2328 FALLTHROUGH;
2329 case OP_CLOSE_SUBEXP:
2330 if ((token->type == OP_CLOSE_SUBEXP)
2331 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2332 {
2333 *err = REG_ERPAREN;
2334 return NULL;
2335 }
2336 FALLTHROUGH;
2337 case OP_CLOSE_DUP_NUM:
2338
2339
2340
2341 token->type = CHARACTER;
2342
2343
2344 tree = create_token_tree (dfa, NULL, NULL, token);
2345 if (__glibc_unlikely (tree == NULL))
2346 {
2347 *err = REG_ESPACE;
2348 return NULL;
2349 }
2350 break;
2351
2352 case ANCHOR:
2353 if ((token->opr.ctx_type
2354 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2355 && dfa->word_ops_used == 0)
2356 init_word_char (dfa);
2357 if (token->opr.ctx_type == WORD_DELIM
2358 || token->opr.ctx_type == NOT_WORD_DELIM)
2359 {
2360 bin_tree_t *tree_first, *tree_last;
2361 if (token->opr.ctx_type == WORD_DELIM)
2362 {
2363 token->opr.ctx_type = WORD_FIRST;
2364 tree_first = create_token_tree (dfa, NULL, NULL, token);
2365 token->opr.ctx_type = WORD_LAST;
2366 }
2367 else
2368 {
2369 token->opr.ctx_type = INSIDE_WORD;
2370 tree_first = create_token_tree (dfa, NULL, NULL, token);
2371 token->opr.ctx_type = INSIDE_NOTWORD;
2372 }
2373 tree_last = create_token_tree (dfa, NULL, NULL, token);
2374 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2375 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2376 || tree == NULL))
2377 {
2378 *err = REG_ESPACE;
2379 return NULL;
2380 }
2381 }
2382 else
2383 {
2384 tree = create_token_tree (dfa, NULL, NULL, token);
2385 if (__glibc_unlikely (tree == NULL))
2386 {
2387 *err = REG_ESPACE;
2388 return NULL;
2389 }
2390 }
2391
2392
2393
2394
2395 fetch_token (token, regexp, syntax);
2396 return tree;
2397
2398 case OP_PERIOD:
2399 tree = create_token_tree (dfa, NULL, NULL, token);
2400 if (__glibc_unlikely (tree == NULL))
2401 {
2402 *err = REG_ESPACE;
2403 return NULL;
2404 }
2405 if (dfa->mb_cur_max > 1)
2406 dfa->has_mb_node = 1;
2407 break;
2408
2409 case OP_WORD:
2410 case OP_NOTWORD:
2411 tree = build_charclass_op (dfa, regexp->trans,
2412 "alnum",
2413 "_",
2414 token->type == OP_NOTWORD, err);
2415 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2416 return NULL;
2417 break;
2418
2419 case OP_SPACE:
2420 case OP_NOTSPACE:
2421 tree = build_charclass_op (dfa, regexp->trans,
2422 "space",
2423 "",
2424 token->type == OP_NOTSPACE, err);
2425 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2426 return NULL;
2427 break;
2428
2429 case OP_ALT:
2430 case END_OF_RE:
2431 return NULL;
2432
2433 case BACK_SLASH:
2434 *err = REG_EESCAPE;
2435 return NULL;
2436
2437 default:
2438
2439 DEBUG_ASSERT (false);
2440 return NULL;
2441 }
2442 fetch_token (token, regexp, syntax);
2443
2444 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2445 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2446 {
2447 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2448 syntax, err);
2449 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2450 {
2451 if (tree != NULL)
2452 postorder (tree, free_tree, NULL);
2453 return NULL;
2454 }
2455 tree = dup_tree;
2456
2457 if ((syntax & RE_CONTEXT_INVALID_DUP)
2458 && (token->type == OP_DUP_ASTERISK
2459 || token->type == OP_OPEN_DUP_NUM))
2460 {
2461 if (tree != NULL)
2462 postorder (tree, free_tree, NULL);
2463 *err = REG_BADRPT;
2464 return NULL;
2465 }
2466 }
2467
2468 return tree;
2469 }
2470
2471
2472
2473
2474
2475
2476
2477
2478 static bin_tree_t *
2479 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2480 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2481 {
2482 re_dfa_t *dfa = preg->buffer;
2483 bin_tree_t *tree;
2484 size_t cur_nsub;
2485 cur_nsub = preg->re_nsub++;
2486
2487 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2488
2489
2490 if (token->type == OP_CLOSE_SUBEXP)
2491 tree = NULL;
2492 else
2493 {
2494 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495 if (__glibc_unlikely (*err == REG_NOERROR
2496 && token->type != OP_CLOSE_SUBEXP))
2497 {
2498 if (tree != NULL)
2499 postorder (tree, free_tree, NULL);
2500 *err = REG_EPAREN;
2501 }
2502 if (__glibc_unlikely (*err != REG_NOERROR))
2503 return NULL;
2504 }
2505
2506 if (cur_nsub <= '9' - '1')
2507 dfa->completed_bkref_map |= 1 << cur_nsub;
2508
2509 tree = create_tree (dfa, tree, NULL, SUBEXP);
2510 if (__glibc_unlikely (tree == NULL))
2511 {
2512 *err = REG_ESPACE;
2513 return NULL;
2514 }
2515 tree->token.opr.idx = cur_nsub;
2516 return tree;
2517 }
2518
2519
2520
2521 static bin_tree_t *
2522 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2523 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2524 {
2525 bin_tree_t *tree = NULL, *old_tree = NULL;
2526 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2527 re_token_t start_token = *token;
2528
2529 if (token->type == OP_OPEN_DUP_NUM)
2530 {
2531 end = 0;
2532 start = fetch_number (regexp, token, syntax);
2533 if (start == -1)
2534 {
2535 if (token->type == CHARACTER && token->opr.c == ',')
2536 start = 0;
2537 else
2538 {
2539 *err = REG_BADBR;
2540 return NULL;
2541 }
2542 }
2543 if (__glibc_likely (start != -2))
2544 {
2545
2546 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2547 : ((token->type == CHARACTER && token->opr.c == ',')
2548 ? fetch_number (regexp, token, syntax) : -2));
2549 }
2550 if (__glibc_unlikely (start == -2 || end == -2))
2551 {
2552
2553 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2554 {
2555 if (token->type == END_OF_RE)
2556 *err = REG_EBRACE;
2557 else
2558 *err = REG_BADBR;
2559
2560 return NULL;
2561 }
2562
2563
2564 re_string_set_index (regexp, start_idx);
2565 *token = start_token;
2566 token->type = CHARACTER;
2567
2568
2569 return elem;
2570 }
2571
2572 if (__glibc_unlikely ((end != -1 && start > end)
2573 || token->type != OP_CLOSE_DUP_NUM))
2574 {
2575
2576 *err = REG_BADBR;
2577 return NULL;
2578 }
2579
2580 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2581 {
2582 *err = REG_ESIZE;
2583 return NULL;
2584 }
2585 }
2586 else
2587 {
2588 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2589 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2590 }
2591
2592 fetch_token (token, regexp, syntax);
2593
2594 if (__glibc_unlikely (elem == NULL))
2595 return NULL;
2596 if (__glibc_unlikely (start == 0 && end == 0))
2597 {
2598 postorder (elem, free_tree, NULL);
2599 return NULL;
2600 }
2601
2602
2603 if (__glibc_unlikely (start > 0))
2604 {
2605 tree = elem;
2606 for (i = 2; i <= start; ++i)
2607 {
2608 elem = duplicate_tree (elem, dfa);
2609 tree = create_tree (dfa, tree, elem, CONCAT);
2610 if (__glibc_unlikely (elem == NULL || tree == NULL))
2611 goto parse_dup_op_espace;
2612 }
2613
2614 if (start == end)
2615 return tree;
2616
2617
2618 elem = duplicate_tree (elem, dfa);
2619 if (__glibc_unlikely (elem == NULL))
2620 goto parse_dup_op_espace;
2621 old_tree = tree;
2622 }
2623 else
2624 old_tree = NULL;
2625
2626 if (elem->token.type == SUBEXP)
2627 {
2628 uintptr_t subidx = elem->token.opr.idx;
2629 postorder (elem, mark_opt_subexp, (void *) subidx);
2630 }
2631
2632 tree = create_tree (dfa, elem, NULL,
2633 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2634 if (__glibc_unlikely (tree == NULL))
2635 goto parse_dup_op_espace;
2636
2637
2638
2639
2640 if (TYPE_SIGNED (Idx) || end != -1)
2641 for (i = start + 2; i <= end; ++i)
2642 {
2643 elem = duplicate_tree (elem, dfa);
2644 tree = create_tree (dfa, tree, elem, CONCAT);
2645 if (__glibc_unlikely (elem == NULL || tree == NULL))
2646 goto parse_dup_op_espace;
2647
2648 tree = create_tree (dfa, tree, NULL, OP_ALT);
2649 if (__glibc_unlikely (tree == NULL))
2650 goto parse_dup_op_espace;
2651 }
2652
2653 if (old_tree)
2654 tree = create_tree (dfa, old_tree, tree, CONCAT);
2655
2656 return tree;
2657
2658 parse_dup_op_espace:
2659 *err = REG_ESPACE;
2660 return NULL;
2661 }
2662
2663
2664
2665 #define BRACKET_NAME_BUF_SIZE 32
2666
2667 #ifndef _LIBC
2668
2669 # ifdef RE_ENABLE_I18N
2670
2671
2672
2673 static wint_t
2674 parse_byte (unsigned char b, re_charset_t *mbcset)
2675 {
2676 return mbcset == NULL ? b : __btowc (b);
2677 }
2678 # endif
2679
2680
2681
2682
2683
2684
2685
2686
2687 static reg_errcode_t
2688 # ifdef RE_ENABLE_I18N
2689 build_range_exp (const reg_syntax_t syntax,
2690 bitset_t sbcset,
2691 re_charset_t *mbcset,
2692 Idx *range_alloc,
2693 const bracket_elem_t *start_elem,
2694 const bracket_elem_t *end_elem)
2695 # else
2696 build_range_exp (const reg_syntax_t syntax,
2697 bitset_t sbcset,
2698 const bracket_elem_t *start_elem,
2699 const bracket_elem_t *end_elem)
2700 # endif
2701 {
2702 unsigned int start_ch, end_ch;
2703
2704 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2705 || start_elem->type == CHAR_CLASS
2706 || end_elem->type == EQUIV_CLASS
2707 || end_elem->type == CHAR_CLASS))
2708 return REG_ERANGE;
2709
2710
2711
2712 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2713 && strlen ((char *) start_elem->opr.name) > 1)
2714 || (end_elem->type == COLL_SYM
2715 && strlen ((char *) end_elem->opr.name) > 1)))
2716 return REG_ECOLLATE;
2717
2718 # ifdef RE_ENABLE_I18N
2719 {
2720 wchar_t wc;
2721 wint_t start_wc;
2722 wint_t end_wc;
2723
2724 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2725 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2726 : 0));
2727 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2728 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2729 : 0));
2730 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2731 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2732 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2733 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2734 if (start_wc == WEOF || end_wc == WEOF)
2735 return REG_ECOLLATE;
2736 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2737 && start_wc > end_wc))
2738 return REG_ERANGE;
2739
2740
2741
2742
2743
2744
2745 if (mbcset)
2746 {
2747
2748 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2749 {
2750
2751 wchar_t *new_array_start, *new_array_end;
2752 Idx new_nranges;
2753
2754
2755 new_nranges = 2 * mbcset->nranges + 1;
2756
2757
2758 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2759 new_nranges);
2760 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2761 new_nranges);
2762
2763 if (__glibc_unlikely (new_array_start == NULL
2764 || new_array_end == NULL))
2765 {
2766 re_free (new_array_start);
2767 re_free (new_array_end);
2768 return REG_ESPACE;
2769 }
2770
2771 mbcset->range_starts = new_array_start;
2772 mbcset->range_ends = new_array_end;
2773 *range_alloc = new_nranges;
2774 }
2775
2776 mbcset->range_starts[mbcset->nranges] = start_wc;
2777 mbcset->range_ends[mbcset->nranges++] = end_wc;
2778 }
2779
2780
2781 for (wc = 0; wc < SBC_MAX; ++wc)
2782 {
2783 if (start_wc <= wc && wc <= end_wc)
2784 bitset_set (sbcset, wc);
2785 }
2786 }
2787 # else
2788 {
2789 unsigned int ch;
2790 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2791 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2792 : 0));
2793 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2794 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2795 : 0));
2796 if (start_ch > end_ch)
2797 return REG_ERANGE;
2798
2799 for (ch = 0; ch < SBC_MAX; ++ch)
2800 if (start_ch <= ch && ch <= end_ch)
2801 bitset_set (sbcset, ch);
2802 }
2803 # endif
2804 return REG_NOERROR;
2805 }
2806 #endif
2807
2808 #ifndef _LIBC
2809
2810
2811
2812
2813
2814
2815 static reg_errcode_t
2816 # ifdef RE_ENABLE_I18N
2817 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2818 Idx *coll_sym_alloc, const unsigned char *name)
2819 # else
2820 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2821 # endif
2822 {
2823 size_t name_len = strlen ((const char *) name);
2824 if (__glibc_unlikely (name_len != 1))
2825 return REG_ECOLLATE;
2826 else
2827 {
2828 bitset_set (sbcset, name[0]);
2829 return REG_NOERROR;
2830 }
2831 }
2832 #endif
2833
2834
2835
2836
2837 static bin_tree_t *
2838 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2839 reg_syntax_t syntax, reg_errcode_t *err)
2840 {
2841 #ifdef _LIBC
2842 const unsigned char *collseqmb;
2843 const char *collseqwc;
2844 uint32_t nrules;
2845 int32_t table_size;
2846 const int32_t *symb_table;
2847 const unsigned char *extra;
2848
2849
2850
2851
2852
2853
2854 auto inline int32_t
2855 __attribute__ ((always_inline))
2856 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2857 {
2858 int32_t elem;
2859
2860 for (elem = 0; elem < table_size; elem++)
2861 if (symb_table[2 * elem] != 0)
2862 {
2863 int32_t idx = symb_table[2 * elem + 1];
2864
2865 idx += 1 + extra[idx];
2866 if (
2867 name_len == extra[idx]
2868
2869 && memcmp (name, &extra[idx + 1], name_len) == 0)
2870
2871 return elem;
2872 }
2873 return -1;
2874 }
2875
2876
2877
2878
2879
2880 auto inline unsigned int
2881 __attribute__ ((always_inline))
2882 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2883 {
2884 if (br_elem->type == SB_CHAR)
2885 {
2886
2887
2888
2889 if (nrules == 0)
2890 return collseqmb[br_elem->opr.ch];
2891 else
2892 {
2893 wint_t wc = __btowc (br_elem->opr.ch);
2894 return __collseq_table_lookup (collseqwc, wc);
2895 }
2896 }
2897 else if (br_elem->type == MB_CHAR)
2898 {
2899 if (nrules != 0)
2900 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2901 }
2902 else if (br_elem->type == COLL_SYM)
2903 {
2904 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2905 if (nrules != 0)
2906 {
2907 int32_t elem, idx;
2908 elem = seek_collating_symbol_entry (br_elem->opr.name,
2909 sym_name_len);
2910 if (elem != -1)
2911 {
2912
2913 idx = symb_table[2 * elem + 1];
2914
2915 idx += 1 + extra[idx];
2916
2917 idx += 1 + extra[idx];
2918
2919 idx = (idx + 3) & ~3;
2920
2921 idx += sizeof (unsigned int);
2922
2923 idx += sizeof (unsigned int) *
2924 (1 + *(unsigned int *) (extra + idx));
2925
2926 return *(unsigned int *) (extra + idx);
2927 }
2928 else if (sym_name_len == 1)
2929 {
2930
2931
2932 return collseqmb[br_elem->opr.name[0]];
2933 }
2934 }
2935 else if (sym_name_len == 1)
2936 return collseqmb[br_elem->opr.name[0]];
2937 }
2938 return UINT_MAX;
2939 }
2940
2941
2942
2943
2944
2945
2946
2947
2948 auto inline reg_errcode_t
2949 __attribute__ ((always_inline))
2950 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2951 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2952 {
2953 unsigned int ch;
2954 uint32_t start_collseq;
2955 uint32_t end_collseq;
2956
2957
2958
2959 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2960 || start_elem->type == CHAR_CLASS
2961 || end_elem->type == EQUIV_CLASS
2962 || end_elem->type == CHAR_CLASS))
2963 return REG_ERANGE;
2964
2965
2966 start_collseq = lookup_collation_sequence_value (start_elem);
2967 end_collseq = lookup_collation_sequence_value (end_elem);
2968
2969 if (__glibc_unlikely (start_collseq == UINT_MAX
2970 || end_collseq == UINT_MAX))
2971 return REG_ECOLLATE;
2972 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2973 && start_collseq > end_collseq))
2974 return REG_ERANGE;
2975
2976
2977
2978
2979
2980 if (nrules > 0 || dfa->mb_cur_max > 1)
2981 {
2982
2983 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2984 {
2985
2986 uint32_t *new_array_start;
2987 uint32_t *new_array_end;
2988 Idx new_nranges;
2989
2990
2991 new_nranges = 2 * mbcset->nranges + 1;
2992 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2993 new_nranges);
2994 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2995 new_nranges);
2996
2997 if (__glibc_unlikely (new_array_start == NULL
2998 || new_array_end == NULL))
2999 return REG_ESPACE;
3000
3001 mbcset->range_starts = new_array_start;
3002 mbcset->range_ends = new_array_end;
3003 *range_alloc = new_nranges;
3004 }
3005
3006 mbcset->range_starts[mbcset->nranges] = start_collseq;
3007 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3008 }
3009
3010
3011 for (ch = 0; ch < SBC_MAX; ch++)
3012 {
3013 uint32_t ch_collseq;
3014
3015
3016
3017 if (nrules == 0)
3018 ch_collseq = collseqmb[ch];
3019 else
3020 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3021 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3022 bitset_set (sbcset, ch);
3023 }
3024 return REG_NOERROR;
3025 }
3026
3027
3028
3029
3030
3031
3032
3033 auto inline reg_errcode_t
3034 __attribute__ ((always_inline))
3035 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3036 Idx *coll_sym_alloc, const unsigned char *name)
3037 {
3038 int32_t elem, idx;
3039 size_t name_len = strlen ((const char *) name);
3040 if (nrules != 0)
3041 {
3042 elem = seek_collating_symbol_entry (name, name_len);
3043 if (elem != -1)
3044 {
3045
3046 idx = symb_table[2 * elem + 1];
3047
3048 idx += 1 + extra[idx];
3049 }
3050 else if (name_len == 1)
3051 {
3052
3053
3054 bitset_set (sbcset, name[0]);
3055 return REG_NOERROR;
3056 }
3057 else
3058 return REG_ECOLLATE;
3059
3060
3061
3062 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3063 {
3064
3065
3066 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3067
3068
3069 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3070 new_coll_sym_alloc);
3071 if (__glibc_unlikely (new_coll_syms == NULL))
3072 return REG_ESPACE;
3073 mbcset->coll_syms = new_coll_syms;
3074 *coll_sym_alloc = new_coll_sym_alloc;
3075 }
3076 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3077 return REG_NOERROR;
3078 }
3079 else
3080 {
3081 if (__glibc_unlikely (name_len != 1))
3082 return REG_ECOLLATE;
3083 else
3084 {
3085 bitset_set (sbcset, name[0]);
3086 return REG_NOERROR;
3087 }
3088 }
3089 }
3090 #endif
3091
3092 re_token_t br_token;
3093 re_bitset_ptr_t sbcset;
3094 #ifdef RE_ENABLE_I18N
3095 re_charset_t *mbcset;
3096 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3097 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3098 #endif
3099 bool non_match = false;
3100 bin_tree_t *work_tree;
3101 int token_len;
3102 bool first_round = true;
3103 #ifdef _LIBC
3104 collseqmb = (const unsigned char *)
3105 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3106 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3107 if (nrules)
3108 {
3109
3110
3111
3112 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3113 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3114 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3115 _NL_COLLATE_SYMB_TABLEMB);
3116 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3117 _NL_COLLATE_SYMB_EXTRAMB);
3118 }
3119 #endif
3120 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3121 #ifdef RE_ENABLE_I18N
3122 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3123 #endif
3124 #ifdef RE_ENABLE_I18N
3125 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3126 #else
3127 if (__glibc_unlikely (sbcset == NULL))
3128 #endif
3129 {
3130 re_free (sbcset);
3131 #ifdef RE_ENABLE_I18N
3132 re_free (mbcset);
3133 #endif
3134 *err = REG_ESPACE;
3135 return NULL;
3136 }
3137
3138 token_len = peek_token_bracket (token, regexp, syntax);
3139 if (__glibc_unlikely (token->type == END_OF_RE))
3140 {
3141 *err = REG_BADPAT;
3142 goto parse_bracket_exp_free_return;
3143 }
3144 if (token->type == OP_NON_MATCH_LIST)
3145 {
3146 #ifdef RE_ENABLE_I18N
3147 mbcset->non_match = 1;
3148 #endif
3149 non_match = true;
3150 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3151 bitset_set (sbcset, '\n');
3152 re_string_skip_bytes (regexp, token_len);
3153 token_len = peek_token_bracket (token, regexp, syntax);
3154 if (__glibc_unlikely (token->type == END_OF_RE))
3155 {
3156 *err = REG_BADPAT;
3157 goto parse_bracket_exp_free_return;
3158 }
3159 }
3160
3161
3162 if (token->type == OP_CLOSE_BRACKET)
3163 token->type = CHARACTER;
3164
3165 while (1)
3166 {
3167 bracket_elem_t start_elem, end_elem;
3168 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3169 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3170 reg_errcode_t ret;
3171 int token_len2 = 0;
3172 bool is_range_exp = false;
3173 re_token_t token2;
3174
3175 start_elem.opr.name = start_name_buf;
3176 start_elem.type = COLL_SYM;
3177 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3178 syntax, first_round);
3179 if (__glibc_unlikely (ret != REG_NOERROR))
3180 {
3181 *err = ret;
3182 goto parse_bracket_exp_free_return;
3183 }
3184 first_round = false;
3185
3186
3187 token_len = peek_token_bracket (token, regexp, syntax);
3188
3189
3190 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3191 {
3192 if (__glibc_unlikely (token->type == END_OF_RE))
3193 {
3194 *err = REG_EBRACK;
3195 goto parse_bracket_exp_free_return;
3196 }
3197 if (token->type == OP_CHARSET_RANGE)
3198 {
3199 re_string_skip_bytes (regexp, token_len);
3200 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3201 if (__glibc_unlikely (token2.type == END_OF_RE))
3202 {
3203 *err = REG_EBRACK;
3204 goto parse_bracket_exp_free_return;
3205 }
3206 if (token2.type == OP_CLOSE_BRACKET)
3207 {
3208
3209 re_string_skip_bytes (regexp, -token_len);
3210 token->type = CHARACTER;
3211 }
3212 else
3213 is_range_exp = true;
3214 }
3215 }
3216
3217 if (is_range_exp == true)
3218 {
3219 end_elem.opr.name = end_name_buf;
3220 end_elem.type = COLL_SYM;
3221 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3222 dfa, syntax, true);
3223 if (__glibc_unlikely (ret != REG_NOERROR))
3224 {
3225 *err = ret;
3226 goto parse_bracket_exp_free_return;
3227 }
3228
3229 token_len = peek_token_bracket (token, regexp, syntax);
3230
3231 #ifdef _LIBC
3232 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3233 &start_elem, &end_elem);
3234 #else
3235 # ifdef RE_ENABLE_I18N
3236 *err = build_range_exp (syntax, sbcset,
3237 dfa->mb_cur_max > 1 ? mbcset : NULL,
3238 &range_alloc, &start_elem, &end_elem);
3239 # else
3240 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3241 # endif
3242 #endif
3243 if (__glibc_unlikely (*err != REG_NOERROR))
3244 goto parse_bracket_exp_free_return;
3245 }
3246 else
3247 {
3248 switch (start_elem.type)
3249 {
3250 case SB_CHAR:
3251 bitset_set (sbcset, start_elem.opr.ch);
3252 break;
3253 #ifdef RE_ENABLE_I18N
3254 case MB_CHAR:
3255
3256 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3257 {
3258 wchar_t *new_mbchars;
3259
3260
3261 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3262
3263 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3264 mbchar_alloc);
3265 if (__glibc_unlikely (new_mbchars == NULL))
3266 goto parse_bracket_exp_espace;
3267 mbcset->mbchars = new_mbchars;
3268 }
3269 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3270 break;
3271 #endif
3272 case EQUIV_CLASS:
3273 *err = build_equiv_class (sbcset,
3274 #ifdef RE_ENABLE_I18N
3275 mbcset, &equiv_class_alloc,
3276 #endif
3277 start_elem.opr.name);
3278 if (__glibc_unlikely (*err != REG_NOERROR))
3279 goto parse_bracket_exp_free_return;
3280 break;
3281 case COLL_SYM:
3282 *err = build_collating_symbol (sbcset,
3283 #ifdef RE_ENABLE_I18N
3284 mbcset, &coll_sym_alloc,
3285 #endif
3286 start_elem.opr.name);
3287 if (__glibc_unlikely (*err != REG_NOERROR))
3288 goto parse_bracket_exp_free_return;
3289 break;
3290 case CHAR_CLASS:
3291 *err = build_charclass (regexp->trans, sbcset,
3292 #ifdef RE_ENABLE_I18N
3293 mbcset, &char_class_alloc,
3294 #endif
3295 (const char *) start_elem.opr.name,
3296 syntax);
3297 if (__glibc_unlikely (*err != REG_NOERROR))
3298 goto parse_bracket_exp_free_return;
3299 break;
3300 default:
3301 DEBUG_ASSERT (false);
3302 break;
3303 }
3304 }
3305 if (__glibc_unlikely (token->type == END_OF_RE))
3306 {
3307 *err = REG_EBRACK;
3308 goto parse_bracket_exp_free_return;
3309 }
3310 if (token->type == OP_CLOSE_BRACKET)
3311 break;
3312 }
3313
3314 re_string_skip_bytes (regexp, token_len);
3315
3316
3317 if (non_match)
3318 bitset_not (sbcset);
3319
3320 #ifdef RE_ENABLE_I18N
3321
3322 if (dfa->mb_cur_max > 1)
3323 bitset_mask (sbcset, dfa->sb_char);
3324
3325 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3326 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3327 || mbcset->non_match)))
3328 {
3329 bin_tree_t *mbc_tree;
3330 int sbc_idx;
3331
3332 dfa->has_mb_node = 1;
3333 br_token.type = COMPLEX_BRACKET;
3334 br_token.opr.mbcset = mbcset;
3335 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3336 if (__glibc_unlikely (mbc_tree == NULL))
3337 goto parse_bracket_exp_espace;
3338 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3339 if (sbcset[sbc_idx])
3340 break;
3341
3342
3343 if (sbc_idx < BITSET_WORDS)
3344 {
3345
3346 br_token.type = SIMPLE_BRACKET;
3347 br_token.opr.sbcset = sbcset;
3348 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3349 if (__glibc_unlikely (work_tree == NULL))
3350 goto parse_bracket_exp_espace;
3351
3352
3353 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3354 if (__glibc_unlikely (work_tree == NULL))
3355 goto parse_bracket_exp_espace;
3356 }
3357 else
3358 {
3359 re_free (sbcset);
3360 work_tree = mbc_tree;
3361 }
3362 }
3363 else
3364 #endif
3365 {
3366 #ifdef RE_ENABLE_I18N
3367 free_charset (mbcset);
3368 #endif
3369
3370 br_token.type = SIMPLE_BRACKET;
3371 br_token.opr.sbcset = sbcset;
3372 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3373 if (__glibc_unlikely (work_tree == NULL))
3374 goto parse_bracket_exp_espace;
3375 }
3376 return work_tree;
3377
3378 parse_bracket_exp_espace:
3379 *err = REG_ESPACE;
3380 parse_bracket_exp_free_return:
3381 re_free (sbcset);
3382 #ifdef RE_ENABLE_I18N
3383 free_charset (mbcset);
3384 #endif
3385 return NULL;
3386 }
3387
3388
3389
3390 static reg_errcode_t
3391 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3392 re_token_t *token, int token_len, re_dfa_t *dfa,
3393 reg_syntax_t syntax, bool accept_hyphen)
3394 {
3395 #ifdef RE_ENABLE_I18N
3396 int cur_char_size;
3397 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3398 if (cur_char_size > 1)
3399 {
3400 elem->type = MB_CHAR;
3401 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3402 re_string_skip_bytes (regexp, cur_char_size);
3403 return REG_NOERROR;
3404 }
3405 #endif
3406 re_string_skip_bytes (regexp, token_len);
3407 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3408 || token->type == OP_OPEN_EQUIV_CLASS)
3409 return parse_bracket_symbol (elem, regexp, token);
3410 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3411 {
3412
3413
3414 re_token_t token2;
3415 (void) peek_token_bracket (&token2, regexp, syntax);
3416 if (token2.type != OP_CLOSE_BRACKET)
3417
3418
3419 return REG_ERANGE;
3420 }
3421 elem->type = SB_CHAR;
3422 elem->opr.ch = token->opr.c;
3423 return REG_NOERROR;
3424 }
3425
3426
3427
3428
3429
3430 static reg_errcode_t
3431 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3432 re_token_t *token)
3433 {
3434 unsigned char ch, delim = token->opr.c;
3435 int i = 0;
3436 if (re_string_eoi(regexp))
3437 return REG_EBRACK;
3438 for (;; ++i)
3439 {
3440 if (i >= BRACKET_NAME_BUF_SIZE)
3441 return REG_EBRACK;
3442 if (token->type == OP_OPEN_CHAR_CLASS)
3443 ch = re_string_fetch_byte_case (regexp);
3444 else
3445 ch = re_string_fetch_byte (regexp);
3446 if (re_string_eoi(regexp))
3447 return REG_EBRACK;
3448 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3449 break;
3450 elem->opr.name[i] = ch;
3451 }
3452 re_string_skip_bytes (regexp, 1);
3453 elem->opr.name[i] = '\0';
3454 switch (token->type)
3455 {
3456 case OP_OPEN_COLL_ELEM:
3457 elem->type = COLL_SYM;
3458 break;
3459 case OP_OPEN_EQUIV_CLASS:
3460 elem->type = EQUIV_CLASS;
3461 break;
3462 case OP_OPEN_CHAR_CLASS:
3463 elem->type = CHAR_CLASS;
3464 break;
3465 default:
3466 break;
3467 }
3468 return REG_NOERROR;
3469 }
3470
3471
3472
3473
3474
3475
3476
3477 static reg_errcode_t
3478 #ifdef RE_ENABLE_I18N
3479 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3480 Idx *equiv_class_alloc, const unsigned char *name)
3481 #else
3482 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3483 #endif
3484 {
3485 #ifdef _LIBC
3486 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3487 if (nrules != 0)
3488 {
3489 const int32_t *table, *indirect;
3490 const unsigned char *weights, *extra, *cp;
3491 unsigned char char_buf[2];
3492 int32_t idx1, idx2;
3493 unsigned int ch;
3494 size_t len;
3495
3496 cp = name;
3497 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3498 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3499 _NL_COLLATE_WEIGHTMB);
3500 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3501 _NL_COLLATE_EXTRAMB);
3502 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3503 _NL_COLLATE_INDIRECTMB);
3504 idx1 = findidx (table, indirect, extra, &cp, -1);
3505 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3506
3507 return REG_ECOLLATE;
3508
3509
3510 len = weights[idx1 & 0xffffff];
3511 for (ch = 0; ch < SBC_MAX; ++ch)
3512 {
3513 char_buf[0] = ch;
3514 cp = char_buf;
3515 idx2 = findidx (table, indirect, extra, &cp, 1);
3516
3517
3518
3519 if (idx2 == 0)
3520
3521 continue;
3522
3523
3524 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3525 && memcmp (weights + (idx1 & 0xffffff) + 1,
3526 weights + (idx2 & 0xffffff) + 1, len) == 0)
3527 bitset_set (sbcset, ch);
3528 }
3529
3530 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3531 {
3532
3533
3534 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3535
3536 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3537 int32_t,
3538 new_equiv_class_alloc);
3539 if (__glibc_unlikely (new_equiv_classes == NULL))
3540 return REG_ESPACE;
3541 mbcset->equiv_classes = new_equiv_classes;
3542 *equiv_class_alloc = new_equiv_class_alloc;
3543 }
3544 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3545 }
3546 else
3547 #endif
3548 {
3549 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3550 return REG_ECOLLATE;
3551 bitset_set (sbcset, *name);
3552 }
3553 return REG_NOERROR;
3554 }
3555
3556
3557
3558
3559
3560
3561
3562 static reg_errcode_t
3563 #ifdef RE_ENABLE_I18N
3564 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3565 re_charset_t *mbcset, Idx *char_class_alloc,
3566 const char *class_name, reg_syntax_t syntax)
3567 #else
3568 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3569 const char *class_name, reg_syntax_t syntax)
3570 #endif
3571 {
3572 int i;
3573 const char *name = class_name;
3574
3575
3576
3577 if ((syntax & RE_ICASE)
3578 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3579 name = "alpha";
3580
3581 #ifdef RE_ENABLE_I18N
3582
3583 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3584 {
3585
3586
3587 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3588
3589 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3590 new_char_class_alloc);
3591 if (__glibc_unlikely (new_char_classes == NULL))
3592 return REG_ESPACE;
3593 mbcset->char_classes = new_char_classes;
3594 *char_class_alloc = new_char_class_alloc;
3595 }
3596 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3597 #endif
3598
3599 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3600 do { \
3601 if (__glibc_unlikely (trans != NULL)) \
3602 { \
3603 for (i = 0; i < SBC_MAX; ++i) \
3604 if (ctype_func (i)) \
3605 bitset_set (sbcset, trans[i]); \
3606 } \
3607 else \
3608 { \
3609 for (i = 0; i < SBC_MAX; ++i) \
3610 if (ctype_func (i)) \
3611 bitset_set (sbcset, i); \
3612 } \
3613 } while (0)
3614
3615 if (strcmp (name, "alnum") == 0)
3616 BUILD_CHARCLASS_LOOP (isalnum);
3617 else if (strcmp (name, "cntrl") == 0)
3618 BUILD_CHARCLASS_LOOP (iscntrl);
3619 else if (strcmp (name, "lower") == 0)
3620 BUILD_CHARCLASS_LOOP (islower);
3621 else if (strcmp (name, "space") == 0)
3622 BUILD_CHARCLASS_LOOP (isspace);
3623 else if (strcmp (name, "alpha") == 0)
3624 BUILD_CHARCLASS_LOOP (isalpha);
3625 else if (strcmp (name, "digit") == 0)
3626 BUILD_CHARCLASS_LOOP (isdigit);
3627 else if (strcmp (name, "print") == 0)
3628 BUILD_CHARCLASS_LOOP (isprint);
3629 else if (strcmp (name, "upper") == 0)
3630 BUILD_CHARCLASS_LOOP (isupper);
3631 else if (strcmp (name, "blank") == 0)
3632 BUILD_CHARCLASS_LOOP (isblank);
3633 else if (strcmp (name, "graph") == 0)
3634 BUILD_CHARCLASS_LOOP (isgraph);
3635 else if (strcmp (name, "punct") == 0)
3636 BUILD_CHARCLASS_LOOP (ispunct);
3637 else if (strcmp (name, "xdigit") == 0)
3638 BUILD_CHARCLASS_LOOP (isxdigit);
3639 else
3640 return REG_ECTYPE;
3641
3642 return REG_NOERROR;
3643 }
3644
3645 static bin_tree_t *
3646 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3647 const char *class_name,
3648 const char *extra, bool non_match,
3649 reg_errcode_t *err)
3650 {
3651 re_bitset_ptr_t sbcset;
3652 #ifdef RE_ENABLE_I18N
3653 re_charset_t *mbcset;
3654 Idx alloc = 0;
3655 #endif
3656 reg_errcode_t ret;
3657 bin_tree_t *tree;
3658
3659 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3660 if (__glibc_unlikely (sbcset == NULL))
3661 {
3662 *err = REG_ESPACE;
3663 return NULL;
3664 }
3665 #ifdef RE_ENABLE_I18N
3666 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3667 if (__glibc_unlikely (mbcset == NULL))
3668 {
3669 re_free (sbcset);
3670 *err = REG_ESPACE;
3671 return NULL;
3672 }
3673 mbcset->non_match = non_match;
3674 #endif
3675
3676
3677 ret = build_charclass (trans, sbcset,
3678 #ifdef RE_ENABLE_I18N
3679 mbcset, &alloc,
3680 #endif
3681 class_name, 0);
3682
3683 if (__glibc_unlikely (ret != REG_NOERROR))
3684 {
3685 re_free (sbcset);
3686 #ifdef RE_ENABLE_I18N
3687 free_charset (mbcset);
3688 #endif
3689 *err = ret;
3690 return NULL;
3691 }
3692
3693 for (; *extra; extra++)
3694 bitset_set (sbcset, *extra);
3695
3696
3697 if (non_match)
3698 bitset_not (sbcset);
3699
3700 #ifdef RE_ENABLE_I18N
3701
3702 if (dfa->mb_cur_max > 1)
3703 bitset_mask (sbcset, dfa->sb_char);
3704 #endif
3705
3706
3707 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3708 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3709 if (__glibc_unlikely (tree == NULL))
3710 goto build_word_op_espace;
3711
3712 #ifdef RE_ENABLE_I18N
3713 if (dfa->mb_cur_max > 1)
3714 {
3715 bin_tree_t *mbc_tree;
3716
3717 br_token.type = COMPLEX_BRACKET;
3718 br_token.opr.mbcset = mbcset;
3719 dfa->has_mb_node = 1;
3720 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3721 if (__glibc_unlikely (mbc_tree == NULL))
3722 goto build_word_op_espace;
3723
3724 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3725 if (__glibc_likely (mbc_tree != NULL))
3726 return tree;
3727 }
3728 else
3729 {
3730 free_charset (mbcset);
3731 return tree;
3732 }
3733 #else
3734 return tree;
3735 #endif
3736
3737 build_word_op_espace:
3738 re_free (sbcset);
3739 #ifdef RE_ENABLE_I18N
3740 free_charset (mbcset);
3741 #endif
3742 *err = REG_ESPACE;
3743 return NULL;
3744 }
3745
3746
3747
3748
3749
3750
3751
3752 static Idx
3753 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3754 {
3755 Idx num = -1;
3756 unsigned char c;
3757 while (1)
3758 {
3759 fetch_token (token, input, syntax);
3760 c = token->opr.c;
3761 if (__glibc_unlikely (token->type == END_OF_RE))
3762 return -2;
3763 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3764 break;
3765 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3766 ? -2
3767 : num == -1
3768 ? c - '0'
3769 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3770 }
3771 return num;
3772 }
3773
3774 #ifdef RE_ENABLE_I18N
3775 static void
3776 free_charset (re_charset_t *cset)
3777 {
3778 re_free (cset->mbchars);
3779 # ifdef _LIBC
3780 re_free (cset->coll_syms);
3781 re_free (cset->equiv_classes);
3782 # endif
3783 re_free (cset->range_starts);
3784 re_free (cset->range_ends);
3785 re_free (cset->char_classes);
3786 re_free (cset);
3787 }
3788 #endif
3789
3790
3791
3792
3793
3794 static bin_tree_t *
3795 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3796 re_token_type_t type)
3797 {
3798 re_token_t t = { .type = type };
3799 return create_token_tree (dfa, left, right, &t);
3800 }
3801
3802 static bin_tree_t *
3803 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3804 const re_token_t *token)
3805 {
3806 bin_tree_t *tree;
3807 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3808 {
3809 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3810
3811 if (storage == NULL)
3812 return NULL;
3813 storage->next = dfa->str_tree_storage;
3814 dfa->str_tree_storage = storage;
3815 dfa->str_tree_storage_idx = 0;
3816 }
3817 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3818
3819 tree->parent = NULL;
3820 tree->left = left;
3821 tree->right = right;
3822 tree->token = *token;
3823 tree->token.duplicated = 0;
3824 tree->token.opt_subexp = 0;
3825 tree->first = NULL;
3826 tree->next = NULL;
3827 tree->node_idx = -1;
3828
3829 if (left != NULL)
3830 left->parent = tree;
3831 if (right != NULL)
3832 right->parent = tree;
3833 return tree;
3834 }
3835
3836
3837
3838
3839 static reg_errcode_t
3840 mark_opt_subexp (void *extra, bin_tree_t *node)
3841 {
3842 Idx idx = (uintptr_t) extra;
3843 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3844 node->token.opt_subexp = 1;
3845
3846 return REG_NOERROR;
3847 }
3848
3849
3850
3851 static void
3852 free_token (re_token_t *node)
3853 {
3854 #ifdef RE_ENABLE_I18N
3855 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3856 free_charset (node->opr.mbcset);
3857 else
3858 #endif
3859 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3860 re_free (node->opr.sbcset);
3861 }
3862
3863
3864
3865
3866 static reg_errcode_t
3867 free_tree (void *extra, bin_tree_t *node)
3868 {
3869 free_token (&node->token);
3870 return REG_NOERROR;
3871 }
3872
3873
3874
3875
3876
3877
3878
3879 static bin_tree_t *
3880 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3881 {
3882 const bin_tree_t *node;
3883 bin_tree_t *dup_root;
3884 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3885
3886 for (node = root; ; )
3887 {
3888
3889 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3890 if (*p_new == NULL)
3891 return NULL;
3892 (*p_new)->parent = dup_node;
3893 (*p_new)->token.duplicated = 1;
3894 dup_node = *p_new;
3895
3896
3897 if (node->left)
3898 {
3899 node = node->left;
3900 p_new = &dup_node->left;
3901 }
3902 else
3903 {
3904 const bin_tree_t *prev = NULL;
3905 while (node->right == prev || node->right == NULL)
3906 {
3907 prev = node;
3908 node = node->parent;
3909 dup_node = dup_node->parent;
3910 if (!node)
3911 return dup_root;
3912 }
3913 node = node->right;
3914 p_new = &dup_node->right;
3915 }
3916 }
3917 }