root/maint/gnulib/lib/regex_internal.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. re_string_allocate
  2. re_string_construct
  3. re_string_realloc_buffers
  4. re_string_construct_common
  5. build_wcs_buffer
  6. build_wcs_upper_buffer
  7. re_string_skip_chars
  8. build_upper_buffer
  9. re_string_translate_buffer
  10. re_string_reconstruct
  11. re_string_peek_byte_case
  12. re_string_fetch_byte_case
  13. re_string_destruct
  14. re_string_context_at
  15. re_node_set_alloc
  16. re_node_set_init_1
  17. re_node_set_init_2
  18. re_node_set_init_copy
  19. re_node_set_add_intersect
  20. re_node_set_init_union
  21. re_node_set_merge
  22. re_node_set_insert
  23. re_node_set_insert_last
  24. re_node_set_compare
  25. re_node_set_contains
  26. re_node_set_remove_at
  27. re_dfa_add_node
  28. calc_state_hash
  29. re_acquire_state
  30. re_acquire_state_context
  31. register_state
  32. free_state
  33. create_ci_newstate
  34. create_cd_newstate

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5 
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10 
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15 
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19 
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa);
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash);
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash);
  31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
  32                                                 Idx new_buf_len);
  33 static void build_wcs_buffer (re_string_t *pstr);
  34 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
  35 static void build_upper_buffer (re_string_t *pstr);
  36 static void re_string_translate_buffer (re_string_t *pstr);
  37 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
  38                                           int eflags) __attribute__ ((pure));
  39 
  40 /* Functions for string operation.  */
  41 
  42 /* This function allocate the buffers.  It is necessary to call
  43    re_string_reconstruct before using the object.  */
  44 
  45 static reg_errcode_t
  46 __attribute_warn_unused_result__
  47 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
     /* [previous][next][first][last][top][bottom][index][help] */
  48                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  49 {
  50   reg_errcode_t ret;
  51   Idx init_buf_len;
  52 
  53   /* Ensure at least one character fits into the buffers.  */
  54   if (init_len < dfa->mb_cur_max)
  55     init_len = dfa->mb_cur_max;
  56   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  57   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  58 
  59   ret = re_string_realloc_buffers (pstr, init_buf_len);
  60   if (__glibc_unlikely (ret != REG_NOERROR))
  61     return ret;
  62 
  63   pstr->word_char = dfa->word_char;
  64   pstr->word_ops_used = dfa->word_ops_used;
  65   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  66   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  67   pstr->valid_raw_len = pstr->valid_len;
  68   return REG_NOERROR;
  69 }
  70 
  71 /* This function allocate the buffers, and initialize them.  */
  72 
  73 static reg_errcode_t
  74 __attribute_warn_unused_result__
  75 re_string_construct (re_string_t *pstr, const char *str, Idx len,
     /* [previous][next][first][last][top][bottom][index][help] */
  76                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  77 {
  78   reg_errcode_t ret;
  79   memset (pstr, '\0', sizeof (re_string_t));
  80   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  81 
  82   if (len > 0)
  83     {
  84       ret = re_string_realloc_buffers (pstr, len + 1);
  85       if (__glibc_unlikely (ret != REG_NOERROR))
  86         return ret;
  87     }
  88   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  89 
  90   if (icase)
  91     {
  92       if (dfa->mb_cur_max > 1)
  93         {
  94           while (1)
  95             {
  96               ret = build_wcs_upper_buffer (pstr);
  97               if (__glibc_unlikely (ret != REG_NOERROR))
  98                 return ret;
  99               if (pstr->valid_raw_len >= len)
 100                 break;
 101               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 102                 break;
 103               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 104               if (__glibc_unlikely (ret != REG_NOERROR))
 105                 return ret;
 106             }
 107         }
 108       else
 109         build_upper_buffer (pstr);
 110     }
 111   else
 112     {
 113       if (dfa->mb_cur_max > 1)
 114         build_wcs_buffer (pstr);
 115       else
 116         {
 117           if (trans != NULL)
 118             re_string_translate_buffer (pstr);
 119           else
 120             {
 121               pstr->valid_len = pstr->bufs_len;
 122               pstr->valid_raw_len = pstr->bufs_len;
 123             }
 124         }
 125     }
 126 
 127   return REG_NOERROR;
 128 }
 129 
 130 /* Helper functions for re_string_allocate, and re_string_construct.  */
 131 
 132 static reg_errcode_t
 133 __attribute_warn_unused_result__
 134 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
     /* [previous][next][first][last][top][bottom][index][help] */
 135 {
 136   if (pstr->mb_cur_max > 1)
 137     {
 138       wint_t *new_wcs;
 139 
 140       /* Avoid overflow in realloc.  */
 141       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 142       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
 143                             < new_buf_len))
 144         return REG_ESPACE;
 145 
 146       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 147       if (__glibc_unlikely (new_wcs == NULL))
 148         return REG_ESPACE;
 149       pstr->wcs = new_wcs;
 150       if (pstr->offsets != NULL)
 151         {
 152           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 153           if (__glibc_unlikely (new_offsets == NULL))
 154             return REG_ESPACE;
 155           pstr->offsets = new_offsets;
 156         }
 157     }
 158   if (pstr->mbs_allocated)
 159     {
 160       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 161                                            new_buf_len);
 162       if (__glibc_unlikely (new_mbs == NULL))
 163         return REG_ESPACE;
 164       pstr->mbs = new_mbs;
 165     }
 166   pstr->bufs_len = new_buf_len;
 167   return REG_NOERROR;
 168 }
 169 
 170 
 171 static void
 172 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
     /* [previous][next][first][last][top][bottom][index][help] */
 173                             RE_TRANSLATE_TYPE trans, bool icase,
 174                             const re_dfa_t *dfa)
 175 {
 176   pstr->raw_mbs = (const unsigned char *) str;
 177   pstr->len = len;
 178   pstr->raw_len = len;
 179   pstr->trans = trans;
 180   pstr->icase = icase;
 181   pstr->mbs_allocated = (trans != NULL || icase);
 182   pstr->mb_cur_max = dfa->mb_cur_max;
 183   pstr->is_utf8 = dfa->is_utf8;
 184   pstr->map_notascii = dfa->map_notascii;
 185   pstr->stop = pstr->len;
 186   pstr->raw_stop = pstr->stop;
 187 }
 188 
 189 
 190 /* Build wide character buffer PSTR->WCS.
 191    If the byte sequence of the string are:
 192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 193    Then wide character buffer will be:
 194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 195    We use WEOF for padding, they indicate that the position isn't
 196    a first byte of a multibyte character.
 197 
 198    Note that this function assumes PSTR->VALID_LEN elements are already
 199    built and starts from PSTR->VALID_LEN.  */
 200 
 201 static void
 202 build_wcs_buffer (re_string_t *pstr)
     /* [previous][next][first][last][top][bottom][index][help] */
 203 {
 204 #ifdef _LIBC
 205   unsigned char buf[MB_LEN_MAX];
 206   DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
 207 #else
 208   unsigned char buf[64];
 209 #endif
 210   mbstate_t prev_st;
 211   Idx byte_idx, end_idx, remain_len;
 212   size_t mbclen;
 213 
 214   /* Build the buffers from pstr->valid_len to either pstr->len or
 215      pstr->bufs_len.  */
 216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 218     {
 219       wchar_t wc;
 220       const char *p;
 221 
 222       remain_len = end_idx - byte_idx;
 223       prev_st = pstr->cur_state;
 224       /* Apply the translation if we need.  */
 225       if (__glibc_unlikely (pstr->trans != NULL))
 226         {
 227           int i, ch;
 228 
 229           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 230             {
 231               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 232               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 233             }
 234           p = (const char *) buf;
 235         }
 236       else
 237         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 238       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 239       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
 240                             || (mbclen == (size_t) -2
 241                                 && pstr->bufs_len >= pstr->len)))
 242         {
 243           /* We treat these cases as a singlebyte character.  */
 244           mbclen = 1;
 245           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 246           if (__glibc_unlikely (pstr->trans != NULL))
 247             wc = pstr->trans[wc];
 248           pstr->cur_state = prev_st;
 249         }
 250       else if (__glibc_unlikely (mbclen == (size_t) -2))
 251         {
 252           /* The buffer doesn't have enough space, finish to build.  */
 253           pstr->cur_state = prev_st;
 254           break;
 255         }
 256 
 257       /* Write wide character and padding.  */
 258       pstr->wcs[byte_idx++] = wc;
 259       /* Write paddings.  */
 260       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 261         pstr->wcs[byte_idx++] = WEOF;
 262     }
 263   pstr->valid_len = byte_idx;
 264   pstr->valid_raw_len = byte_idx;
 265 }
 266 
 267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 268    but for REG_ICASE.  */
 269 
 270 static reg_errcode_t
 271 __attribute_warn_unused_result__
 272 build_wcs_upper_buffer (re_string_t *pstr)
     /* [previous][next][first][last][top][bottom][index][help] */
 273 {
 274   mbstate_t prev_st;
 275   Idx src_idx, byte_idx, end_idx, remain_len;
 276   size_t mbclen;
 277 #ifdef _LIBC
 278   char buf[MB_LEN_MAX];
 279   DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
 280 #else
 281   char buf[64];
 282 #endif
 283 
 284   byte_idx = pstr->valid_len;
 285   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 286 
 287   /* The following optimization assumes that ASCII characters can be
 288      mapped to wide characters with a simple cast.  */
 289   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 290     {
 291       while (byte_idx < end_idx)
 292         {
 293           wchar_t wc;
 294           unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 295 
 296           if (isascii (ch) && mbsinit (&pstr->cur_state))
 297             {
 298               /* The next step uses the assumption that wchar_t is encoded
 299                  ASCII-safe: all ASCII values can be converted like this.  */
 300               wchar_t wcu = __towupper (ch);
 301               if (isascii (wcu))
 302                 {
 303                   pstr->mbs[byte_idx] = wcu;
 304                   pstr->wcs[byte_idx] = wcu;
 305                   byte_idx++;
 306                   continue;
 307                 }
 308             }
 309 
 310           remain_len = end_idx - byte_idx;
 311           prev_st = pstr->cur_state;
 312           mbclen = __mbrtowc (&wc,
 313                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 314                                + byte_idx), remain_len, &pstr->cur_state);
 315           if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 316             {
 317               wchar_t wcu = __towupper (wc);
 318               if (wcu != wc)
 319                 {
 320                   size_t mbcdlen;
 321 
 322                   mbcdlen = __wcrtomb (buf, wcu, &prev_st);
 323                   if (__glibc_likely (mbclen == mbcdlen))
 324                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 325                   else
 326                     {
 327                       src_idx = byte_idx;
 328                       goto offsets_needed;
 329                     }
 330                 }
 331               else
 332                 memcpy (pstr->mbs + byte_idx,
 333                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 334               pstr->wcs[byte_idx++] = wcu;
 335               /* Write paddings.  */
 336               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 337                 pstr->wcs[byte_idx++] = WEOF;
 338             }
 339           else if (mbclen == (size_t) -1 || mbclen == 0
 340                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 341             {
 342               /* It is an invalid character, an incomplete character
 343                  at the end of the string, or '\0'.  Just use the byte.  */
 344               pstr->mbs[byte_idx] = ch;
 345               /* And also cast it to wide char.  */
 346               pstr->wcs[byte_idx++] = (wchar_t) ch;
 347               if (__glibc_unlikely (mbclen == (size_t) -1))
 348                 pstr->cur_state = prev_st;
 349             }
 350           else
 351             {
 352               /* The buffer doesn't have enough space, finish to build.  */
 353               pstr->cur_state = prev_st;
 354               break;
 355             }
 356         }
 357       pstr->valid_len = byte_idx;
 358       pstr->valid_raw_len = byte_idx;
 359       return REG_NOERROR;
 360     }
 361   else
 362     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 363       {
 364         wchar_t wc;
 365         const char *p;
 366       offsets_needed:
 367         remain_len = end_idx - byte_idx;
 368         prev_st = pstr->cur_state;
 369         if (__glibc_unlikely (pstr->trans != NULL))
 370           {
 371             int i, ch;
 372 
 373             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 374               {
 375                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 376                 buf[i] = pstr->trans[ch];
 377               }
 378             p = (const char *) buf;
 379           }
 380         else
 381           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 382         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 383         if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 384           {
 385             wchar_t wcu = __towupper (wc);
 386             if (wcu != wc)
 387               {
 388                 size_t mbcdlen;
 389 
 390                 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
 391                 if (__glibc_likely (mbclen == mbcdlen))
 392                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 393                 else if (mbcdlen != (size_t) -1)
 394                   {
 395                     size_t i;
 396 
 397                     if (byte_idx + mbcdlen > pstr->bufs_len)
 398                       {
 399                         pstr->cur_state = prev_st;
 400                         break;
 401                       }
 402 
 403                     if (pstr->offsets == NULL)
 404                       {
 405                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 406 
 407                         if (pstr->offsets == NULL)
 408                           return REG_ESPACE;
 409                       }
 410                     if (!pstr->offsets_needed)
 411                       {
 412                         for (i = 0; i < (size_t) byte_idx; ++i)
 413                           pstr->offsets[i] = i;
 414                         pstr->offsets_needed = 1;
 415                       }
 416 
 417                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 418                     pstr->wcs[byte_idx] = wcu;
 419                     pstr->offsets[byte_idx] = src_idx;
 420                     for (i = 1; i < mbcdlen; ++i)
 421                       {
 422                         pstr->offsets[byte_idx + i]
 423                           = src_idx + (i < mbclen ? i : mbclen - 1);
 424                         pstr->wcs[byte_idx + i] = WEOF;
 425                       }
 426                     pstr->len += mbcdlen - mbclen;
 427                     if (pstr->raw_stop > src_idx)
 428                       pstr->stop += mbcdlen - mbclen;
 429                     end_idx = (pstr->bufs_len > pstr->len)
 430                               ? pstr->len : pstr->bufs_len;
 431                     byte_idx += mbcdlen;
 432                     src_idx += mbclen;
 433                     continue;
 434                   }
 435                 else
 436                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 437               }
 438             else
 439               memcpy (pstr->mbs + byte_idx, p, mbclen);
 440 
 441             if (__glibc_unlikely (pstr->offsets_needed != 0))
 442               {
 443                 size_t i;
 444                 for (i = 0; i < mbclen; ++i)
 445                   pstr->offsets[byte_idx + i] = src_idx + i;
 446               }
 447             src_idx += mbclen;
 448 
 449             pstr->wcs[byte_idx++] = wcu;
 450             /* Write paddings.  */
 451             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 452               pstr->wcs[byte_idx++] = WEOF;
 453           }
 454         else if (mbclen == (size_t) -1 || mbclen == 0
 455                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 456           {
 457             /* It is an invalid character or '\0'.  Just use the byte.  */
 458             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 459 
 460             if (__glibc_unlikely (pstr->trans != NULL))
 461               ch = pstr->trans [ch];
 462             pstr->mbs[byte_idx] = ch;
 463 
 464             if (__glibc_unlikely (pstr->offsets_needed != 0))
 465               pstr->offsets[byte_idx] = src_idx;
 466             ++src_idx;
 467 
 468             /* And also cast it to wide char.  */
 469             pstr->wcs[byte_idx++] = (wchar_t) ch;
 470             if (__glibc_unlikely (mbclen == (size_t) -1))
 471               pstr->cur_state = prev_st;
 472           }
 473         else
 474           {
 475             /* The buffer doesn't have enough space, finish to build.  */
 476             pstr->cur_state = prev_st;
 477             break;
 478           }
 479       }
 480   pstr->valid_len = byte_idx;
 481   pstr->valid_raw_len = src_idx;
 482   return REG_NOERROR;
 483 }
 484 
 485 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 486    Return the index.  */
 487 
 488 static Idx
 489 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
     /* [previous][next][first][last][top][bottom][index][help] */
 490 {
 491   mbstate_t prev_st;
 492   Idx rawbuf_idx;
 493   size_t mbclen;
 494   wint_t wc = WEOF;
 495 
 496   /* Skip the characters which are not necessary to check.  */
 497   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 498        rawbuf_idx < new_raw_idx;)
 499     {
 500       wchar_t wc2;
 501       Idx remain_len = pstr->raw_len - rawbuf_idx;
 502       prev_st = pstr->cur_state;
 503       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 504                           remain_len, &pstr->cur_state);
 505       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
 506                             || mbclen == 0))
 507         {
 508           /* We treat these cases as a single byte character.  */
 509           if (mbclen == 0 || remain_len == 0)
 510             wc = L'\0';
 511           else
 512             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 513           mbclen = 1;
 514           pstr->cur_state = prev_st;
 515         }
 516       else
 517         wc = wc2;
 518       /* Then proceed the next character.  */
 519       rawbuf_idx += mbclen;
 520     }
 521   *last_wc = wc;
 522   return rawbuf_idx;
 523 }
 524 
 525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 526    This function is used in case of REG_ICASE.  */
 527 
 528 static void
 529 build_upper_buffer (re_string_t *pstr)
     /* [previous][next][first][last][top][bottom][index][help] */
 530 {
 531   Idx char_idx, end_idx;
 532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 533 
 534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 535     {
 536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 537       if (__glibc_unlikely (pstr->trans != NULL))
 538         ch = pstr->trans[ch];
 539       pstr->mbs[char_idx] = toupper (ch);
 540     }
 541   pstr->valid_len = char_idx;
 542   pstr->valid_raw_len = char_idx;
 543 }
 544 
 545 /* Apply TRANS to the buffer in PSTR.  */
 546 
 547 static void
 548 re_string_translate_buffer (re_string_t *pstr)
     /* [previous][next][first][last][top][bottom][index][help] */
 549 {
 550   Idx buf_idx, end_idx;
 551   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 552 
 553   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 554     {
 555       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 556       pstr->mbs[buf_idx] = pstr->trans[ch];
 557     }
 558 
 559   pstr->valid_len = buf_idx;
 560   pstr->valid_raw_len = buf_idx;
 561 }
 562 
 563 /* This function re-construct the buffers.
 564    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 565    convert to upper case in case of REG_ICASE, apply translation.  */
 566 
 567 static reg_errcode_t
 568 __attribute_warn_unused_result__
 569 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
     /* [previous][next][first][last][top][bottom][index][help] */
 570 {
 571   Idx offset;
 572 
 573   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
 574     offset = idx - pstr->raw_mbs_idx;
 575   else
 576     {
 577       /* Reset buffer.  */
 578       if (pstr->mb_cur_max > 1)
 579         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 580       pstr->len = pstr->raw_len;
 581       pstr->stop = pstr->raw_stop;
 582       pstr->valid_len = 0;
 583       pstr->raw_mbs_idx = 0;
 584       pstr->valid_raw_len = 0;
 585       pstr->offsets_needed = 0;
 586       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 587                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 588       if (!pstr->mbs_allocated)
 589         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 590       offset = idx;
 591     }
 592 
 593   if (__glibc_likely (offset != 0))
 594     {
 595       /* Should the already checked characters be kept?  */
 596       if (__glibc_likely (offset < pstr->valid_raw_len))
 597         {
 598           /* Yes, move them to the front of the buffer.  */
 599           if (__glibc_unlikely (pstr->offsets_needed))
 600             {
 601               Idx low = 0, high = pstr->valid_len, mid;
 602               do
 603                 {
 604                   mid = (high + low) / 2;
 605                   if (pstr->offsets[mid] > offset)
 606                     high = mid;
 607                   else if (pstr->offsets[mid] < offset)
 608                     low = mid + 1;
 609                   else
 610                     break;
 611                 }
 612               while (low < high);
 613               if (pstr->offsets[mid] < offset)
 614                 ++mid;
 615               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 616                                                         eflags);
 617               /* This can be quite complicated, so handle specially
 618                  only the common and easy case where the character with
 619                  different length representation of lower and upper
 620                  case is present at or after offset.  */
 621               if (pstr->valid_len > offset
 622                   && mid == offset && pstr->offsets[mid] == offset)
 623                 {
 624                   memmove (pstr->wcs, pstr->wcs + offset,
 625                            (pstr->valid_len - offset) * sizeof (wint_t));
 626                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 627                   pstr->valid_len -= offset;
 628                   pstr->valid_raw_len -= offset;
 629                   for (low = 0; low < pstr->valid_len; low++)
 630                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 631                 }
 632               else
 633                 {
 634                   /* Otherwise, just find out how long the partial multibyte
 635                      character at offset is and fill it with WEOF/255.  */
 636                   pstr->len = pstr->raw_len - idx + offset;
 637                   pstr->stop = pstr->raw_stop - idx + offset;
 638                   pstr->offsets_needed = 0;
 639                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 640                     --mid;
 641                   while (mid < pstr->valid_len)
 642                     if (pstr->wcs[mid] != WEOF)
 643                       break;
 644                     else
 645                       ++mid;
 646                   if (mid == pstr->valid_len)
 647                     pstr->valid_len = 0;
 648                   else
 649                     {
 650                       pstr->valid_len = pstr->offsets[mid] - offset;
 651                       if (pstr->valid_len)
 652                         {
 653                           for (low = 0; low < pstr->valid_len; ++low)
 654                             pstr->wcs[low] = WEOF;
 655                           memset (pstr->mbs, 255, pstr->valid_len);
 656                         }
 657                     }
 658                   pstr->valid_raw_len = pstr->valid_len;
 659                 }
 660             }
 661           else
 662             {
 663               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 664                                                         eflags);
 665               if (pstr->mb_cur_max > 1)
 666                 memmove (pstr->wcs, pstr->wcs + offset,
 667                          (pstr->valid_len - offset) * sizeof (wint_t));
 668               if (__glibc_unlikely (pstr->mbs_allocated))
 669                 memmove (pstr->mbs, pstr->mbs + offset,
 670                          pstr->valid_len - offset);
 671               pstr->valid_len -= offset;
 672               pstr->valid_raw_len -= offset;
 673               DEBUG_ASSERT (pstr->valid_len > 0);
 674             }
 675         }
 676       else
 677         {
 678           /* No, skip all characters until IDX.  */
 679           Idx prev_valid_len = pstr->valid_len;
 680 
 681           if (__glibc_unlikely (pstr->offsets_needed))
 682             {
 683               pstr->len = pstr->raw_len - idx + offset;
 684               pstr->stop = pstr->raw_stop - idx + offset;
 685               pstr->offsets_needed = 0;
 686             }
 687           pstr->valid_len = 0;
 688           if (pstr->mb_cur_max > 1)
 689             {
 690               Idx wcs_idx;
 691               wint_t wc = WEOF;
 692 
 693               if (pstr->is_utf8)
 694                 {
 695                   const unsigned char *raw, *p, *end;
 696 
 697                   /* Special case UTF-8.  Multi-byte chars start with any
 698                      byte other than 0x80 - 0xbf.  */
 699                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 700                   end = raw + (offset - pstr->mb_cur_max);
 701                   if (end < pstr->raw_mbs)
 702                     end = pstr->raw_mbs;
 703                   p = raw + offset - 1;
 704 #ifdef _LIBC
 705                   /* We know the wchar_t encoding is UCS4, so for the simple
 706                      case, ASCII characters, skip the conversion step.  */
 707                   if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
 708                     {
 709                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 710                       /* pstr->valid_len = 0; */
 711                       wc = (wchar_t) *p;
 712                     }
 713                   else
 714 #endif
 715                     for (; p >= end; --p)
 716                       if ((*p & 0xc0) != 0x80)
 717                         {
 718                           mbstate_t cur_state;
 719                           wchar_t wc2;
 720                           Idx mlen = raw + pstr->len - p;
 721                           unsigned char buf[6];
 722                           size_t mbclen;
 723 
 724                           const unsigned char *pp = p;
 725                           if (__glibc_unlikely (pstr->trans != NULL))
 726                             {
 727                               int i = mlen < 6 ? mlen : 6;
 728                               while (--i >= 0)
 729                                 buf[i] = pstr->trans[p[i]];
 730                               pp = buf;
 731                             }
 732                           /* XXX Don't use mbrtowc, we know which conversion
 733                              to use (UTF-8 -> UCS4).  */
 734                           memset (&cur_state, 0, sizeof (cur_state));
 735                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 736                                               &cur_state);
 737                           if (raw + offset - p <= mbclen
 738                               && mbclen < (size_t) -2)
 739                             {
 740                               memset (&pstr->cur_state, '\0',
 741                                       sizeof (mbstate_t));
 742                               pstr->valid_len = mbclen - (raw + offset - p);
 743                               wc = wc2;
 744                             }
 745                           break;
 746                         }
 747                 }
 748 
 749               if (wc == WEOF)
 750                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 751               if (wc == WEOF)
 752                 pstr->tip_context
 753                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 754               else
 755                 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
 756                                       && IS_WIDE_WORD_CHAR (wc))
 757                                      ? CONTEXT_WORD
 758                                      : ((IS_WIDE_NEWLINE (wc)
 759                                          && pstr->newline_anchor)
 760                                         ? CONTEXT_NEWLINE : 0));
 761               if (__glibc_unlikely (pstr->valid_len))
 762                 {
 763                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 764                     pstr->wcs[wcs_idx] = WEOF;
 765                   if (pstr->mbs_allocated)
 766                     memset (pstr->mbs, 255, pstr->valid_len);
 767                 }
 768               pstr->valid_raw_len = pstr->valid_len;
 769             }
 770           else
 771             {
 772               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 773               pstr->valid_raw_len = 0;
 774               if (pstr->trans)
 775                 c = pstr->trans[c];
 776               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 777                                    ? CONTEXT_WORD
 778                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 779                                       ? CONTEXT_NEWLINE : 0));
 780             }
 781         }
 782       if (!__glibc_unlikely (pstr->mbs_allocated))
 783         pstr->mbs += offset;
 784     }
 785   pstr->raw_mbs_idx = idx;
 786   pstr->len -= offset;
 787   pstr->stop -= offset;
 788 
 789   /* Then build the buffers.  */
 790   if (pstr->mb_cur_max > 1)
 791     {
 792       if (pstr->icase)
 793         {
 794           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 795           if (__glibc_unlikely (ret != REG_NOERROR))
 796             return ret;
 797         }
 798       else
 799         build_wcs_buffer (pstr);
 800     }
 801   else
 802     if (__glibc_unlikely (pstr->mbs_allocated))
 803       {
 804         if (pstr->icase)
 805           build_upper_buffer (pstr);
 806         else if (pstr->trans != NULL)
 807           re_string_translate_buffer (pstr);
 808       }
 809     else
 810       pstr->valid_len = pstr->len;
 811 
 812   pstr->cur_idx = 0;
 813   return REG_NOERROR;
 814 }
 815 
 816 static unsigned char
 817 __attribute__ ((pure))
 818 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
     /* [previous][next][first][last][top][bottom][index][help] */
 819 {
 820   int ch;
 821   Idx off;
 822 
 823   /* Handle the common (easiest) cases first.  */
 824   if (__glibc_likely (!pstr->mbs_allocated))
 825     return re_string_peek_byte (pstr, idx);
 826 
 827   if (pstr->mb_cur_max > 1
 828       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 829     return re_string_peek_byte (pstr, idx);
 830 
 831   off = pstr->cur_idx + idx;
 832   if (pstr->offsets_needed)
 833     off = pstr->offsets[off];
 834 
 835   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 836 
 837   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 838      this function returns CAPITAL LETTER I instead of first byte of
 839      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 840      since peek_byte_case doesn't advance cur_idx in any way.  */
 841   if (pstr->offsets_needed && !isascii (ch))
 842     return re_string_peek_byte (pstr, idx);
 843 
 844   return ch;
 845 }
 846 
 847 static unsigned char
 848 re_string_fetch_byte_case (re_string_t *pstr)
     /* [previous][next][first][last][top][bottom][index][help] */
 849 {
 850   if (__glibc_likely (!pstr->mbs_allocated))
 851     return re_string_fetch_byte (pstr);
 852 
 853   if (pstr->offsets_needed)
 854     {
 855       Idx off;
 856       int ch;
 857 
 858       /* For tr_TR.UTF-8 [[:islower:]] there is
 859          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 860          in that case the whole multi-byte character and return
 861          the original letter.  On the other side, with
 862          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 863          anything else would complicate things too much.  */
 864 
 865       if (!re_string_first_byte (pstr, pstr->cur_idx))
 866         return re_string_fetch_byte (pstr);
 867 
 868       off = pstr->offsets[pstr->cur_idx];
 869       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 870 
 871       if (! isascii (ch))
 872         return re_string_fetch_byte (pstr);
 873 
 874       re_string_skip_bytes (pstr,
 875                             re_string_char_size_at (pstr, pstr->cur_idx));
 876       return ch;
 877     }
 878 
 879   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 880 }
 881 
 882 static void
 883 re_string_destruct (re_string_t *pstr)
     /* [previous][next][first][last][top][bottom][index][help] */
 884 {
 885   re_free (pstr->wcs);
 886   re_free (pstr->offsets);
 887   if (pstr->mbs_allocated)
 888     re_free (pstr->mbs);
 889 }
 890 
 891 /* Return the context at IDX in INPUT.  */
 892 
 893 static unsigned int
 894 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
     /* [previous][next][first][last][top][bottom][index][help] */
 895 {
 896   int c;
 897   if (__glibc_unlikely (idx < 0))
 898     /* In this case, we use the value stored in input->tip_context,
 899        since we can't know the character in input->mbs[-1] here.  */
 900     return input->tip_context;
 901   if (__glibc_unlikely (idx == input->len))
 902     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 903             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 904   if (input->mb_cur_max > 1)
 905     {
 906       wint_t wc;
 907       Idx wc_idx = idx;
 908       while(input->wcs[wc_idx] == WEOF)
 909         {
 910           DEBUG_ASSERT (wc_idx >= 0);
 911           --wc_idx;
 912           if (wc_idx < 0)
 913             return input->tip_context;
 914         }
 915       wc = input->wcs[wc_idx];
 916       if (__glibc_unlikely (input->word_ops_used != 0)
 917           && IS_WIDE_WORD_CHAR (wc))
 918         return CONTEXT_WORD;
 919       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 920               ? CONTEXT_NEWLINE : 0);
 921     }
 922   else
 923     {
 924       c = re_string_byte_at (input, idx);
 925       if (bitset_contain (input->word_char, c))
 926         return CONTEXT_WORD;
 927       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 928     }
 929 }
 930 
 931 /* Functions for set operation.  */
 932 
 933 static reg_errcode_t
 934 __attribute_warn_unused_result__
 935 re_node_set_alloc (re_node_set *set, Idx size)
     /* [previous][next][first][last][top][bottom][index][help] */
 936 {
 937   set->alloc = size;
 938   set->nelem = 0;
 939   set->elems = re_malloc (Idx, size);
 940   if (__glibc_unlikely (set->elems == NULL)
 941       && (MALLOC_0_IS_NONNULL || size != 0))
 942     return REG_ESPACE;
 943   return REG_NOERROR;
 944 }
 945 
 946 static reg_errcode_t
 947 __attribute_warn_unused_result__
 948 re_node_set_init_1 (re_node_set *set, Idx elem)
     /* [previous][next][first][last][top][bottom][index][help] */
 949 {
 950   set->alloc = 1;
 951   set->nelem = 1;
 952   set->elems = re_malloc (Idx, 1);
 953   if (__glibc_unlikely (set->elems == NULL))
 954     {
 955       set->alloc = set->nelem = 0;
 956       return REG_ESPACE;
 957     }
 958   set->elems[0] = elem;
 959   return REG_NOERROR;
 960 }
 961 
 962 static reg_errcode_t
 963 __attribute_warn_unused_result__
 964 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
     /* [previous][next][first][last][top][bottom][index][help] */
 965 {
 966   set->alloc = 2;
 967   set->elems = re_malloc (Idx, 2);
 968   if (__glibc_unlikely (set->elems == NULL))
 969     return REG_ESPACE;
 970   if (elem1 == elem2)
 971     {
 972       set->nelem = 1;
 973       set->elems[0] = elem1;
 974     }
 975   else
 976     {
 977       set->nelem = 2;
 978       if (elem1 < elem2)
 979         {
 980           set->elems[0] = elem1;
 981           set->elems[1] = elem2;
 982         }
 983       else
 984         {
 985           set->elems[0] = elem2;
 986           set->elems[1] = elem1;
 987         }
 988     }
 989   return REG_NOERROR;
 990 }
 991 
 992 static reg_errcode_t
 993 __attribute_warn_unused_result__
 994 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
     /* [previous][next][first][last][top][bottom][index][help] */
 995 {
 996   dest->nelem = src->nelem;
 997   if (src->nelem > 0)
 998     {
 999       dest->alloc = dest->nelem;
1000       dest->elems = re_malloc (Idx, dest->alloc);
1001       if (__glibc_unlikely (dest->elems == NULL))
1002         {
1003           dest->alloc = dest->nelem = 0;
1004           return REG_ESPACE;
1005         }
1006       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1007     }
1008   else
1009     re_node_set_init_empty (dest);
1010   return REG_NOERROR;
1011 }
1012 
1013 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1014    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1015    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1016 
1017 static reg_errcode_t
1018 __attribute_warn_unused_result__
1019 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
     /* [previous][next][first][last][top][bottom][index][help] */
1020                            const re_node_set *src2)
1021 {
1022   Idx i1, i2, is, id, delta, sbase;
1023   if (src1->nelem == 0 || src2->nelem == 0)
1024     return REG_NOERROR;
1025 
1026   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1027      conservative estimate.  */
1028   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1029     {
1030       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1031       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1032       if (__glibc_unlikely (new_elems == NULL))
1033         return REG_ESPACE;
1034       dest->elems = new_elems;
1035       dest->alloc = new_alloc;
1036     }
1037 
1038   /* Find the items in the intersection of SRC1 and SRC2, and copy
1039      into the top of DEST those that are not already in DEST itself.  */
1040   sbase = dest->nelem + src1->nelem + src2->nelem;
1041   i1 = src1->nelem - 1;
1042   i2 = src2->nelem - 1;
1043   id = dest->nelem - 1;
1044   for (;;)
1045     {
1046       if (src1->elems[i1] == src2->elems[i2])
1047         {
1048           /* Try to find the item in DEST.  Maybe we could binary search?  */
1049           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1050             --id;
1051 
1052           if (id < 0 || dest->elems[id] != src1->elems[i1])
1053             dest->elems[--sbase] = src1->elems[i1];
1054 
1055           if (--i1 < 0 || --i2 < 0)
1056             break;
1057         }
1058 
1059       /* Lower the highest of the two items.  */
1060       else if (src1->elems[i1] < src2->elems[i2])
1061         {
1062           if (--i2 < 0)
1063             break;
1064         }
1065       else
1066         {
1067           if (--i1 < 0)
1068             break;
1069         }
1070     }
1071 
1072   id = dest->nelem - 1;
1073   is = dest->nelem + src1->nelem + src2->nelem - 1;
1074   delta = is - sbase + 1;
1075 
1076   /* Now copy.  When DELTA becomes zero, the remaining
1077      DEST elements are already in place; this is more or
1078      less the same loop that is in re_node_set_merge.  */
1079   dest->nelem += delta;
1080   if (delta > 0 && id >= 0)
1081     for (;;)
1082       {
1083         if (dest->elems[is] > dest->elems[id])
1084           {
1085             /* Copy from the top.  */
1086             dest->elems[id + delta--] = dest->elems[is--];
1087             if (delta == 0)
1088               break;
1089           }
1090         else
1091           {
1092             /* Slide from the bottom.  */
1093             dest->elems[id + delta] = dest->elems[id];
1094             if (--id < 0)
1095               break;
1096           }
1097       }
1098 
1099   /* Copy remaining SRC elements.  */
1100   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1101 
1102   return REG_NOERROR;
1103 }
1104 
1105 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1106    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1107 
1108 static reg_errcode_t
1109 __attribute_warn_unused_result__
1110 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
     /* [previous][next][first][last][top][bottom][index][help] */
1111                         const re_node_set *src2)
1112 {
1113   Idx i1, i2, id;
1114   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1115     {
1116       dest->alloc = src1->nelem + src2->nelem;
1117       dest->elems = re_malloc (Idx, dest->alloc);
1118       if (__glibc_unlikely (dest->elems == NULL))
1119         return REG_ESPACE;
1120     }
1121   else
1122     {
1123       if (src1 != NULL && src1->nelem > 0)
1124         return re_node_set_init_copy (dest, src1);
1125       else if (src2 != NULL && src2->nelem > 0)
1126         return re_node_set_init_copy (dest, src2);
1127       else
1128         re_node_set_init_empty (dest);
1129       return REG_NOERROR;
1130     }
1131   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1132     {
1133       if (src1->elems[i1] > src2->elems[i2])
1134         {
1135           dest->elems[id++] = src2->elems[i2++];
1136           continue;
1137         }
1138       if (src1->elems[i1] == src2->elems[i2])
1139         ++i2;
1140       dest->elems[id++] = src1->elems[i1++];
1141     }
1142   if (i1 < src1->nelem)
1143     {
1144       memcpy (dest->elems + id, src1->elems + i1,
1145              (src1->nelem - i1) * sizeof (Idx));
1146       id += src1->nelem - i1;
1147     }
1148   else if (i2 < src2->nelem)
1149     {
1150       memcpy (dest->elems + id, src2->elems + i2,
1151              (src2->nelem - i2) * sizeof (Idx));
1152       id += src2->nelem - i2;
1153     }
1154   dest->nelem = id;
1155   return REG_NOERROR;
1156 }
1157 
1158 /* Calculate the union set of the sets DEST and SRC. And store it to
1159    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1160 
1161 static reg_errcode_t
1162 __attribute_warn_unused_result__
1163 re_node_set_merge (re_node_set *dest, const re_node_set *src)
     /* [previous][next][first][last][top][bottom][index][help] */
1164 {
1165   Idx is, id, sbase, delta;
1166   if (src == NULL || src->nelem == 0)
1167     return REG_NOERROR;
1168   if (dest->alloc < 2 * src->nelem + dest->nelem)
1169     {
1170       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1171       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1172       if (__glibc_unlikely (new_buffer == NULL))
1173         return REG_ESPACE;
1174       dest->elems = new_buffer;
1175       dest->alloc = new_alloc;
1176     }
1177 
1178   if (__glibc_unlikely (dest->nelem == 0))
1179     {
1180       /* Although we already guaranteed above that dest->alloc != 0 and
1181          therefore dest->elems != NULL, add a debug assertion to pacify
1182          GCC 11.2.1's -fanalyzer.  */
1183       DEBUG_ASSERT (dest->elems);
1184       dest->nelem = src->nelem;
1185       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1186       return REG_NOERROR;
1187     }
1188 
1189   /* Copy into the top of DEST the items of SRC that are not
1190      found in DEST.  Maybe we could binary search in DEST?  */
1191   for (sbase = dest->nelem + 2 * src->nelem,
1192        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1193     {
1194       if (dest->elems[id] == src->elems[is])
1195         is--, id--;
1196       else if (dest->elems[id] < src->elems[is])
1197         dest->elems[--sbase] = src->elems[is--];
1198       else /* if (dest->elems[id] > src->elems[is]) */
1199         --id;
1200     }
1201 
1202   if (is >= 0)
1203     {
1204       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1205       sbase -= is + 1;
1206       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1207     }
1208 
1209   id = dest->nelem - 1;
1210   is = dest->nelem + 2 * src->nelem - 1;
1211   delta = is - sbase + 1;
1212   if (delta == 0)
1213     return REG_NOERROR;
1214 
1215   /* Now copy.  When DELTA becomes zero, the remaining
1216      DEST elements are already in place.  */
1217   dest->nelem += delta;
1218   for (;;)
1219     {
1220       if (dest->elems[is] > dest->elems[id])
1221         {
1222           /* Copy from the top.  */
1223           dest->elems[id + delta--] = dest->elems[is--];
1224           if (delta == 0)
1225             break;
1226         }
1227       else
1228         {
1229           /* Slide from the bottom.  */
1230           dest->elems[id + delta] = dest->elems[id];
1231           if (--id < 0)
1232             {
1233               /* Copy remaining SRC elements.  */
1234               memcpy (dest->elems, dest->elems + sbase,
1235                       delta * sizeof (Idx));
1236               break;
1237             }
1238         }
1239     }
1240 
1241   return REG_NOERROR;
1242 }
1243 
1244 /* Insert the new element ELEM to the re_node_set* SET.
1245    SET should not already have ELEM.
1246    Return true if successful.  */
1247 
1248 static bool
1249 __attribute_warn_unused_result__
1250 re_node_set_insert (re_node_set *set, Idx elem)
     /* [previous][next][first][last][top][bottom][index][help] */
1251 {
1252   Idx idx;
1253   /* In case the set is empty.  */
1254   if (set->alloc == 0)
1255     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1256 
1257   if (__glibc_unlikely (set->nelem) == 0)
1258     {
1259       /* Although we already guaranteed above that set->alloc != 0 and
1260          therefore set->elems != NULL, add a debug assertion to pacify
1261          GCC 11.2 -fanalyzer.  */
1262       DEBUG_ASSERT (set->elems);
1263       set->elems[0] = elem;
1264       ++set->nelem;
1265       return true;
1266     }
1267 
1268   /* Realloc if we need.  */
1269   if (set->alloc == set->nelem)
1270     {
1271       Idx *new_elems;
1272       set->alloc = set->alloc * 2;
1273       new_elems = re_realloc (set->elems, Idx, set->alloc);
1274       if (__glibc_unlikely (new_elems == NULL))
1275         return false;
1276       set->elems = new_elems;
1277     }
1278 
1279   /* Move the elements which follows the new element.  Test the
1280      first element separately to skip a check in the inner loop.  */
1281   if (elem < set->elems[0])
1282     {
1283       for (idx = set->nelem; idx > 0; idx--)
1284         set->elems[idx] = set->elems[idx - 1];
1285     }
1286   else
1287     {
1288       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1289         set->elems[idx] = set->elems[idx - 1];
1290       DEBUG_ASSERT (set->elems[idx - 1] < elem);
1291     }
1292 
1293   /* Insert the new element.  */
1294   set->elems[idx] = elem;
1295   ++set->nelem;
1296   return true;
1297 }
1298 
1299 /* Insert the new element ELEM to the re_node_set* SET.
1300    SET should not already have any element greater than or equal to ELEM.
1301    Return true if successful.  */
1302 
1303 static bool
1304 __attribute_warn_unused_result__
1305 re_node_set_insert_last (re_node_set *set, Idx elem)
     /* [previous][next][first][last][top][bottom][index][help] */
1306 {
1307   /* Realloc if we need.  */
1308   if (set->alloc == set->nelem)
1309     {
1310       Idx *new_elems;
1311       set->alloc = (set->alloc + 1) * 2;
1312       new_elems = re_realloc (set->elems, Idx, set->alloc);
1313       if (__glibc_unlikely (new_elems == NULL))
1314         return false;
1315       set->elems = new_elems;
1316     }
1317 
1318   /* Insert the new element.  */
1319   set->elems[set->nelem++] = elem;
1320   return true;
1321 }
1322 
1323 /* Compare two node sets SET1 and SET2.
1324    Return true if SET1 and SET2 are equivalent.  */
1325 
1326 static bool
1327 __attribute__ ((pure))
1328 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
     /* [previous][next][first][last][top][bottom][index][help] */
1329 {
1330   Idx i;
1331   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1332     return false;
1333   for (i = set1->nelem ; --i >= 0 ; )
1334     if (set1->elems[i] != set2->elems[i])
1335       return false;
1336   return true;
1337 }
1338 
1339 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1340 
1341 static Idx
1342 __attribute__ ((pure))
1343 re_node_set_contains (const re_node_set *set, Idx elem)
     /* [previous][next][first][last][top][bottom][index][help] */
1344 {
1345   __re_size_t idx, right, mid;
1346   if (set->nelem <= 0)
1347     return 0;
1348 
1349   /* Binary search the element.  */
1350   idx = 0;
1351   right = set->nelem - 1;
1352   while (idx < right)
1353     {
1354       mid = (idx + right) / 2;
1355       if (set->elems[mid] < elem)
1356         idx = mid + 1;
1357       else
1358         right = mid;
1359     }
1360   return set->elems[idx] == elem ? idx + 1 : 0;
1361 }
1362 
1363 static void
1364 re_node_set_remove_at (re_node_set *set, Idx idx)
     /* [previous][next][first][last][top][bottom][index][help] */
1365 {
1366   if (idx < 0 || idx >= set->nelem)
1367     return;
1368   --set->nelem;
1369   for (; idx < set->nelem; idx++)
1370     set->elems[idx] = set->elems[idx + 1];
1371 }
1372 
1373 
1374 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1375    Or return -1 if an error occurred.  */
1376 
1377 static Idx
1378 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
     /* [previous][next][first][last][top][bottom][index][help] */
1379 {
1380   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1381     {
1382       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1383       Idx *new_nexts, *new_indices;
1384       re_node_set *new_edests, *new_eclosures;
1385       re_token_t *new_nodes;
1386 
1387       /* Avoid overflows in realloc.  */
1388       const size_t max_object_size = MAX (sizeof (re_token_t),
1389                                           MAX (sizeof (re_node_set),
1390                                                sizeof (Idx)));
1391       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1392                             < new_nodes_alloc))
1393         return -1;
1394 
1395       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1396       if (__glibc_unlikely (new_nodes == NULL))
1397         return -1;
1398       dfa->nodes = new_nodes;
1399       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1400       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1401       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1402       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1403       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1404                             || new_edests == NULL || new_eclosures == NULL))
1405         {
1406            re_free (new_nexts);
1407            re_free (new_indices);
1408            re_free (new_edests);
1409            re_free (new_eclosures);
1410            return -1;
1411         }
1412       dfa->nexts = new_nexts;
1413       dfa->org_indices = new_indices;
1414       dfa->edests = new_edests;
1415       dfa->eclosures = new_eclosures;
1416       dfa->nodes_alloc = new_nodes_alloc;
1417     }
1418   dfa->nodes[dfa->nodes_len] = token;
1419   dfa->nodes[dfa->nodes_len].constraint = 0;
1420   dfa->nodes[dfa->nodes_len].accept_mb =
1421     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1422      || token.type == COMPLEX_BRACKET);
1423   dfa->nexts[dfa->nodes_len] = -1;
1424   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1425   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1426   return dfa->nodes_len++;
1427 }
1428 
1429 static re_hashval_t
1430 calc_state_hash (const re_node_set *nodes, unsigned int context)
     /* [previous][next][first][last][top][bottom][index][help] */
1431 {
1432   re_hashval_t hash = nodes->nelem + context;
1433   Idx i;
1434   for (i = 0 ; i < nodes->nelem ; i++)
1435     hash += nodes->elems[i];
1436   return hash;
1437 }
1438 
1439 /* Search for the state whose node_set is equivalent to NODES.
1440    Return the pointer to the state, if we found it in the DFA.
1441    Otherwise create the new one and return it.  In case of an error
1442    return NULL and set the error code in ERR.
1443    Note: - We assume NULL as the invalid state, then it is possible that
1444            return value is NULL and ERR is REG_NOERROR.
1445          - We never return non-NULL value in case of any errors, it is for
1446            optimization.  */
1447 
1448 static re_dfastate_t *
1449 __attribute_warn_unused_result__
1450 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
     /* [previous][next][first][last][top][bottom][index][help] */
1451                   const re_node_set *nodes)
1452 {
1453   re_hashval_t hash;
1454   re_dfastate_t *new_state;
1455   struct re_state_table_entry *spot;
1456   Idx i;
1457 #if defined GCC_LINT || defined lint
1458   /* Suppress bogus uninitialized-variable warnings.  */
1459   *err = REG_NOERROR;
1460 #endif
1461   if (__glibc_unlikely (nodes->nelem == 0))
1462     {
1463       *err = REG_NOERROR;
1464       return NULL;
1465     }
1466   hash = calc_state_hash (nodes, 0);
1467   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1468 
1469   for (i = 0 ; i < spot->num ; i++)
1470     {
1471       re_dfastate_t *state = spot->array[i];
1472       if (hash != state->hash)
1473         continue;
1474       if (re_node_set_compare (&state->nodes, nodes))
1475         return state;
1476     }
1477 
1478   /* There are no appropriate state in the dfa, create the new one.  */
1479   new_state = create_ci_newstate (dfa, nodes, hash);
1480   if (__glibc_unlikely (new_state == NULL))
1481     *err = REG_ESPACE;
1482 
1483   return new_state;
1484 }
1485 
1486 /* Search for the state whose node_set is equivalent to NODES and
1487    whose context is equivalent to CONTEXT.
1488    Return the pointer to the state, if we found it in the DFA.
1489    Otherwise create the new one and return it.  In case of an error
1490    return NULL and set the error code in ERR.
1491    Note: - We assume NULL as the invalid state, then it is possible that
1492            return value is NULL and ERR is REG_NOERROR.
1493          - We never return non-NULL value in case of any errors, it is for
1494            optimization.  */
1495 
1496 static re_dfastate_t *
1497 __attribute_warn_unused_result__
1498 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
     /* [previous][next][first][last][top][bottom][index][help] */
1499                           const re_node_set *nodes, unsigned int context)
1500 {
1501   re_hashval_t hash;
1502   re_dfastate_t *new_state;
1503   struct re_state_table_entry *spot;
1504   Idx i;
1505 #if defined GCC_LINT || defined lint
1506   /* Suppress bogus uninitialized-variable warnings.  */
1507   *err = REG_NOERROR;
1508 #endif
1509   if (nodes->nelem == 0)
1510     {
1511       *err = REG_NOERROR;
1512       return NULL;
1513     }
1514   hash = calc_state_hash (nodes, context);
1515   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1516 
1517   for (i = 0 ; i < spot->num ; i++)
1518     {
1519       re_dfastate_t *state = spot->array[i];
1520       if (state->hash == hash
1521           && state->context == context
1522           && re_node_set_compare (state->entrance_nodes, nodes))
1523         return state;
1524     }
1525   /* There are no appropriate state in 'dfa', create the new one.  */
1526   new_state = create_cd_newstate (dfa, nodes, context, hash);
1527   if (__glibc_unlikely (new_state == NULL))
1528     *err = REG_ESPACE;
1529 
1530   return new_state;
1531 }
1532 
1533 /* Finish initialization of the new state NEWSTATE, and using its hash value
1534    HASH put in the appropriate bucket of DFA's state table.  Return value
1535    indicates the error code if failed.  */
1536 
1537 static reg_errcode_t
1538 __attribute_warn_unused_result__
1539 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
     /* [previous][next][first][last][top][bottom][index][help] */
1540                 re_hashval_t hash)
1541 {
1542   struct re_state_table_entry *spot;
1543   reg_errcode_t err;
1544   Idx i;
1545 
1546   newstate->hash = hash;
1547   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1548   if (__glibc_unlikely (err != REG_NOERROR))
1549     return REG_ESPACE;
1550   for (i = 0; i < newstate->nodes.nelem; i++)
1551     {
1552       Idx elem = newstate->nodes.elems[i];
1553       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1554         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1555           return REG_ESPACE;
1556     }
1557 
1558   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1559   if (__glibc_unlikely (spot->alloc <= spot->num))
1560     {
1561       Idx new_alloc = 2 * spot->num + 2;
1562       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1563                                               new_alloc);
1564       if (__glibc_unlikely (new_array == NULL))
1565         return REG_ESPACE;
1566       spot->array = new_array;
1567       spot->alloc = new_alloc;
1568     }
1569   spot->array[spot->num++] = newstate;
1570   return REG_NOERROR;
1571 }
1572 
1573 static void
1574 free_state (re_dfastate_t *state)
     /* [previous][next][first][last][top][bottom][index][help] */
1575 {
1576   re_node_set_free (&state->non_eps_nodes);
1577   re_node_set_free (&state->inveclosure);
1578   if (state->entrance_nodes != &state->nodes)
1579     {
1580       re_node_set_free (state->entrance_nodes);
1581       re_free (state->entrance_nodes);
1582     }
1583   re_node_set_free (&state->nodes);
1584   re_free (state->word_trtable);
1585   re_free (state->trtable);
1586   re_free (state);
1587 }
1588 
1589 /* Create the new state which is independent of contexts.
1590    Return the new state if succeeded, otherwise return NULL.  */
1591 
1592 static re_dfastate_t *
1593 __attribute_warn_unused_result__
1594 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
     /* [previous][next][first][last][top][bottom][index][help] */
1595                     re_hashval_t hash)
1596 {
1597   Idx i;
1598   reg_errcode_t err;
1599   re_dfastate_t *newstate;
1600 
1601   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1602   if (__glibc_unlikely (newstate == NULL))
1603     return NULL;
1604   err = re_node_set_init_copy (&newstate->nodes, nodes);
1605   if (__glibc_unlikely (err != REG_NOERROR))
1606     {
1607       re_free (newstate);
1608       return NULL;
1609     }
1610 
1611   newstate->entrance_nodes = &newstate->nodes;
1612   for (i = 0 ; i < nodes->nelem ; i++)
1613     {
1614       re_token_t *node = dfa->nodes + nodes->elems[i];
1615       re_token_type_t type = node->type;
1616       if (type == CHARACTER && !node->constraint)
1617         continue;
1618       newstate->accept_mb |= node->accept_mb;
1619 
1620       /* If the state has the halt node, the state is a halt state.  */
1621       if (type == END_OF_RE)
1622         newstate->halt = 1;
1623       else if (type == OP_BACK_REF)
1624         newstate->has_backref = 1;
1625       else if (type == ANCHOR || node->constraint)
1626         newstate->has_constraint = 1;
1627     }
1628   err = register_state (dfa, newstate, hash);
1629   if (__glibc_unlikely (err != REG_NOERROR))
1630     {
1631       free_state (newstate);
1632       newstate = NULL;
1633     }
1634   return newstate;
1635 }
1636 
1637 /* Create the new state which is depend on the context CONTEXT.
1638    Return the new state if succeeded, otherwise return NULL.  */
1639 
1640 static re_dfastate_t *
1641 __attribute_warn_unused_result__
1642 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
     /* [previous][next][first][last][top][bottom][index][help] */
1643                     unsigned int context, re_hashval_t hash)
1644 {
1645   Idx i, nctx_nodes = 0;
1646   reg_errcode_t err;
1647   re_dfastate_t *newstate;
1648 
1649   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1650   if (__glibc_unlikely (newstate == NULL))
1651     return NULL;
1652   err = re_node_set_init_copy (&newstate->nodes, nodes);
1653   if (__glibc_unlikely (err != REG_NOERROR))
1654     {
1655       re_free (newstate);
1656       return NULL;
1657     }
1658 
1659   newstate->context = context;
1660   newstate->entrance_nodes = &newstate->nodes;
1661 
1662   for (i = 0 ; i < nodes->nelem ; i++)
1663     {
1664       re_token_t *node = dfa->nodes + nodes->elems[i];
1665       re_token_type_t type = node->type;
1666       unsigned int constraint = node->constraint;
1667 
1668       if (type == CHARACTER && !constraint)
1669         continue;
1670       newstate->accept_mb |= node->accept_mb;
1671 
1672       /* If the state has the halt node, the state is a halt state.  */
1673       if (type == END_OF_RE)
1674         newstate->halt = 1;
1675       else if (type == OP_BACK_REF)
1676         newstate->has_backref = 1;
1677 
1678       if (constraint)
1679         {
1680           if (newstate->entrance_nodes == &newstate->nodes)
1681             {
1682               re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1683               if (__glibc_unlikely (entrance_nodes == NULL))
1684                 {
1685                   free_state (newstate);
1686                   return NULL;
1687                 }
1688               newstate->entrance_nodes = entrance_nodes;
1689               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1690                   != REG_NOERROR)
1691                 {
1692                   free_state (newstate);
1693                   return NULL;
1694                 }
1695               nctx_nodes = 0;
1696               newstate->has_constraint = 1;
1697             }
1698 
1699           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1700             {
1701               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1702               ++nctx_nodes;
1703             }
1704         }
1705     }
1706   err = register_state (dfa, newstate, hash);
1707   if (__glibc_unlikely (err != REG_NOERROR))
1708     {
1709       free_state (newstate);
1710       newstate = NULL;
1711     }
1712   return  newstate;
1713 }

/* [previous][next][first][last][top][bottom][index][help] */