root/maint/gnulib/lib/mbscasestr.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. knuth_morris_pratt_multibyte
  2. mbscasestr

   1 /* Case-insensitive searching in a string.  -*- coding: utf-8 -*-
   2    Copyright (C) 2005-2021 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2005.
   4 
   5    This file is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU Lesser General Public License as
   7    published by the Free Software Foundation; either version 3 of the
   8    License, or (at your option) any later version.
   9 
  10    This file is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU Lesser General Public License for more details.
  14 
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17 
  18 #include <config.h>
  19 
  20 /* Specification.  */
  21 #include <string.h>
  22 
  23 #include <ctype.h>
  24 #include <stdbool.h>
  25 #include <stddef.h>  /* for NULL, in case a nonstandard string.h lacks it */
  26 #include <stdlib.h>
  27 
  28 #include "malloca.h"
  29 #include "mbuiter.h"
  30 
  31 #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
  32 
  33 /* Knuth-Morris-Pratt algorithm.  */
  34 #define UNIT unsigned char
  35 #define CANON_ELEMENT(c) TOLOWER (c)
  36 #include "str-kmp.h"
  37 
  38 /* Knuth-Morris-Pratt algorithm.
  39    See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
  40    Return a boolean indicating success:
  41    Return true and set *RESULTP if the search was completed.
  42    Return false if it was aborted because not enough memory was available.  */
  43 static bool
  44 knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
     /* [previous][next][first][last][top][bottom][index][help] */
  45                               const char **resultp)
  46 {
  47   size_t m = mbslen (needle);
  48   mbchar_t *needle_mbchars;
  49   size_t *table;
  50 
  51   /* Allocate room for needle_mbchars and the table.  */
  52   char *memory = (char *) nmalloca (m, sizeof (mbchar_t) + sizeof (size_t));
  53   if (memory == NULL)
  54     return false;
  55   needle_mbchars = (mbchar_t *) memory;
  56   table = (size_t *) (memory + m * sizeof (mbchar_t));
  57 
  58   /* Fill needle_mbchars.  */
  59   {
  60     mbui_iterator_t iter;
  61     size_t j;
  62 
  63     j = 0;
  64     for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
  65       {
  66         mb_copy (&needle_mbchars[j], &mbui_cur (iter));
  67         if (needle_mbchars[j].wc_valid)
  68           needle_mbchars[j].wc = towlower (needle_mbchars[j].wc);
  69       }
  70   }
  71 
  72   /* Fill the table.
  73      For 0 < i < m:
  74        0 < table[i] <= i is defined such that
  75        forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
  76        and table[i] is as large as possible with this property.
  77      This implies:
  78      1) For 0 < i < m:
  79           If table[i] < i,
  80           needle[table[i]..i-1] = needle[0..i-1-table[i]].
  81      2) For 0 < i < m:
  82           rhaystack[0..i-1] == needle[0..i-1]
  83           and exists h, i <= h < m: rhaystack[h] != needle[h]
  84           implies
  85           forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
  86      table[0] remains uninitialized.  */
  87   {
  88     size_t i, j;
  89 
  90     /* i = 1: Nothing to verify for x = 0.  */
  91     table[1] = 1;
  92     j = 0;
  93 
  94     for (i = 2; i < m; i++)
  95       {
  96         /* Here: j = i-1 - table[i-1].
  97            The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
  98            for x < table[i-1], by induction.
  99            Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
 100         mbchar_t *b = &needle_mbchars[i - 1];
 101 
 102         for (;;)
 103           {
 104             /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
 105                is known to hold for x < i-1-j.
 106                Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
 107             if (mb_equal (*b, needle_mbchars[j]))
 108               {
 109                 /* Set table[i] := i-1-j.  */
 110                 table[i] = i - ++j;
 111                 break;
 112               }
 113             /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
 114                for x = i-1-j, because
 115                  needle[i-1] != needle[j] = needle[i-1-x].  */
 116             if (j == 0)
 117               {
 118                 /* The inequality holds for all possible x.  */
 119                 table[i] = i;
 120                 break;
 121               }
 122             /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
 123                for i-1-j < x < i-1-j+table[j], because for these x:
 124                  needle[x..i-2]
 125                  = needle[x-(i-1-j)..j-1]
 126                  != needle[0..j-1-(x-(i-1-j))]  (by definition of table[j])
 127                     = needle[0..i-2-x],
 128                hence needle[x..i-1] != needle[0..i-1-x].
 129                Furthermore
 130                  needle[i-1-j+table[j]..i-2]
 131                  = needle[table[j]..j-1]
 132                  = needle[0..j-1-table[j]]  (by definition of table[j]).  */
 133             j = j - table[j];
 134           }
 135         /* Here: j = i - table[i].  */
 136       }
 137   }
 138 
 139   /* Search, using the table to accelerate the processing.  */
 140   {
 141     size_t j;
 142     mbui_iterator_t rhaystack;
 143     mbui_iterator_t phaystack;
 144 
 145     *resultp = NULL;
 146     j = 0;
 147     mbui_init (rhaystack, haystack);
 148     mbui_init (phaystack, haystack);
 149     /* Invariant: phaystack = rhaystack + j.  */
 150     while (mbui_avail (phaystack))
 151       {
 152         mbchar_t c;
 153 
 154         mb_copy (&c, &mbui_cur (phaystack));
 155         if (c.wc_valid)
 156           c.wc = towlower (c.wc);
 157         if (mb_equal (needle_mbchars[j], c))
 158           {
 159             j++;
 160             mbui_advance (phaystack);
 161             if (j == m)
 162               {
 163                 /* The entire needle has been found.  */
 164                 *resultp = mbui_cur_ptr (rhaystack);
 165                 break;
 166               }
 167           }
 168         else if (j > 0)
 169           {
 170             /* Found a match of needle[0..j-1], mismatch at needle[j].  */
 171             size_t count = table[j];
 172             j -= count;
 173             for (; count > 0; count--)
 174               {
 175                 if (!mbui_avail (rhaystack))
 176                   abort ();
 177                 mbui_advance (rhaystack);
 178               }
 179           }
 180         else
 181           {
 182             /* Found a mismatch at needle[0] already.  */
 183             if (!mbui_avail (rhaystack))
 184               abort ();
 185             mbui_advance (rhaystack);
 186             mbui_advance (phaystack);
 187           }
 188       }
 189   }
 190 
 191   freea (memory);
 192   return true;
 193 }
 194 
 195 /* Find the first occurrence of the character string NEEDLE in the character
 196    string HAYSTACK, using case-insensitive comparison.
 197    Note: This function may, in multibyte locales, return success even if
 198    strlen (haystack) < strlen (needle) !  */
 199 char *
 200 mbscasestr (const char *haystack, const char *needle)
     /* [previous][next][first][last][top][bottom][index][help] */
 201 {
 202   /* Be careful not to look at the entire extent of haystack or needle
 203      until needed.  This is useful because of these two cases:
 204        - haystack may be very long, and a match of needle found early,
 205        - needle may be very long, and not even a short initial segment of
 206          needle may be found in haystack.  */
 207   if (MB_CUR_MAX > 1)
 208     {
 209       mbui_iterator_t iter_needle;
 210 
 211       mbui_init (iter_needle, needle);
 212       if (mbui_avail (iter_needle))
 213         {
 214           /* Minimizing the worst-case complexity:
 215              Let n = mbslen(haystack), m = mbslen(needle).
 216              The naïve algorithm is O(n*m) worst-case.
 217              The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
 218              memory allocation.
 219              To achieve linear complexity and yet amortize the cost of the
 220              memory allocation, we activate the Knuth-Morris-Pratt algorithm
 221              only once the naïve algorithm has already run for some time; more
 222              precisely, when
 223                - the outer loop count is >= 10,
 224                - the average number of comparisons per outer loop is >= 5,
 225                - the total number of comparisons is >= m.
 226              But we try it only once.  If the memory allocation attempt failed,
 227              we don't retry it.  */
 228           bool try_kmp = true;
 229           size_t outer_loop_count = 0;
 230           size_t comparison_count = 0;
 231           size_t last_ccount = 0;                  /* last comparison count */
 232           mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */
 233 
 234           mbchar_t b;
 235           mbui_iterator_t iter_haystack;
 236 
 237           mbui_init (iter_needle_last_ccount, needle);
 238 
 239           mb_copy (&b, &mbui_cur (iter_needle));
 240           if (b.wc_valid)
 241             b.wc = towlower (b.wc);
 242 
 243           mbui_init (iter_haystack, haystack);
 244           for (;; mbui_advance (iter_haystack))
 245             {
 246               mbchar_t c;
 247 
 248               if (!mbui_avail (iter_haystack))
 249                 /* No match.  */
 250                 return NULL;
 251 
 252               /* See whether it's advisable to use an asymptotically faster
 253                  algorithm.  */
 254               if (try_kmp
 255                   && outer_loop_count >= 10
 256                   && comparison_count >= 5 * outer_loop_count)
 257                 {
 258                   /* See if needle + comparison_count now reaches the end of
 259                      needle.  */
 260                   size_t count = comparison_count - last_ccount;
 261                   for (;
 262                        count > 0 && mbui_avail (iter_needle_last_ccount);
 263                        count--)
 264                     mbui_advance (iter_needle_last_ccount);
 265                   last_ccount = comparison_count;
 266                   if (!mbui_avail (iter_needle_last_ccount))
 267                     {
 268                       /* Try the Knuth-Morris-Pratt algorithm.  */
 269                       const char *result;
 270                       bool success =
 271                         knuth_morris_pratt_multibyte (haystack, needle,
 272                                                       &result);
 273                       if (success)
 274                         return (char *) result;
 275                       try_kmp = false;
 276                     }
 277                 }
 278 
 279               outer_loop_count++;
 280               comparison_count++;
 281               mb_copy (&c, &mbui_cur (iter_haystack));
 282               if (c.wc_valid)
 283                 c.wc = towlower (c.wc);
 284               if (mb_equal (c, b))
 285                 /* The first character matches.  */
 286                 {
 287                   mbui_iterator_t rhaystack;
 288                   mbui_iterator_t rneedle;
 289 
 290                   memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
 291                   mbui_advance (rhaystack);
 292 
 293                   mbui_init (rneedle, needle);
 294                   if (!mbui_avail (rneedle))
 295                     abort ();
 296                   mbui_advance (rneedle);
 297 
 298                   for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
 299                     {
 300                       if (!mbui_avail (rneedle))
 301                         /* Found a match.  */
 302                         return (char *) mbui_cur_ptr (iter_haystack);
 303                       if (!mbui_avail (rhaystack))
 304                         /* No match.  */
 305                         return NULL;
 306                       comparison_count++;
 307                       if (!mb_caseequal (mbui_cur (rhaystack),
 308                                          mbui_cur (rneedle)))
 309                         /* Nothing in this round.  */
 310                         break;
 311                     }
 312                 }
 313             }
 314         }
 315       else
 316         return (char *) haystack;
 317     }
 318   else
 319     {
 320       if (*needle != '\0')
 321         {
 322           /* Minimizing the worst-case complexity:
 323              Let n = strlen(haystack), m = strlen(needle).
 324              The naïve algorithm is O(n*m) worst-case.
 325              The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
 326              memory allocation.
 327              To achieve linear complexity and yet amortize the cost of the
 328              memory allocation, we activate the Knuth-Morris-Pratt algorithm
 329              only once the naïve algorithm has already run for some time; more
 330              precisely, when
 331                - the outer loop count is >= 10,
 332                - the average number of comparisons per outer loop is >= 5,
 333                - the total number of comparisons is >= m.
 334              But we try it only once.  If the memory allocation attempt failed,
 335              we don't retry it.  */
 336           bool try_kmp = true;
 337           size_t outer_loop_count = 0;
 338           size_t comparison_count = 0;
 339           size_t last_ccount = 0;                  /* last comparison count */
 340           const char *needle_last_ccount = needle; /* = needle + last_ccount */
 341 
 342           /* Speed up the following searches of needle by caching its first
 343              character.  */
 344           unsigned char b = TOLOWER ((unsigned char) *needle);
 345 
 346           needle++;
 347           for (;; haystack++)
 348             {
 349               if (*haystack == '\0')
 350                 /* No match.  */
 351                 return NULL;
 352 
 353               /* See whether it's advisable to use an asymptotically faster
 354                  algorithm.  */
 355               if (try_kmp
 356                   && outer_loop_count >= 10
 357                   && comparison_count >= 5 * outer_loop_count)
 358                 {
 359                   /* See if needle + comparison_count now reaches the end of
 360                      needle.  */
 361                   if (needle_last_ccount != NULL)
 362                     {
 363                       needle_last_ccount +=
 364                         strnlen (needle_last_ccount,
 365                                  comparison_count - last_ccount);
 366                       if (*needle_last_ccount == '\0')
 367                         needle_last_ccount = NULL;
 368                       last_ccount = comparison_count;
 369                     }
 370                   if (needle_last_ccount == NULL)
 371                     {
 372                       /* Try the Knuth-Morris-Pratt algorithm.  */
 373                       const unsigned char *result;
 374                       bool success =
 375                         knuth_morris_pratt ((const unsigned char *) haystack,
 376                                             (const unsigned char *) (needle - 1),
 377                                             strlen (needle - 1),
 378                                             &result);
 379                       if (success)
 380                         return (char *) result;
 381                       try_kmp = false;
 382                     }
 383                 }
 384 
 385               outer_loop_count++;
 386               comparison_count++;
 387               if (TOLOWER ((unsigned char) *haystack) == b)
 388                 /* The first character matches.  */
 389                 {
 390                   const char *rhaystack = haystack + 1;
 391                   const char *rneedle = needle;
 392 
 393                   for (;; rhaystack++, rneedle++)
 394                     {
 395                       if (*rneedle == '\0')
 396                         /* Found a match.  */
 397                         return (char *) haystack;
 398                       if (*rhaystack == '\0')
 399                         /* No match.  */
 400                         return NULL;
 401                       comparison_count++;
 402                       if (TOLOWER ((unsigned char) *rhaystack)
 403                           != TOLOWER ((unsigned char) *rneedle))
 404                         /* Nothing in this round.  */
 405                         break;
 406                     }
 407                 }
 408             }
 409         }
 410       else
 411         return (char *) haystack;
 412     }
 413 }

/* [previous][next][first][last][top][bottom][index][help] */