root/maint/gnulib/lib/str-two-way.h

/* [previous][next][first][last][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes following definitions.
  1. critical_factorization
  2. two_way_short_needle
  3. two_way_long_needle

   1 /* Byte-wise substring search, using the Two-Way algorithm.
   2    Copyright (C) 2008-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Written by Eric Blake <ebb9@byu.net>, 2008.
   5 
   6    This file is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU Lesser General Public License as
   8    published by the Free Software Foundation; either version 2.1 of the
   9    License, or (at your option) any later version.
  10 
  11    This file is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU Lesser General Public License for more details.
  15 
  16    You should have received a copy of the GNU Lesser General Public License
  17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  18 
  19 /* Before including this file, you need to include <config.h> and
  20    <string.h>, and define:
  21      RETURN_TYPE             A macro that expands to the return type.
  22      AVAILABLE(h, h_l, j, n_l)
  23                              A macro that returns nonzero if there are
  24                              at least N_L bytes left starting at H[J].
  25                              H is 'unsigned char *', H_L, J, and N_L
  26                              are 'size_t'; H_L is an lvalue.  For
  27                              NUL-terminated searches, H_L can be
  28                              modified each iteration to avoid having
  29                              to compute the end of H up front.
  30 
  31   For case-insensitivity, you may optionally define:
  32      CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
  33                              characters of P1 and P2 are equal.
  34      CANON_ELEMENT(c)        A macro that canonicalizes an element right after
  35                              it has been fetched from one of the two strings.
  36                              The argument is an 'unsigned char'; the result
  37                              must be an 'unsigned char' as well.
  38 
  39   This file undefines the macros documented above, and defines
  40   LONG_NEEDLE_THRESHOLD.
  41 */
  42 
  43 #include <limits.h>
  44 #include <stdint.h>
  45 
  46 /* We use the Two-Way string matching algorithm (also known as
  47    Chrochemore-Perrin), which guarantees linear complexity with
  48    constant space.  Additionally, for long needles, we also use a bad
  49    character shift table similar to the Boyer-Moore algorithm to
  50    achieve improved (potentially sub-linear) performance.
  51 
  52    See https://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
  53    https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
  54    https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
  55 */
  56 
  57 /* Point at which computing a bad-byte shift table is likely to be
  58    worthwhile.  Small needles should not compute a table, since it
  59    adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
  60    speedup no greater than a factor of NEEDLE_LEN.  The larger the
  61    needle, the better the potential performance gain.  On the other
  62    hand, on non-POSIX systems with CHAR_BIT larger than eight, the
  63    memory required for the table is prohibitive.  */
  64 #if CHAR_BIT < 10
  65 # define LONG_NEEDLE_THRESHOLD 32U
  66 #else
  67 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
  68 #endif
  69 
  70 #ifndef MAX
  71 # define MAX(a, b) ((a < b) ? (b) : (a))
  72 #endif
  73 
  74 #ifndef CANON_ELEMENT
  75 # define CANON_ELEMENT(c) c
  76 #endif
  77 #ifndef CMP_FUNC
  78 # define CMP_FUNC memcmp
  79 #endif
  80 
  81 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
  82    Return the index of the first byte in the right half, and set
  83    *PERIOD to the global period of the right half.
  84 
  85    The global period of a string is the smallest index (possibly its
  86    length) at which all remaining bytes in the string are repetitions
  87    of the prefix (the last repetition may be a subset of the prefix).
  88 
  89    When NEEDLE is factored into two halves, a local period is the
  90    length of the smallest word that shares a suffix with the left half
  91    and shares a prefix with the right half.  All factorizations of a
  92    non-empty NEEDLE have a local period of at least 1 and no greater
  93    than NEEDLE_LEN.
  94 
  95    A critical factorization has the property that the local period
  96    equals the global period.  All strings have at least one critical
  97    factorization with the left half smaller than the global period.
  98    And while some strings have more than one critical factorization,
  99    it is provable that with an ordered alphabet, at least one of the
 100    critical factorizations corresponds to a maximal suffix.
 101 
 102    Given an ordered alphabet, a critical factorization can be computed
 103    in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
 104    shorter of two ordered maximal suffixes.  The ordered maximal
 105    suffixes are determined by lexicographic comparison while tracking
 106    periodicity.  */
 107 static size_t
 108 critical_factorization (const unsigned char *needle, size_t needle_len,
     /* [previous][next][first][last][top][bottom][index][help] */
 109                         size_t *period)
 110 {
 111   /* Index of last byte of left half, or SIZE_MAX.  */
 112   size_t max_suffix, max_suffix_rev;
 113   size_t j; /* Index into NEEDLE for current candidate suffix.  */
 114   size_t k; /* Offset into current period.  */
 115   size_t p; /* Intermediate period.  */
 116   unsigned char a, b; /* Current comparison bytes.  */
 117 
 118   /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
 119      out 0-length needles.  */
 120   if (needle_len < 3)
 121     {
 122       *period = 1;
 123       return needle_len - 1;
 124     }
 125 
 126   /* Invariants:
 127      0 <= j < NEEDLE_LEN - 1
 128      -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
 129      min(max_suffix, max_suffix_rev) < global period of NEEDLE
 130      1 <= p <= global period of NEEDLE
 131      p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
 132      1 <= k <= p
 133   */
 134 
 135   /* Perform lexicographic search.  */
 136   max_suffix = SIZE_MAX;
 137   j = 0;
 138   k = p = 1;
 139   while (j + k < needle_len)
 140     {
 141       a = CANON_ELEMENT (needle[j + k]);
 142       b = CANON_ELEMENT (needle[max_suffix + k]);
 143       if (a < b)
 144         {
 145           /* Suffix is smaller, period is entire prefix so far.  */
 146           j += k;
 147           k = 1;
 148           p = j - max_suffix;
 149         }
 150       else if (a == b)
 151         {
 152           /* Advance through repetition of the current period.  */
 153           if (k != p)
 154             ++k;
 155           else
 156             {
 157               j += p;
 158               k = 1;
 159             }
 160         }
 161       else /* b < a */
 162         {
 163           /* Suffix is larger, start over from current location.  */
 164           max_suffix = j++;
 165           k = p = 1;
 166         }
 167     }
 168   *period = p;
 169 
 170   /* Perform reverse lexicographic search.  */
 171   max_suffix_rev = SIZE_MAX;
 172   j = 0;
 173   k = p = 1;
 174   while (j + k < needle_len)
 175     {
 176       a = CANON_ELEMENT (needle[j + k]);
 177       b = CANON_ELEMENT (needle[max_suffix_rev + k]);
 178       if (b < a)
 179         {
 180           /* Suffix is smaller, period is entire prefix so far.  */
 181           j += k;
 182           k = 1;
 183           p = j - max_suffix_rev;
 184         }
 185       else if (a == b)
 186         {
 187           /* Advance through repetition of the current period.  */
 188           if (k != p)
 189             ++k;
 190           else
 191             {
 192               j += p;
 193               k = 1;
 194             }
 195         }
 196       else /* a < b */
 197         {
 198           /* Suffix is larger, start over from current location.  */
 199           max_suffix_rev = j++;
 200           k = p = 1;
 201         }
 202     }
 203 
 204   /* Choose the shorter suffix.  Return the index of the first byte of
 205      the right half, rather than the last byte of the left half.
 206 
 207      For some examples, 'banana' has two critical factorizations, both
 208      exposed by the two lexicographic extreme suffixes of 'anana' and
 209      'nana', where both suffixes have a period of 2.  On the other
 210      hand, with 'aab' and 'bba', both strings have a single critical
 211      factorization of the last byte, with the suffix having a period
 212      of 1.  While the maximal lexicographic suffix of 'aab' is 'b',
 213      the maximal lexicographic suffix of 'bba' is 'ba', which is not a
 214      critical factorization.  Conversely, the maximal reverse
 215      lexicographic suffix of 'a' works for 'bba', but not 'ab' for
 216      'aab'.  The shorter suffix of the two will always be a critical
 217      factorization.  */
 218   if (max_suffix_rev + 1 < max_suffix + 1)
 219     return max_suffix + 1;
 220   *period = p;
 221   return max_suffix_rev + 1;
 222 }
 223 
 224 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
 225    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
 226    method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
 227    Performance is guaranteed to be linear, with an initialization cost
 228    of 2 * NEEDLE_LEN comparisons.
 229 
 230    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
 231    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
 232    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
 233    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
 234 static RETURN_TYPE
 235 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
     /* [previous][next][first][last][top][bottom][index][help] */
 236                       const unsigned char *needle, size_t needle_len)
 237 {
 238   size_t i; /* Index into current byte of NEEDLE.  */
 239   size_t j; /* Index into current window of HAYSTACK.  */
 240   size_t period; /* The period of the right half of needle.  */
 241   size_t suffix; /* The index of the right half of needle.  */
 242 
 243   /* Factor the needle into two halves, such that the left half is
 244      smaller than the global period, and the right half is
 245      periodic (with a period as large as NEEDLE_LEN - suffix).  */
 246   suffix = critical_factorization (needle, needle_len, &period);
 247 
 248   /* Perform the search.  Each iteration compares the right half
 249      first.  */
 250   if (CMP_FUNC (needle, needle + period, suffix) == 0)
 251     {
 252       /* Entire needle is periodic; a mismatch in the left half can
 253          only advance by the period, so use memory to avoid rescanning
 254          known occurrences of the period in the right half.  */
 255       size_t memory = 0;
 256       j = 0;
 257       while (AVAILABLE (haystack, haystack_len, j, needle_len))
 258         {
 259           /* Scan for matches in right half.  */
 260           i = MAX (suffix, memory);
 261           while (i < needle_len && (CANON_ELEMENT (needle[i])
 262                                     == CANON_ELEMENT (haystack[i + j])))
 263             ++i;
 264           if (needle_len <= i)
 265             {
 266               /* Scan for matches in left half.  */
 267               i = suffix - 1;
 268               while (memory < i + 1 && (CANON_ELEMENT (needle[i])
 269                                         == CANON_ELEMENT (haystack[i + j])))
 270                 --i;
 271               if (i + 1 < memory + 1)
 272                 return (RETURN_TYPE) (haystack + j);
 273               /* No match, so remember how many repetitions of period
 274                  on the right half were scanned.  */
 275               j += period;
 276               memory = needle_len - period;
 277             }
 278           else
 279             {
 280               j += i - suffix + 1;
 281               memory = 0;
 282             }
 283         }
 284     }
 285   else
 286     {
 287       /* The two halves of needle are distinct; no extra memory is
 288          required, and any mismatch results in a maximal shift.  */
 289       period = MAX (suffix, needle_len - suffix) + 1;
 290       j = 0;
 291       while (AVAILABLE (haystack, haystack_len, j, needle_len))
 292         {
 293           /* Scan for matches in right half.  */
 294           i = suffix;
 295           while (i < needle_len && (CANON_ELEMENT (needle[i])
 296                                     == CANON_ELEMENT (haystack[i + j])))
 297             ++i;
 298           if (needle_len <= i)
 299             {
 300               /* Scan for matches in left half.  */
 301               i = suffix - 1;
 302               while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
 303                                        == CANON_ELEMENT (haystack[i + j])))
 304                 --i;
 305               if (i == SIZE_MAX)
 306                 return (RETURN_TYPE) (haystack + j);
 307               j += period;
 308             }
 309           else
 310             j += i - suffix + 1;
 311         }
 312     }
 313   return NULL;
 314 }
 315 
 316 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
 317    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
 318    method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
 319    Performance is guaranteed to be linear, with an initialization cost
 320    of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
 321 
 322    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
 323    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
 324    and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
 325    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
 326    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
 327    sublinear performance is not possible.  */
 328 static RETURN_TYPE
 329 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
     /* [previous][next][first][last][top][bottom][index][help] */
 330                      const unsigned char *needle, size_t needle_len)
 331 {
 332   size_t i; /* Index into current byte of NEEDLE.  */
 333   size_t j; /* Index into current window of HAYSTACK.  */
 334   size_t period; /* The period of the right half of needle.  */
 335   size_t suffix; /* The index of the right half of needle.  */
 336   size_t shift_table[1U << CHAR_BIT]; /* See below.  */
 337 
 338   /* Factor the needle into two halves, such that the left half is
 339      smaller than the global period, and the right half is
 340      periodic (with a period as large as NEEDLE_LEN - suffix).  */
 341   suffix = critical_factorization (needle, needle_len, &period);
 342 
 343   /* Populate shift_table.  For each possible byte value c,
 344      shift_table[c] is the distance from the last occurrence of c to
 345      the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
 346      shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
 347   for (i = 0; i < 1U << CHAR_BIT; i++)
 348     shift_table[i] = needle_len;
 349   for (i = 0; i < needle_len; i++)
 350     shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
 351 
 352   /* Perform the search.  Each iteration compares the right half
 353      first.  */
 354   if (CMP_FUNC (needle, needle + period, suffix) == 0)
 355     {
 356       /* Entire needle is periodic; a mismatch in the left half can
 357          only advance by the period, so use memory to avoid rescanning
 358          known occurrences of the period in the right half.  */
 359       size_t memory = 0;
 360       size_t shift;
 361       j = 0;
 362       while (AVAILABLE (haystack, haystack_len, j, needle_len))
 363         {
 364           /* Check the last byte first; if it does not match, then
 365              shift to the next possible match location.  */
 366           shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
 367           if (0 < shift)
 368             {
 369               if (memory && shift < period)
 370                 {
 371                   /* Since needle is periodic, but the last period has
 372                      a byte out of place, there can be no match until
 373                      after the mismatch.  */
 374                   shift = needle_len - period;
 375                 }
 376               memory = 0;
 377               j += shift;
 378               continue;
 379             }
 380           /* Scan for matches in right half.  The last byte has
 381              already been matched, by virtue of the shift table.  */
 382           i = MAX (suffix, memory);
 383           while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
 384                                         == CANON_ELEMENT (haystack[i + j])))
 385             ++i;
 386           if (needle_len - 1 <= i)
 387             {
 388               /* Scan for matches in left half.  */
 389               i = suffix - 1;
 390               while (memory < i + 1 && (CANON_ELEMENT (needle[i])
 391                                         == CANON_ELEMENT (haystack[i + j])))
 392                 --i;
 393               if (i + 1 < memory + 1)
 394                 return (RETURN_TYPE) (haystack + j);
 395               /* No match, so remember how many repetitions of period
 396                  on the right half were scanned.  */
 397               j += period;
 398               memory = needle_len - period;
 399             }
 400           else
 401             {
 402               j += i - suffix + 1;
 403               memory = 0;
 404             }
 405         }
 406     }
 407   else
 408     {
 409       /* The two halves of needle are distinct; no extra memory is
 410          required, and any mismatch results in a maximal shift.  */
 411       size_t shift;
 412       period = MAX (suffix, needle_len - suffix) + 1;
 413       j = 0;
 414       while (AVAILABLE (haystack, haystack_len, j, needle_len))
 415         {
 416           /* Check the last byte first; if it does not match, then
 417              shift to the next possible match location.  */
 418           shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
 419           if (0 < shift)
 420             {
 421               j += shift;
 422               continue;
 423             }
 424           /* Scan for matches in right half.  The last byte has
 425              already been matched, by virtue of the shift table.  */
 426           i = suffix;
 427           while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
 428                                         == CANON_ELEMENT (haystack[i + j])))
 429             ++i;
 430           if (needle_len - 1 <= i)
 431             {
 432               /* Scan for matches in left half.  */
 433               i = suffix - 1;
 434               while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
 435                                        == CANON_ELEMENT (haystack[i + j])))
 436                 --i;
 437               if (i == SIZE_MAX)
 438                 return (RETURN_TYPE) (haystack + j);
 439               j += period;
 440             }
 441           else
 442             j += i - suffix + 1;
 443         }
 444     }
 445   return NULL;
 446 }
 447 
 448 #undef AVAILABLE
 449 #undef CANON_ELEMENT
 450 #undef CMP_FUNC
 451 #undef MAX
 452 #undef RETURN_TYPE

/* [previous][next][first][last][top][bottom][index][help] */