1 /* Substring search in a NUL terminated string of UNIT elements, 2 using the Knuth-Morris-Pratt algorithm. 3 Copyright (C) 2005-2021 Free Software Foundation, Inc. 4 Written by Bruno Haible <bruno@clisp.org>, 2005. 5 6 This file is free software. 7 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". 8 You can redistribute it and/or modify it under either 9 - the terms of the GNU Lesser General Public License as published 10 by the Free Software Foundation; either version 3, or (at your 11 option) any later version, or 12 - the terms of the GNU General Public License as published by the 13 Free Software Foundation; either version 2, or (at your option) 14 any later version, or 15 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". 16 17 This file is distributed in the hope that it will be useful, 18 but WITHOUT ANY WARRANTY; without even the implied warranty of 19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 Lesser General Public License and the GNU General Public License 21 for more details. 22 23 You should have received a copy of the GNU Lesser General Public 24 License and of the GNU General Public License along with this 25 program. If not, see <https://www.gnu.org/licenses/>. */ 26 27 /* Before including this file, you need to define: 28 UNIT The element type of the needle and haystack. 29 CANON_ELEMENT(c) A macro that canonicalizes an element right after 30 it has been fetched from needle or haystack. 31 The argument is of type UNIT; the result must be 32 of type UNIT as well. */ 33 34 /* Knuth-Morris-Pratt algorithm. 35 See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm 36 HAYSTACK is the NUL terminated string in which to search for. 37 NEEDLE is the string to search for in HAYSTACK, consisting of NEEDLE_LEN 38 units. 39 Return a boolean indicating success: 40 Return true and set *RESULTP if the search was completed. 41 Return false if it was aborted because not enough memory was available. */ 42 static bool 43 knuth_morris_pratt (const UNIT *haystack, /* */ 44 const UNIT *needle, size_t needle_len, 45 const UNIT **resultp) 46 { 47 size_t m = needle_len; 48 49 /* Allocate the table. */ 50 size_t *table = (size_t *) nmalloca (m, sizeof (size_t)); 51 if (table == NULL) 52 return false; 53 /* Fill the table. 54 For 0 < i < m: 55 0 < table[i] <= i is defined such that 56 forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], 57 and table[i] is as large as possible with this property. 58 This implies: 59 1) For 0 < i < m: 60 If table[i] < i, 61 needle[table[i]..i-1] = needle[0..i-1-table[i]]. 62 2) For 0 < i < m: 63 rhaystack[0..i-1] == needle[0..i-1] 64 and exists h, i <= h < m: rhaystack[h] != needle[h] 65 implies 66 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. 67 table[0] remains uninitialized. */ 68 { 69 size_t i, j; 70 71 /* i = 1: Nothing to verify for x = 0. */ 72 table[1] = 1; 73 j = 0; 74 75 for (i = 2; i < m; i++) 76 { 77 /* Here: j = i-1 - table[i-1]. 78 The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold 79 for x < table[i-1], by induction. 80 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 81 UNIT b = CANON_ELEMENT (needle[i - 1]); 82 83 for (;;) 84 { 85 /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] 86 is known to hold for x < i-1-j. 87 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 88 if (b == CANON_ELEMENT (needle[j])) 89 { 90 /* Set table[i] := i-1-j. */ 91 table[i] = i - ++j; 92 break; 93 } 94 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 95 for x = i-1-j, because 96 needle[i-1] != needle[j] = needle[i-1-x]. */ 97 if (j == 0) 98 { 99 /* The inequality holds for all possible x. */ 100 table[i] = i; 101 break; 102 } 103 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 104 for i-1-j < x < i-1-j+table[j], because for these x: 105 needle[x..i-2] 106 = needle[x-(i-1-j)..j-1] 107 != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) 108 = needle[0..i-2-x], 109 hence needle[x..i-1] != needle[0..i-1-x]. 110 Furthermore 111 needle[i-1-j+table[j]..i-2] 112 = needle[table[j]..j-1] 113 = needle[0..j-1-table[j]] (by definition of table[j]). */ 114 j = j - table[j]; 115 } 116 /* Here: j = i - table[i]. */ 117 } 118 } 119 120 /* Search, using the table to accelerate the processing. */ 121 { 122 size_t j; 123 const UNIT *rhaystack; 124 const UNIT *phaystack; 125 126 *resultp = NULL; 127 j = 0; 128 rhaystack = haystack; 129 phaystack = haystack; 130 /* Invariant: phaystack = rhaystack + j. */ 131 while (*phaystack != 0) 132 if (CANON_ELEMENT (needle[j]) == CANON_ELEMENT (*phaystack)) 133 { 134 j++; 135 phaystack++; 136 if (j == m) 137 { 138 /* The entire needle has been found. */ 139 *resultp = rhaystack; 140 break; 141 } 142 } 143 else if (j > 0) 144 { 145 /* Found a match of needle[0..j-1], mismatch at needle[j]. */ 146 rhaystack += table[j]; 147 j -= table[j]; 148 } 149 else 150 { 151 /* Found a mismatch at needle[0] already. */ 152 rhaystack++; 153 phaystack++; 154 } 155 } 156 157 freea (table); 158 return true; 159 } 160 161 #undef CANON_ELEMENT